
; Filename: dooperfplib.s
; Last modified: 01 July 2002

; *** StrongARM floating point routines (when compiling
; *** with GCC's "-msoft-float" and the output passed through fpconv)
; You may contact the author by email at:  mech@toth.org.uk

;  Copyright (C) 2000-2001  Daniel Maloney.  Excepting:
;  Contains code derived from: ylib by Claus Vohwinkel
;  Also the clever square root routine is originally by Jan Vlietinck

; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.

; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
; Library General Public License for more details.

; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
; Boston, MA 02111-1307, USA.


; *** Notes ***
; This code is designed and optimised solely for the SA110 processor.
; It deals with a non-standard floating point number
; representation for speed.  Basically the first 64-bits of ARM (v4)
; extended precision.  This has ramifications if you intend to link
; it with any library not compiled with this same system.  You might
; get away with a veneer (see the code for atof() at the end of this
; file).  Apart from that, deciding if and how to link to dooperfplib
; is left as an exercise for the reader.

; Note that the code makes no provision for infinity, NaNs, negative
; zero, denormalised numbers, or rounding.  Overflow/underflow is
; not detected, but fairly unlikely given the range of the exponent.
; I suggest you take all this into account before resorting to this
; system.

; If you are writing something from scratch I strongly suggest that
; you use fixed point maths, but you may find dooperfplib more useful
; when porting large amounts of code that already assumes an FPU
; (such as LAME).  The advantage obviously being that you don't have
; to alter the source much and so can keep your port up to date with
; subsequent releases more easily.

; This scheme is not all that elegant, but it ought to be good enough
; for most speed-critical uses and it is about as fast as possible in
; the absence of an FPU (except trig - see below).

; This file also should have been distributed with the companion BBC
; BASIC script "fpconv" which can convert all the double precision
; literals in source generated by ARM GCC v2.95.4 to dooperfplib
; format.

; Lastly: For some less common functions, dooperfplib relies on
; the floating point (FPA) instructions.  However this only really
; affects inverse trig and powers of negative numbers.

; My email address is: mech@toth.org.uk
; Thanks.


 AREA |__dooperfplib|, CODE, READONLY

; NOTE(review): some symbols exported below (__negdf2, fabsd, eq0df,
; __eqdf2, __nedf2, halftimes, twotimes) have their definitions commented
; out later in this file ("inlined" - they are expanded by fpconv instead).
; Verify the assembler/linker accepts EXPORTs of undefined symbols, or
; prune this list to match.

 EXPORT |__truncdfsf2|
 EXPORT |__extendsfdf2|

 EXPORT |multwopowx|
 EXPORT |m_frexp|
 EXPORT |__negdf2|
 EXPORT |fabsd|
 EXPORT |z_cutoff|
 EXPORT |eq0df|
 EXPORT |__eqdf2|
 EXPORT |__nedf2|
 EXPORT |__ledf2|
 EXPORT |__ltdf2|
 EXPORT |__gedf2|
 EXPORT |__gtdf2|
 EXPORT |__fixdfsi|
 EXPORT |__fixunsdfsi|
 EXPORT |__floatsidf|
 EXPORT |floor|
 EXPORT |rint|
 EXPORT |rintftoi|
 EXPORT |halftimes|
 EXPORT |twotimes|
 EXPORT |__squaredf|
 EXPORT |__muldf3|
 EXPORT |__adddf3|
 EXPORT |__subdf3|
 EXPORT |__divdf3|
 EXPORT |timesroot2|
 EXPORT |divroot2|
 EXPORT |sqrt|
 EXPORT |log|
 EXPORT |log10|
 EXPORT |exp|
 EXPORT |pow|
 EXPORT |m_hypot|
 EXPORT |cos|
 EXPORT |sin|
 EXPORT |tan|
 EXPORT |acos|
 EXPORT |asin|
 EXPORT |atan|
 EXPORT |m_atof|
 EXPORT |ifimul|

|__truncdfsf2|          ; Provided for completeness.  Not sure if this works correctly
                        ; casts dooperfplib 64-bit float to ISO single prec. float.
                        ; In:  r0 = sign(b31) + 15-bit exponent, r1 = mantissa
                        ;      (b31 = explicit leading one).  Out: r0 = IEEE single
                        ;      bit pattern.  Clobbers r1, r12.
 bic r12, r0, #1<<31    ; r12 = exponent only
 and r0, r0, #1<<31     ; r0 = sign bit only
 teq r1, #0
 moveq r15, r14         ; exit if number is zero
 adds r1, r1, #1<<7     ; round the mantissa to its top 24 bits
 addcs r12, r12, #1     ; rounding overflowed: bump the exponent (mantissa is now 1.0)
 orr r0, r0, r1, lsr #8 ; place the 24 mantissa bits
 bic r0, r0, #1<<23     ; drop the explicit leading one (IEEE's is implicit)
 sub r12, r12, #0x3F80  ; rebias exponent: 0x3FFF (extended) -> 0x7F (single)
 orr r0, r0, r12, lsl #23
 mov r15, r14


|__extendsfdf2|         ; Provided for completeness.  Not sure if this works correctly
                        ; casts ISO single prec. float to dooperfplib 64-bit sized float.
                        ; In:  r0 = IEEE single bit pattern.
                        ; Out: r0 = sign + exponent word, r1 = mantissa with
                        ;      explicit leading one in b31.
 movs r0, r0, lsl #1    ; shift the sign out into C; Z set if value is zero
 mov r1, r0, lsl #7     ; mantissa bits up to the top of r1
 orrne r1, r1, #1<<31   ; restore the implicit leading one (unless zero)
 mov r0, r0, lsr #24    ; r0 = 8-bit exponent
 addne r0, r0, #0x3F80  ; rebias exponent: 0x7F (single) -> 0x3FFF (extended)
 orrcs r0, r0, #1<<31   ; put the sign back
 mov r15, r14


;|__fixdfdi|             ; casts float to 64-bit integer
; bic r3, r0, #1<<31
; rsb r12, r3, #0x4000
; adds r12, r12, #0x3E
; ble fix_ovf
; cmp r12, #64
; bge dfix_unf
; cmp r12, #32
; bge fdsc               ; (hope I've got the endianness right)...
; teq r0, #0
; rsb r3, r12, #32
; mov r0, r1, lsl r3
; mov r1, r1, lsr r12
; movpl r15, r14
; rsbs r0, r0, #0
; rsc r1, r1, #0
; mov r15, r14

;fdsc
; subne r12, r12, #32
; teq r0, #0
; mov r0, r1, lsr #12
; mvnmi r1, #0
; movpl r1, #0
; rsbmi r0, r0, #0
; mov r15, r14


;|__floatdidf|           ; casts 64-bit integer to float
; orrs r2, r1, r0
; moveq r15, r14         ; exit if zero
; ands r3, r1, #1<<31
; bpl asda
; rsbs r0, r0, #0        ; abs
; rsc r1, r1, #0
;asda
; add r3, r3, #0x4000
; cmp r1, #0
; addeq r3, r3, #32
; moveq r1, r0
; moveq r0, r3
; beq qule
; sub r3, r3, #1
; mov r2, #0
; movs r12, r1, lsr #16   ; normalisation:
;  moveq r1, r1, lsl #16
;   addne r2, r2, #16
; movs r12, r1, lsr #24
;  moveq r1, r1, lsl #8
;   addne r2, r2, #8
; movs r12, r1, lsr #28
;  moveq r1, r1, lsl #4
;   addne r2, r2, #4
; movs r12, r1, lsr #30
;  moveq r1, r1, lsl #2
;   addne r2, r2, #2
; movs r12, r1, lsr #31
;  moveq r1, r1, lsl #1
;   addne r2, r2, #1
; rsb r12, r2, #32
; add r1, r1, r0, lsr r12
; add r0, r3, r2
; mov r15, r14


|__fixunsdfsi|          ; unsigned...  erm...  I think.  Not implemented.
                        ; Traps (SWI 263) on a negative argument, otherwise
                        ; falls through and behaves as the signed conversion.
 cmp r0, #0
 swimi 263

|__fixdfsi|             ; casts float to int
                        ; In:  (r0,r1) = float.  Out: r0 = int, truncated toward
                        ; zero, saturated to +/-0x7FFFFFFF-ish on overflow.
                        ; Clobbers r1, r3.
 bic r3, r0, #1<<31     ; r3 = exponent
 rsb r3, r3, #0x4000
 adds r3, r3, #0x1E     ; r3 = 0x401E - exponent = right shift required
 ble fix_ovf            ; exponent too big: saturate
 cmp r3, #32
 bge fix_unf            ; magnitude below 1: result is zero
 teq r0, #0             ; N := sign of the input
 mov r0, r1, lsr r3     ; discard the fraction bits
 rsbmi r0, r0, #0       ; negate if the input was negative
 mov r15, r14

fix_ovf                 ; saturation:
 mvn r1, #1<<31         ; r1 = 0x7FFFFFFF
 eor r0, r1, r0, asr #31 ; positive -> 0x7FFFFFFF, negative -> 0x80000000
 mov r15, r14

dfix_unf                ; 64-bit underflow entry (for the disabled __fixdfdi)
 mov r1,#0

fix_unf                 ; |x| < 1: integer result is zero
 mov r0, #0
 mov r15, r14


|__floatsidf|           ; casts int to float
                        ; In:  r0 = signed 32-bit int.
                        ; Out: (r0,r1) = float.  Clobbers r2.
 movs r1, r0
 moveq r15, r14         ; exit if zero
 ands r0, r0, #1<<31    ; keep the sign bit in r0
 rsbmi r1, r1, #0       ; abs
 add r0, r0, #0x4000    ; starting exponent, corrected by the stages below
qule                    ; (entry point used by the disabled __floatdidf)
 movs r2, r1, lsr #16   ; normalisation: binary search for the top set bit,
  moveq r1, r1, lsl #16 ;  shifting the mantissa up and accumulating the
   addne r0, r0, #16    ;  exponent as we go
 movs r2, r1, lsr #24
  moveq r1, r1, lsl #8
   addne r0, r0, #8
 movs r2, r1, lsr #28
  moveq r1, r1, lsl #4
   addne r0, r0, #4
 movs r2, r1, lsr #30
  moveq r1, r1, lsl #2
   addne r0, r0, #2
 movs r2, r1, lsr #31
  moveq r1, r1, lsl #1
   subeq r0, r0, #1     ; so that 1 -> (0x3FFF, 1<<31), i.e. 1.0
 mov r15, r14


|floor|
; teq r1, #0
; moveq r15, r14         ; exit if zero
                        ; NOTE(review): the fraction bits are simply masked off,
                        ; so for negative non-integers this truncates toward
                        ; zero, not toward minus infinity as ISO floor() would.
 bic r3, r0, #1<<31
 rsb r2, r3, #0x4000
 adds r2, r2, #0x1E     ; r2 = 0x401E - exponent = number of fraction bits
 movle r15, r14         ; exit if already integer
 cmp r2, #32
 bge zero_result        ; exit if zero result
 mov r1, r1, lsr r2     ; fraction bits removed by shifting back and forth
 mov r1, r1, lsl r2
 mov r15, r14


|rint|                  ; round to the nearest integer (halves round away from 0)
; teq r1, #0
; moveq r15, r14         ; exit if zero
 bic r3, r0, #1<<31
 rsb r2, r3, #0x4000
 adds r2, r2, #0x1E     ; r2 = number of fraction bits
 movle r15, r14         ; exit if already integer
 cmp r2, #32
 bgt zero_result        ; exit if zero result
 movs r1, r1, lsr r2    ; fraction bits removed by shifting back and forth;
 adc r1, r1, #0         ;  C holds the last bit shifted out, so this rounds
 movs r1, r1, lsl r2
 movcs r1, #1<<31       ; rounding carried out of the mantissa:
 addcs r0, r0, #1       ;  renormalise and bump the exponent
 mov r15, r14


|rintftoi|              ; round float to nearest, returning an int in r0
                        ; (saturates on overflow, 0 on underflow)
 bic r3, r0, #1<<31
 rsb r12, r3, #0x4000
 adds r12, r12, #0x1E   ; r12 = number of fraction bits
 ble fix_ovf            ; too big: saturate
 cmp r12, #32
 bgt fix_unf            ; too small: result 0
 movs r1, r1, lsr r12   ; C := last bit shifted out (the rounding bit)
 teq r0, #0             ; N := sign (teq with #0 leaves C untouched)
 adc r0, r1, #0         ; add the rounding bit
 rsbmi r0, r0, #0       ; negate if the input was negative
 mov r15, r14


|z_cutoff|              ; clamp: if x<0 then x=0, else x is returned unchanged
                        ; (r12, the APCS scratch register, is corrupted)
 mov r12, r0, asr #31   ; r12 = all ones when negative, all zeros otherwise
 bic r0, r0, r12        ; clear the sign+exponent word when negative
 bic r1, r1, r12        ; clear the mantissa too, giving a true zero
 mov r15, r14


;inlined;|eq0df|
;inlined; mov r0, r1
;inlined; mov r15, r14


;inlined;|__eqdf2|
;inlined;|__nedf2|
;inlined; eors r0, r0, r2
;inlined; eoreq r0, r1, r3
;inlined; mov r15, r14


|__gtdf2|               ; combined three-way compare for GCC's soft-float hooks:
|__ledf2|               ;  returns r0 negative / zero / positive as (a-b) would
|__gedf2|
|__ltdf2|
 mov r12, r0, lsl #1    ; compare magnitudes (sign bits shifted out):
 cmp r12, r2, lsl #1    ;  exponents first,
 cmpeq r1, r3           ;  then mantissas if the exponents matched
 mvncc r0, r2           ; |a|<|b|: result takes the sign opposite to b's
 subeq r0, r0, r2       ; equal magnitude: 0 if the signs agree, else +/-1<<31
 mov r15, r14           ; |a|>|b|: a's own sign word is already the right answer


;inlined;|__negdf2|
;inlined; eor r0, r0, #1<<31
;inlined; mov r15, r14


;inlined;|fabsd|
;inlined; bic r0, r0, #1<<31
;inlined; mov r15, r14


;inlined;|halftimes|
;inlined; movs r3, r0, lsl #1
;inlined; subne r0, r0, #1
;inlined; mov r15, r14


;inlined;|twotimes|
;inlined; movs r3, r0, lsl #1
;inlined; addne r0, r0, #1
;inlined; mov r15, r14


|__squaredf|            ; x*x: duplicate the operand and fall into multiply
 mov r2, r0
 mov r3, r1

|__muldf3|              ; (r0,r1) * (r2,r3)
 umull r12, r1, r3, r1  ; 64-bit mantissa product, high word in r1
 add r0, r0, r2         ; add the exponents and eor the sign (the carry out of
                        ;  bit 31 when both signs are set is simply lost)
 sub r0, r0, #0x3FC0    ; exponent correction (doubled bias removed here and below)
 movs r2, r1, lsl #1    ; test for normalisation
 sbc r0, r0, #0x3E      ; more exponent correction, combined with a decrement if normalisation needed
 movcs r15, r14         ; exit if normalised
 orrne r1, r2, r12, lsr #31 ; no point wasting the msb of r12 if we have to normalise
 movne r15, r14
 mov r0, #0             ; result is zero (a mantissa was zero, so the product was)
 mov r15, r14


|__subdf3|              ; (r0,r1) - (r2,r3): flip b's sign and add
 eor r2, r2, #1<<31     ; change sign to make subtraction

|__adddf3|              ; NB: the following code handles zero inputs Ok without checking for them here
 teq r0, r2             ; is it essentially an add or a subtract?
 bmi do_subdf           ; (differing signs -> magnitude subtract)
|__adddf3uns|           ; entry point when the signs are known to agree
 subs r12, r0, r2       ; which exponent was smaller?
 bgt Ba303_s
 rsb r12, r12, #0       ; make r12 positive (difference between exponents)
; cmp r12, #32           ; limit to a sensible value.  Only the bottom byte is used in a shift by a reg,
; movgt r12, #32         ; however r12>255 is unlikely unless r0=0 (when r1=0 anyway), so cheating may be possible
 adds r1, r3, r1, lsr r12 ; align fractions and add
 adc r0, r2, #0         ; make r0=exponent and increment on a carry
 movcc r15, r14         ; exit if the result did not carry (more likely than not)
 mov r1, r1, rrx        ; bring the carried bit back into the fraction
 mov r15, r14

Ba303_s                 ; second exponent was smaller
; cmp r12, #32           ; can exit now if r3 is shifted to zero.  Only the bottom byte is used in a shift by a reg,
; movgt r15, r14         ; however r12>255 is unlikely unless r2=0 (when r3=0 anyway), so cheating may be possible
 adds r1, r1, r3, lsr r12 ; align fractions and add
 movcc r15, r14         ; exit if result did not carry (more likely than not)
 add r0, r0, #1         ; increment exponent on a carry
 mov r1, r1, rrx        ; and bring the carried bit back into the fraction
 mov r15, r14

do_subdf                ; signs differed: this is a magnitude subtract
 eor r12, r2, #1<<31    ; make signs cancel out on the following instruction
 subs r12, r0, r12      ; which exponent was smaller?
 bgt Ba303_subs
 beq straight_subtract
 rsb r12, r12, #0       ; make r12 positive (difference between exponents)
; cmp r12, #32           ; limit to a sensible value.  Only the bottom byte is used in a shift by a reg,
; movgt r12, #32         ; however r12>255 is unlikely unless r2=0 (when r3=0 anyway), so cheating may be possible
 subs r1, r3, r1, lsr r12 ; align fractions and subtract
 mov r0, r2             ; result takes b's sign and exponent
 movmi r15, r14         ; exit if result is normalised
 sub r0, r0, #1         ; need to decrement the exponent
 movs r1, r1, lsl #1    ; and shift the fraction up one
 movmi r15, r14         ; exit if result is normalised now (most subtracts will be finished now)
 movs r12, r1, lsr #24  ; normalisation code.  Assumes small shift amounts are most likely:
 beq top8clear          ; top byte clear: at least 8 more shifts needed
 movs r12, r1, lsr #30  ; NE here means bit 30 set, leaving r12 = shift of 1
 adreq r12, ffs_table   ; otherwise look up the exact shift (2..7) by top byte
 ldreqb r12, [r12, r1, lsr #24]
 sub r0, r0, r12        ; exponent correction
 mov r1, r1, lsl r12    ; fraction normalisation
 mov r15, r14

straight_subtract       ; special case subtract for when exponents are equal
 subs r1, r1, r3
 eormi r0, r0, #1<<31   ; if the result is negative, correct the sign bit
 rsbmi r1, r1, #0       ;  ...and make the mantissa positive
 movs r1, r1, lsl #1    ; always normalise at least once
 sub r0, r0, #1         ; always decrement the exponent at least once
 movmi r15, r14         ; exit if result is normalised
 beq zero_result        ; exit if result is zero
 movs r12, r1, lsr #24  ; normalisation code.  Assumes small shift amounts are most common:
 beq top8clear          ; top byte clear: at least 8 more shifts needed
 movs r12, r1, lsr #30  ; NE here means bit 30 set, leaving r12 = shift of 1
 adreq r12, ffs_table   ; otherwise look up the exact shift (2..7) by top byte
 ldreqb r12, [r12, r1, lsr #24]
 sub r0, r0, r12        ; exponent correction
 mov r1, r1, lsl r12    ; fraction normalisation
 mov r15, r14

Ba303_subs              ; second exponent was smaller
; cmp r12, #32           ; can exit now if r3 is shifted to zero.  Only the bottom byte is used in a shift by a reg,
; movgt r15, r14         ; however r12>255 is unlikely unless r2=0 (when r3=0 anyway), so cheating may be possible
 subs r1, r1, r3, lsr r12 ; align fractions and subtract
 movmi r15, r14         ; exit if result is normalised
 sub r0, r0, #1         ; need to decrement the exponent
 movs r1, r1, lsl #1    ; and normalise the fraction
 movmi r15, r14         ; exit if result is normalised (most subtracts will be finished by now)
normalise               ; shared multi-bit normaliser: shift r1 up, adjust r0
 movs r12, r1, lsr #24  ; normalisation code.  Assumes small shift amounts are most common:
 beq top8clear          ; top byte clear: at least 8 more shifts needed
 movs r12, r1, lsr #30  ; NE here means bit 30 set, leaving r12 = shift of 1
 adreq r12, ffs_table   ; otherwise look up the exact shift (2..7) by top byte
 ldreqb r12, [r12, r1, lsr #24]
 sub r0, r0, r12        ; exponent correction
 mov r1, r1, lsl r12    ; fraction normalisation
 mov r15, r14

top8clear               ; top byte of the fraction is clear: shift 8+ places
 tst r1, #0x00ff0000
 beq top16clear
 movs r1, r1, lsl #8    ; at least 8 places
 sub r0, r0, #8
 movmi r15, r14         ; exit if now normalised
 movs r12, r1, lsr #30  ; remaining shift: 1 if bit 30 set, else from the table
 adreq r12, ffs_table
 ldreqb r12, [r12, r1, lsr #24]
 sub r0, r0, r12        ; exponent correction
 mov r1, r1, lsl r12    ; fraction normalisation
 mov r15, r14

top16clear              ; top 16 bits clear: shift 16+ places
 tst r1, #0x0000ff00
 beq top24clear
 movs r1, r1, lsl #16   ; at least 16 places
 sub r0, r0, #16
 movmi r15, r14         ; exit if now normalised
 movs r12, r1, lsr #30  ; remaining shift: 1 if bit 30 set, else from the table
 adreq r12, ffs_table
 ldreqb r12, [r12, r1, lsr #24]
 sub r0, r0, r12        ; exponent correction
 mov r1, r1, lsl r12    ; fraction normalisation
 mov r15, r14

top24clear              ; only the bottom byte can be set: shift 24+ places
 movs r1, r1, lsl #24   ; at least 24 places
 sub r0, r0, #24
 movmi r15, r14         ; exit if now normalised
 movs r12, r1, lsr #30  ; remaining shift: 1 if bit 30 set, else from the table
 adreq r12, ffs_table
 ldreqb r12, [r12, r1, lsr #24]
 sub r0, r0, r12        ; exponent correction
 mov r1, r1, lsl r12    ; fraction normalisation
 mov r15, r14


|__divdf3|              ; (r0,r1) / (r2,r3)
 ands r12, r1, r3       ; both mantissas have b31 set when non-zero, so this
 beq zero_in_div        ;  AND is zero iff either operand is zero
 teq r2, r0             ; do the signs differ?
 bicmi r0, r0, #1<<31   ; remove the sign if necessary
 sub r0, r0, r2         ; subtract the exponents...
 add r0, r0, #0x4000    ; ...and restore the bias
 sub r0, r0, #1
 orrmi r0, r0, #1<<31   ; put back the sign
 subs r2, r1, r3        ; first trial subtraction of the long division
 movcc r2, r1
 adc r1, r1, r1         ; shift the first quotient bit in
   GBLA divit
divit SETA 1
; unrolled shift-and-subtract division: one quotient bit per iteration
   WHILE divit < 31
      subs r12, r2, r3, lsr #divit
      movcs r2, r12
      adc r1, r1, r1
divit SETA divit+1
   WEND
 subs r2, r2, r3, lsr #31
 adcs r1, r1, r1        ; final (32nd) quotient bit
 movmi r15, r14         ; exit if normalised
 mov r1, r1, lsl #1     ; otherwise shift the quotient up
 sub r0, r0, #1         ;  and decrement the exponent
 mov r15, r14

zero_in_div             ; at least one operand of the divide was zero
 teq r3, #0
 beq div_by_zero        ; the divisor was the zero one
zero_result             ; shared tail: return 0.0
 mov r0, #0
 mov r1, #0
 mov r15, r14

div_by_zero             ; x/0: return the largest representable magnitude
 teq r0, r2             ;  (no infinities in this format), signed by the
 adr r0, bignum         ;  operands' sign difference
 ldmia r0, {r0, r1}
 orrmi r0, r0, #1<<31
 mov r15, r14


|timesroot2|            ; x * sqrt(2): mantissa * (sqrt(2)/2), exponent + 1
 ldr r3, SQX2
 umull r12, r1, r3, r1
 add r0, r0, #1
 movs r2, r1, lsl #1    ; test for normalisation
 movcs r15, r14         ; exit if normalised
 subne r0, r0, #1
 orrne r1, r2, r12, lsr #31 ; no point wasting the msb of r12 if we have to normalise
 movne r15, r14
 mov r0, #0
 mov r15, r14           ; result is zero


|divroot2|              ; x / sqrt(2): mantissa * (sqrt(2)/2), exponent unchanged
 ldr r3, SQX2
 umull r12, r1, r3, r1
 movs r2, r1, lsl #1    ; test for normalisation
 movcs r15, r14         ; exit if normalised
 subne r0, r0, #1
 orrne r1, r2, r12, lsr #31 ; no point wasting the msb of r12 if we have to normalise
 movne r15, r14
 mov r0, #0
 mov r15, r14           ; result is zero


 align 32

ffs_table               ; indexed by a mantissa's top byte (bits 31,30 known
                        ; clear, so indices 1..63 are used): the left-shift
                        ; count needed to bring the highest set bit up to b31
 dcb 8, 7, 6, 6, 5, 5, 5, 5
 dcb 4, 4, 4, 4, 4, 4, 4, 4
 dcb 3, 3, 3, 3, 3, 3, 3, 3
 dcb 3, 3, 3, 3, 3, 3, 3, 3
 dcb 2, 2, 2, 2, 2, 2, 2, 2
 dcb 2, 2, 2, 2, 2, 2, 2, 2
 dcb 2, 2, 2, 2, 2, 2, 2, 2
 dcb 2, 2, 2, 2, 2, 2, 2, 2

bignum & &7FFF, 1<<31   ; largest representable magnitude (the error return)

L10E & &3FFD, &DE5BD8A9  ; 1/ln(10)

SQX2  & &B504F334        ; sqrt(2) (mantissa only)
LGCO7 & &3FFC, &98AA3FC1 ; series coefficients for log, consumed from LGCO7
ZCO6  & &3FFC, &9CB03E63 ;  downwards through ZCO6..LOG2 in order
      & &3FFC, &BA34901B
      & &3FFC, &E38E20BB
      & &3FFD, &924924AE
      & &3FFD, &CCCCCCCD
      & &3FFE, &AAAAAAAB
      & &4000, &80000000
LOG2  & &3FFE, &B17217F8 ; log(2)

LOG2E & &3FFF, &B8AA3B29 ; log2(e); P2..Q0 below are exp's rational coefficients
P2    & &3FF9, &BD2E42AB
P1    & &4003, &A19DD499
P0    & &4009, &BD3D047F
Q1    & &4006, &E92F287B
Q0    & &400B, &8881B17C


|sqrt|                  ; Jan Vlietinck's amazing and super-fast sqrt algorithm
                        ; NOTE(review): the iteration scales the mantissa by
                        ; factors (1 + 2^-k) while r3 appears to track the square
                        ; of the accumulated factor; a final linear step corrects
                        ; the residue.  Confirm against the original write-up
                        ; before modifying.
 orrs r12, r1, r0, lsl #1 ; Z := (x == 0); C := sign of x
 moveq r15, r14         ; sqrt(0) = 0
 bcs inval_op           ; negative argument: domain error
 add r0, r0, #0x4000    ; halve the biased exponent;
 sub r0, r0, #1
 movs r0, r0, lsr #1    ;  C := the exponent bit shifted out
 movcc r1, r1, lsr #1   ;  exponent-parity compensation of the mantissa
 adds r3, r1, r1, lsr #1 ; first iteration, unrolled (k = 0)
 addccs r3, r3, r3, lsr #1
 movcs r3, r1
 addcc r1, r1, r1, lsr #1
  GBLA sqrtit
sqrtit SETA  1
; 16 further iterations, one bit of precision each
  WHILE sqrtit <= 16
 adds r2, r3, r3, lsr #sqrtit
 addccs r2, r2, r2, lsr #sqrtit
 movcc r3, r2
 addcc r1, r1, r1, lsr #sqrtit
sqrtit SETA  sqrtit+1
  WEND
 mov r2, #1<<31
 sub r3, r2, r3, lsr #1      ; (1-r3)/2
 movs r2, r1, lsr #16
 mulne r2, r3, r2
 add r1, r1, #1
 add r1, r1, r2, lsr #16     ; 1+r1+r1*(1-r3)/2   ; final linear correction
 mov r15, r14

inval_op                ; domain error: return the biggest representable number
 adr r0, bignum
 ldmia r0, {r0, r1}
 mov r15, r14


|log10|                 ; Algorithm from ylib: log10(x) = ln(x) * (1/ln(10))
 cmp r0, #0
 ble inval_op           ; reject x <= 0 (sign bit set or zero exponent word)
 stmfd r13!, {r14}
 bl logb                ; f0 = ln(x)
 adr r2, L10E
 ldmia r2, {r2, r3}     ; ldfd f1, L10E        ; 1/ln(10)
 ldmfd r13!, {r14}
 b __muldf3             ; fmld f0, f0, f1


|log|                   ; Algorithm from ylib
                        ; ln(x): split x = m * 2^n with the mantissa range
                        ; centred using sqrt(2), evaluate a series in
                        ; t = (m-1)/(m+1), then add n*ln(2).
 cmp r0, #0
 ble inval_op           ; reject x <= 0
logb                    ; internal entry, no argument check (pow/log10 use it)
 stmfd r13!, {r4 - r10, r14}
 mov r4, #0xff
 orr r4, r4, #0x3f00    ; r4 = 0x3FFF, the exponent of 1.0
 sub r6, r0, r4         ; r6 = n, the integer exponent of x
 ldr r3, SQX2           ; compare the mantissa against sqrt(2)'s
 mov r5, r1
 cmp r3, r1
 mov r2, r4
 addgt r6, r6, #1       ; adjusting exponents
 subgt r4, r4, #1
 mov r0, r4
 mov r3, #1<<31         ; (r2,r3) = 1.0
 bl __adddf3uns         ; adfd f1, f0, #1      ; m+1
 mov r9, r0
 mov r10, r1
 mov r0, r4
 mov r1, r5
 bl __subdf3            ; sufd f0, f0, #1      ; m-1
 mov r2, r9
 mov r3, r10
 bl __divdf3            ; frdd f0, f1, f0
 mov r4, r0             ; f0 = (f0-1)/(f0+1)
 mov r5, r1
 bl __squaredf          ; fmld f1, f0, f0      ; (r9,r10) = t^2
 mov r9, r0
 mov r10, r1
 adr r7, LGCO7
 ldmia r7!, {r2, r3}    ; ldfd f2, [r7], #8
 bl __muldf3            ; fmld f2, f1, f2
 ldmia r7!, {r2, r3}    ; ldfd f3, [r7], #8
 bl __adddf3uns         ; adfd f2, f2, f3
 mov r8, #5
LG2                     ; loop 5x: Horner evaluation of the series in t^2
 umull r12, r1, r10, r1 ; fmld f2, f2, f1      ; inlined multiply by t^2:
 ldmia r7!, {r2, r3}    ; ldfd f3, [r7], #8
 add r0, r0, r9         ; exponent
 sub r0, r0, #0x3FC0
 movs r12, r1, lsl #1   ; (NB: zero never happens here)
 sbc r0, r0, #0x3E
 movcc r1, r12          ;  renormalise if the product needed it
 bl __adddf3uns         ; adfd f2, f2, f3
 subs r8, r8, #1
  bpl LG2               ; end of loop
 mov r2, r4
 mov r3, r5
 bl __muldf3            ; fmld f0, f2, f0      ; series * t
 teq r6, #0
 beq finish_log         ; no power-of-two part to add in
 mov r4, r0
 mov r5, r1
 mov r0, r6
 bl __floatsidf         ; fltd f1, r6
 ldmia r7, {r2, r3}     ; ldfd f3, [r7]        ; ln(2)
 bl __muldf3            ; fmld f1, f1, f3      ; n * ln(2)
 mov r2, r4
 mov r3, r5
 ldmfd r13!, {r4 - r10, r14}
 b __adddf3             ; adfd f0, f0, f1

finish_log
 ldmfd r13!, {r4 - r10, r15}


POWUSU                  ; for difficult powers: x^y = exp(y * ln(x))
 bl logb                ; f0 = ln(x) (argument already known positive)
 mov r2, r6             ; (r6,r7) = y, stashed by pow
 mov r3, r7
 bl __muldf3            ; y * ln(x)
 ldmia r13!, {r4 - r8, r14} ; fall through to exp...->


|exp|                   ; Calculates e^x.  Algorithm from ylib
                        ; e^x = 2^(x*log2(e)); split into integer part n (r9)
                        ; and fraction f, compute 2^f by a rational
                        ; approximation, then add n to the result's exponent.
 teq r1, #0
 beq pz                 ; e^0 = 1
 stmfd r13!, {r4 - r9, r14}
 adr r4, LOG2E
 ldmia r4!, {r2, r3}    ; ldfd f1, [r4], #8
 bl |__muldf3|          ; fmld f0, f0, f1      ; x * log2(e)
 bic r3, r0, #1<<31
 rsb r3, r3, #0x4000
 adds r3, r3, #0x1E     ; r3 = number of fraction bits
 ble inval_op_exp       ; overflow
 cmp r3, #32
 movgt r3, #32
 mov r9, r1, lsr r3     ; lose fraction bits   ; r9 = integer part n
 cmp r9, #0x4000
 bge inval_op_exp       ; overflow
 teq r0, #0
 rsbmi r9, r9, #0       ; n takes the argument's sign
 rsbs r2, r3, #32
 biclt r2, r2, r2, asr #31 ; clamp the shift at zero
 movs r6, r1, lsl r2    ; lose integer bits    ; (r5,r6) will be f
 beq nofraction         ; result is whole power of two
 umull r12, r8, r6, r6  ; fmld f1, f0, f0      ; inlined square: f^2
 sub r5, r0, r2         ; fix f's exponent after the shift
 add r7, r5, r5         ; f^2 exponent: double it,
 ldmia r4!, {r2, r3}    ; ldfd f2, [r4], #8   ; P2
 sub r7, r7, #0x3FC0    ;  remove the doubled bias,
 movs r0, r8, lsl #1
 sbc r7, r7, #0x3E
 orrcc r8, r0, r12, lsr #31 ;  and renormalise if needed
 mov r0, r7
 mov r1, r8
 bl __muldf3            ; fmld f2, f1, f2
 ldmia r4!, {r2, r3}    ; ldfd f3, [r4], #8   ; P1
 bl __adddf3uns         ; adfd f2, f2, f3
 mov r2, r7
 mov r3, r8
 bl __muldf3            ; fmld f2, f2, f1
 ldmia r4!, {r2, r3}    ; ldfd f3, [r4], #8   ; P0
 bl __adddf3uns         ; adfd f2, f2, f3
 mov r2, r5
 mov r3, r6
 bl __muldf3            ; fmld f0, f2, f0      ; (r5,r6) = f * P(f^2)
 mov r5, r0
 mov r6, r1
 ldmia r4!, {r0, r1}    ; ldfd f2, [r4], #8    ; Q1
 mov r2, r7
 mov r3, r8
 bl __adddf3uns         ; adfd f2, f2, f1
 bl __muldf3            ; fmld f2, f2, f1
 ldmia r4!, {r2, r3}    ; ldfd f1, [r4], #8    ; Q0
 bl __adddf3uns         ; adfd f2, f2, f1      ; Q(f^2)
 mov r7, r0
 mov r8, r1
 mov r2, r5
 mov r3, r6
 bl __subdf3            ; sufd f1, f2, f0      ; Q - fP
 eor r2, r2, #1<<31     ; (subtraction changed the sign of r2)
 mov r5, r0
 mov r6, r1
 mov r0, r7
 mov r1, r8
 bl __adddf3            ; adfd f0, f2, f0      ; Q + fP
 mov r2, r5
 mov r3, r6
 add r0, r0, r9         ; adjust the final exponent (before tail routine)
 ldmfd r13!, {r4 - r9, r14}
 b __divdf3             ; fdvd f0, f0, f1      ; 2^f = (Q+fP)/(Q-fP)

inval_op_exp            ; out of range: unstack and return bignum
 ldmia r13!, {r4 - r9, r14}
 b inval_op

nofraction              ; x*log2(e) was exactly the integer n: result is 2^n
 mov r0, #0x3F00
 orr r0, r0, #0xFF 
 add r0, r0, r9         ; exponent of 1.0, plus n
 mov r1, #1<<31
 ldmfd r13!, {r4 - r9, r15}


|pow|                   ; Calculates powers.  Algorithm from ylib
                        ; In: (r0,r1) = x, (r2,r3) = y.
 teq r1, #0
 beq zero_result        ; 0^x
 teq r3, r0, lsr #32    ; Z := (y mantissa zero); C := sign of x (lsr #32 -> 0)
 beq pz                 ; numbers to the power zero
 bcs POWNEG             ; powers of negative numbers
 stmfd r13!, {r4 - r8, r14}
 mov r6, r2             ; stash y in (r6,r7)
 mov r7, r3
 bic r12, r2, #1<<31
 sub r12, r12, #0x4000  ; r12 = y's exponent word - 0x4000
                        ; NOTE(review): the next two tests overlap (r12 <= 0x1D
                        ; and r12 >= 24 between them cover every value), so the
                        ; integer fast path below looks unreachable and every
                        ; positive-x power goes via exp(y*ln x).  Verify whether
                        ; this is intended or a threshold slip.
 cmp r12, #0x1D
 ble POWUSU             ; non-integer powers
 cmp r12, #24
 bge POWUSU             ; powers greater than about 2^26
 mov r4, r0             ; (r4,r5) = x
 mov r5, r1
 mov r0, r6
 mov r1, r7
 bl __fixdfsi           ; r8 = int(f1)
 movs r8, r0
 rsbmi r8, r8, #0       ; make power positive
 bpl skipinvpow
 mov r2, r4             ; find reciprocal if power was -ve
 mov r3, r5
 mov r0, #0x3F00
 orr r0, r0, #0xFF      ; 1.0
 mov r1, #1<<31
 bl __divdf3            ; 1/f1
 mov r4, r0
 mov r5, r1
skipinvpow
 mov r6, #0x3F00        ; #1
 orr r6, r6, #0xFF
 mov r7, #1<<31

powloop                 ; binary exponentiation: (r6,r7) = result,
                        ; (r4,r5) = repeatedly-squared x, r8 = remaining bits
 movs r8, r8, lsr #1    ; C := next bit of the power
 bcc skipmpow
 umull r0, r7, r5, r7   ; fmlcsd f0, f0, f1    ; result *= current square
 add r6, r6, r4         ; exponent
 sub r6, r6, #0x3FC0
 movs r12, r7, lsl #1   ; zero never happens
 sbc r6, r6, #0x3E
 orrcc r7, r12, r0, lsr #31
 teq r8, #0
skipmpow
  beq endpow            ; no bits of the power left
 umull r0, r1, r5, r5   ; fmld f1, f1, f1      ; square the square
 add r4, r4, r4         ; exponent
 sub r4, r4, #0x3FC0
 sub r4, r4, #0x3E
 movs r5, r1
 bmi powloop            ; if normalised, loop
 movs r5, r5, lsl #1
 subne r4, r4, #1       ; needed one shift to normalise
 bne powloop
 mov r4, #0             ; =0
 b powloop

endpow
 mov r0, r6
 mov r1, r7
 ldmia r13!, {r4 - r8, r15}

pz                      ; handle x^0 (exp uses this for e^0 too): return 1.0
 mov r0, #0x3F00
 orr r0, r0, #0xFF
 mov r1, #1<<31         ; =1
 mov r15, r14


                        ; The following code relies upon floating point instructions...
                        ; The following code relies upon floating point instructions...
POWNEG                  ; power of a negative base: pad both operands out to
                        ; extended precision (low word = 0) on the stack and let
                        ; the FPA/FPEmulator's POW instruction do the work.
 mov r12, #0
 stmfd r13!, {r2, r3, r12, r14}
 stmfd r13!, {r0, r1, r12}
  lfm f0, 2, [r13], #24 ; f0, f1
  powdz f0, f0, f1      ; Let FPEmulator take the strain
  sfm f0, 1, [r13, #-12]!
 ldmia r13!, {r0 - r2, r15} ; first 64 bits of the result = our format


|multwopowx|            ; x * 2^n (r2 = integer n): just bump the exponent.
                        ; NB: no special case for x == 0 (whose exponent word
                        ; would be perturbed) - callers must avoid that.
 add r0, r0, r2
 mov r15, r14


|m_frexp|               ; veneer: convert our format to an IEEE double via the
                        ; FPA, call the external frexp(), convert back
 mov r12, #0            ; pad word: our 64 bits + 0 = an extended-precision value
 stmfd r13!, {r0, r1, r12, r14}
  ldfe f0, [r13], #12   ; load as extended precision
  stfd f0, [r13, #-8]!  ; restore as an IEEE double for the callee
 ldmia r13!, {r0, r1}
 bl |frexp|             ; external double-precision frexp
  stfe f0, [r13, #-12]! ; back to extended on the stack...
 ldmfd r13!, {r0, r1, r12, r15} ; ...whose first 64 bits are our format


|m_hypot|               ; sqrt(x^2 + y^2), with both squarings inlined
 movs r12, r3           ; y mantissa zero?
 beq isarg1             ;  -> result is |x|
 str r14, [r13, #-4]!
 movs r14, r1           ; x mantissa zero?
 umullne r3, r1, r14, r14 ; x mantissa squared (started early for the pipeline)
 beq isarg2             ;  -> result is |y|
 mov r0, r0, lsl #1     ; double the exponent and lose the sign
 sub r0, r0, #0x3FC0    ; exponent correction
 movs r14, r1, lsl #1   ; test for denormalisation
 movcc r1, r14          ; if carry clear, it needed normalising
 umull r14, r3, r12, r12 ; y mantissa squared
 sbc r0, r0, #0x3E      ; more exponent correction, combined with a decrement on normalisation
 mov r2, r2, lsl #1     ; double the exponent and lose the sign
 sub r2, r2, #0x3FC0    ; exponent correction
 movs r12, r3, lsl #1   ; test for normalisation
 movcc r3, r12          ; if carry clear, it needed normalising
 sbc r2, r2, #0x3E      ; more exponent correction, combined with a decrement on normalisation
 bl __adddf3            ; x^2 + y^2
 ldr r14, [r13], #4
 b sqrt

isarg1                  ; hypot(x, 0) = |x|
 bic r0, r0, #1<<31
 mov r15, r14

isarg2                  ; hypot(0, y) = |y|
 bic r0, r2, #1<<31
 mov r1, r3
 ldr r15, [r13], #4


|cos|                   ; cos(x) = sin(pi * (0.5 - x/pi)): shares sin's evaluator
 stmfd r13!, {r4 - r8, r14}
 adr r12, OPI
 ldmia r12, {r2, r3}
 bl |__muldf3|          ; mufd f0, f0, f1      ; u = x * (1/pi)
 mov r2, #0x3f00
 orr r2, r2, #0xfe      ; (r2,r3) = 0.5
 mov r3, #1<<31
 eor r0, r0, #1<<31
 bl |__adddf3|          ; rsfd f0, f0, #0.5    ; u = 0.5 - u
 b YSIN1a


|sin|                   ; sin(x): u = x/pi (half-turns), reduce to |t| <= 0.5,
                        ; then evaluate the polynomial sCO8..sCO0 in t
 stmfd r13!, {r4 - r8, r14}
 adr r12, OPI
 ldmia r12, {r2, r3}
 bl |__muldf3|          ; mufd f0, f1, f0      ; u = x * (1/pi)
YSIN1a
 stmfd r13!, {r0, r1}   ; stack u
 orrs r2, r1, r0, lsl #1 ; zero test, dropping the sign
 bicne r2, r0, #1<<31
 subne r2, r2, #0x3C00
 mov r4, r2, lsl #21    ; r4 = magnitude class of u, tested below
 bl |rint|              ; rndd f1, f0          ; n = round(u)
 cmp r4, #&83000000
 bhi INVARG             ; |u| too big for the quick integer fix (~2^25 and up)
 mov r5, r0
 mov r6, r1
 bl |__fixdfsi|         ; fixz r4, f1
 mov r4, r0             ; r4 = n as an integer
 ldmia r13, {r0, r1}
 mov r2, r5
 mov r3, r6
 bl |__subdf3|          ; sufd f0, f0, f1      ; t = u - n, |t| <= 0.5
 tst r4, #1
 eorne r0, r0, #1<<31   ; mnfned f0, f0        ; odd half-turn flips the sign
 stmia r13, {r0, r1}
SIN1                    ; evaluate t * poly(t^2)
 mov r5, #7
 adr r6, sCO8
 bl |__squaredf|        ; mufd f1, f0, f0
 mov r7, r0
 mov r8, r1
 ldmia r6!, {r0, r1}    ; ldfd f2, [r6], #8

SIN2                    ; Horner loop over the nine coefficients
 mov r2, r7
 mov r3, r8
 bl |__muldf3|          ; mufd f2, f2, f1
 ldmia r6!, {r2, r3}    ; ldfd f3, [r6], #8
 bl |__adddf3|          ; adfd f2, f2, f3
 subs r5, r5, #1
 bpl SIN2

 ldmia r13!, {r2 - r8, r14} ; (r2,r3) = the stacked t
 b |__muldf3|           ; mufd f0, f0, f2


INVARG                  ; slow reduction for large |u|: compute n mod 2 in
                        ; floating point so the integer fix cannot overflow
 mov r4, r0             ; (r4,r5) = n, the rounded u
 mov r5, r1
 ldmia r13, {r2, r3}
 eor r0, r0, #1<<31
 bl |__adddf3|          ; sufd f0, f0, f1      ; t = u - n
 mov r6, r0
 mov r7, r1
 movs r0, r4, lsl #1
 subne r0, r4, #1       ; mufd f2, f1, #0.5    ; = -1  -.5  0  .5  1
 mov r1, r5
 bl |rint|              ; rndd f2, f2          ; = -1  -1/0 0  0/1  1
 mov r2, r0
 mov r3, r1
 movs r0, r4, lsl #1
 subne r0, r4, #1       ; mufd f1, f1, #0.5    ; = -1  -.5  0  .5  1
 mov r1, r5
 bl |__subdf3|          ; sufd f1, f1, f2      ; = 0   +-.5 0  +-.5 0
 movs r12, r0, lsl #1
 addne r0, r0, #1       ; mufd f2, f1, #2      ; = 0   +-1  0  +-1  0
 bl |__fixdfsi|         ; fix r4, f1           ; r0 = n mod 2, as 0 or +-1
 tst r0, #1
 eorne r6, r6, #1<<31   ; mnfned f0, f0        ; odd half-turn flips the sign
 stmia r13, {r6, r7}
 b SIN1

OPI  & &00003FFD, &A2F9836E ; 1/PI
sCO8 & &00003FEA, &CE789708 ; sin polynomial coefficients (argument in
sCO7 & &80003FEF, &B7BC195F ;  half-turns); bit 31 of the first word carries
sCO6 & &00003FF3, &F479AB60 ;  the coefficient's sign
sCO5 & &80003FF7, &F183A6E2
sCO4 & &00003FFB, &A83C1A42
sCO3 & &80003FFE, &99696673
sCO2 & &00004000, &A335E33C
sCO1 & &80004001, &A55DE731
sCO0 & &00004000, &C90FDAA2 ; = pi


;|cos|
; mov r12, #0
; stmfd r13!, {r0, r1, r12, r14}
;  ldfe f0, [r13], #12
;  cosdz f0, f0
;  stfe f0, [r13, #-12]!
; ldmfd r13!, {r0 - r2, r15}


;|sin|
; mov r12, #0
; stmfd r13!, {r0, r1, r12, r14}
;  ldfe f0, [r13], #12
;  sindz f0, f0
;  stfe f0, [r13, #-12]!
; ldmfd r13!, {r0 - r2, r15}


|tan|                   ; via the FPA/FPEmulator: pad our 64 bits to an
                        ; extended-precision value, TAN it, take the top 64 back
 mov r12, #0
 stmfd r13!, {r0, r1, r12, r14}
  ldfe f0, [r13], #12
  tandz f0, f0
  stfe f0, [r13, #-12]!
 ldmfd r13!, {r0 - r2, r15}


|asin|                  ; via the FPA/FPEmulator (see tan for the scheme)
 mov r12, #0
 stmfd r13!, {r0, r1, r12, r14}
  ldfe f0, [r13], #12
  asndz f0, f0
  stfe f0, [r13, #-12]!
 ldmfd r13!, {r0 - r2, r15}


|acos|                  ; via the FPA/FPEmulator (see tan for the scheme)
 mov r12, #0
 stmfd r13!, {r0, r1, r12, r14}
  ldfe f0, [r13], #12
  acsdz f0, f0
  stfe f0, [r13, #-12]!
 ldmfd r13!, {r0 - r2, r15}


|atan|                  ; via the FPA/FPEmulator (see tan for the scheme)
 mov r12, #0
 stmfd r13!, {r0, r1, r12, r14}
  ldfe f0, [r13], #12
  atndz f0, f0
  stfe f0, [r13, #-12]!
 ldmfd r13!, {r0 - r2, r15}


|m_atof|                ; atof veneer: the external strtod returns its result in
                        ; f0 (the APCS FP result register); convert to our format
 stmfd r13!, {r14}
 bl |strtod|
  stfe f0, [r13, #-12]! ; store as extended precision...
 ldmfd r13!, {r0 - r2, r15} ; ...and keep the first 64 bits = our format


ifimul                  ; int(x * i): multiply the float (r0,r1) by the 32-bit
                        ; unsigned integer in r2 and return a rounded, saturated
                        ; signed integer in r0 (the sign comes from the float)
 umull r12, r2, r1, r2  ; 64-bit product of mantissa and integer
 bic r3, r0, #1<<31
 rsb r3, r3, #0x4000
 adds r3, r3, #0x1E     ; r3 = right shift needed on the 64-bit product
 ble fix_ovf            ; too big: saturate
 cmp r3, #32
 blt ifiuselowrd
 cmp r3, #64
 bge fix_unf            ; too small: result 0
 sub r3, r3, #32        ; shift within the high word only
 movs r1, r2, lsr r3    ; C := rounding bit (C unchanged when r3 = 0)
 teq r0, #0             ; N := sign of the float
 rsbmi r1, r1, #0
 adc r0, r1, #0         ; add the rounding bit
                        ; NOTE(review): the rounding bit is added after negating
                        ; here, unlike rintftoi which rounds before negating -
                        ; verify the intended rounding for negative results
 mov r15, r14

ifiuselowrd             ; result needs bits from both product words
 movs r1, r12, lsr r3   ; C := rounding bit from the low word
 rsb r12, r3, #32
 orr r1, r1, r2, lsl r12 ; combine the low and high parts
 addcs r1, r1, #1       ; round
 mov r2, r2, lsr r3
 orrs r2, r2, r1, asr #31 ; anything left above bit 30?
 bne fix_ovf            ; yes: saturate
 teq r0, #0             ; N := sign of the float
 rsbmi r1, r1, #0
 mov r0, r1
 mov r15, r14

 END
