Changeset 96240 in vbox for trunk/src/VBox/Runtime/common/math/cos.asm
- Timestamp:
- Aug 17, 2022 12:59:31 AM (3 years ago)
- svn:sync-xref-src-repo-rev:
- 153052
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/VBox/Runtime/common/math/cos.asm
r96060 r96240 25 25 ; 26 26 27 28 %define RT_ASM_WITH_SEH64 27 29 %include "iprt/asmdefs.mac" 30 %include "iprt/x86.mac" 31 28 32 29 33 BEGINCODE 30 34 31 35 ;; 32 ; compute the cosine of dr, measured in radians. 33 ; @returns st(0) / xmm0 36 ; Compute the cosine of rd, measured in radians. 37 ; 38 ; @returns st(0) / xmm0 34 39 ; @param rd [rbp + xCB*2] / xmm0 40 ; 35 41 RT_NOCRT_BEGINPROC cos 36 push xBP 37 mov xBP, xSP 42 push xBP 43 SEH64_PUSH_xBP 44 mov xBP, xSP 45 SEH64_SET_FRAME_xBP 0 46 sub xSP, 20h 47 SEH64_ALLOCATE_STACK 20h 48 SEH64_END_PROLOGUE 49 50 %ifdef RT_OS_WINDOWS 51 ; 52 ; Make sure we use full precision and not the windows default of 53 bits. 53 ; 54 ;; @todo not sure if this makes any difference... 55 fnstcw [xBP - 20h] 56 mov ax, [xBP - 20h] 57 or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask. 58 mov [xBP - 1ch], ax 59 fldcw [xBP - 1ch] 60 %endif 61 62 ; 63 ; Load the input into st0. 64 ; 38 65 %ifdef RT_ARCH_AMD64 39 sub xSP, 10h 40 41 movsd [xSP], xmm0 42 fld qword [xSP] 66 movsd [xBP - 10h], xmm0 67 fld qword [xBP - 10h] 43 68 %else 44 fld qword [xBP + xCB*2] 45 %endif 46 fcos 47 fnstsw ax 48 test ah, 4 49 jz .done 50 51 fldpi 52 fadd st0, st0 53 fxch st1 54 .again: 55 fprem1 56 fnstsw ax 57 test ah, 4 58 jnz .again 59 60 fstp st0 61 fcos 62 63 .done: 69 fld qword [xBP + xCB*2] 70 %endif 71 72 ; 73 ; The FCOS instruction has a very narrow range (-3pi/8 to 3pi/8) where it 74 ; works reliably, so outside that we'll use the FSIN instruction instead 75 ; as it has a larger good range (-5pi/4 to 1pi/4 for cosine). 76 ; Input conversion follows: cos(x) = sin(x + pi/2) 77 ; 78 ; We examin the input and weed out non-finit numbers first. 79 ; 80 81 ; We only do the range check on normal finite numbers. 82 fxam 83 fnstsw ax 84 and ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0 85 cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero) 86 je .finite 87 cmp ax, X86_FSW_C3 ; Zero 88 je .zero 89 cmp ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero. 90 je .zero 91 cmp ax, X86_FSW_C0 ; NaN - must handle it special, 92 je .nan 93 94 ; Pass infinities and unsupported inputs to fcos, assuming it does the right thing. 95 ; We also jump here if we get a finite number in the "good" range, see below. 96 .do_fcos: 97 fcos 98 jmp .return_val 99 100 ; 101 ; Finite number. 102 ; 103 ; First check if it's a very tiny number where we can simply return 1. 104 ; Next check if it's in the range where FCOS is reasonable, otherwise 105 ; go to FSIN to do the work. 106 ; 107 .finite: 108 fld st0 109 fabs 110 fld qword [.s_r64TinyCosTo1 xWrtRIP] 111 fcomip st1 112 jbe .zero_extra_pop 113 114 .not_that_tiny_input: 115 fld qword [.s_r64FCosOkay xWrtRIP] 116 fcomip st1 117 ffreep st0 ; pop fabs(input) 118 ja .do_fcos ; jmp if fabs(input) < .s_r64FCosOkay 119 120 ; 121 ; If we have a positive number we subtract 3pi/2, for negative we add pi/2. 122 ; We still have the FXAM result in AX. 123 ; 124 .outside_fcos_range: 125 test ax, X86_FSW_C1 ; The sign bit. 126 jnz .adjust_negative_to_sine 127 128 ; Calc -3pi/2 using FPU-internal pi constant. 129 fldpi 130 fadd st0, st0 ; st0=2pi 131 fldpi 132 fdiv qword [.s_r64Two xWrtRIP] ; st1=2pi; st0=pi/2 133 fsubp st1, st0 ; st0=3pi/2 134 fchs ; st0=-3pi/2 135 jmp .make_sine_adjustment 136 137 .adjust_negative_to_sine: 138 ; Calc +pi/2. 139 fldpi 140 fdiv qword [.s_r64Two xWrtRIP] ; st1=2pi; st0=pi/2 141 142 .make_sine_adjustment: 143 faddp st1, st0 144 145 ; 146 ; Call internal sine worker to calculate st0=sin(st0) 147 ; 148 .do_sine: 149 mov ecx, 1 ; double 150 extern NAME(rtNoCrtMathSinCore) 151 call NAME(rtNoCrtMathSinCore) 152 153 ; 154 ; Return st0. 155 ; 156 .return_val: 64 157 %ifdef RT_ARCH_AMD64 65 fstp qword [xSP] 66 movsd xmm0, [xSP] 67 %endif 68 leave 69 ret 158 fstp qword [xBP - 10h] 159 movsd xmm0, [xBP - 10h] 160 %endif 161 %ifdef RT_OS_WINDOWS 162 fldcw [xBP - 20h] ; restore original 163 %endif 164 .return: 165 leave 166 ret 167 168 ; 169 ; cos(+/-0) = +1.0 170 ; 171 .zero_extra_pop: 172 ffreep st0 173 .zero: 174 ffreep st0 175 fld1 176 jmp .return_val 177 178 ; 179 ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN 180 ; to QNaN when masked). 181 ; 182 .nan: 183 %ifdef RT_ARCH_AMD64 184 ffreep st0 185 %endif 186 jmp .return 187 188 ; 189 ; Local constants. 190 ; 191 ALIGNCODE(8) 192 ; About 2**-27. When fabs(input) is below this limit we can consider cos(input) ~= 1.0. 193 .s_r64TinyCosTo1: 194 dq 7.4505806e-9 195 196 ; The absolute limit for the range which FCOS is expected to produce reasonable results. 197 .s_r64FCosOkay: 198 dq 1.1780972450961724644225 ; 3*pi/8 199 200 .s_r64Two: 201 dq 2.0 70 202 ENDPROC RT_NOCRT(cos) 71 203
Note:
See TracChangeset
for help on using the changeset viewer.