VirtualBox

source: vbox/trunk/include/iprt/asm-math.h@52455

Last change on this file since 52455 was 52455, checked in by vboxsync, 11 years ago

Updated ASMMultU32ByU32DivByU32 and ASMMultU64ByU32DivByU32 documentation. (Don't put unscoped implementation details in the function description, or at least not immediately below the function brief!)

/** @file
 * IPRT - Assembly Routines for Optimizing some Integer Math Operations.
 */

/*
 * Copyright (C) 2006-2010 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */
#ifndef ___iprt_asm_math_h
#define ___iprt_asm_math_h

#include <iprt/types.h>

#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
# include <intrin.h>
  /* Emit the intrinsics at all optimization levels. */
# pragma intrinsic(__emul)
# pragma intrinsic(__emulu)
# ifdef RT_ARCH_AMD64
#  pragma intrinsic(_mul128)
#  pragma intrinsic(_umul128)
# endif
#endif


/** @defgroup grp_rt_asm_math Integer Math Optimizations
 * @ingroup grp_rt_asm
 * @{ */

/**
 * Multiplies two unsigned 32-bit values returning an unsigned 64-bit result.
 *
 * @returns u32F1 * u32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
#else
DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
{
# ifdef RT_ARCH_X86
    uint64_t u64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("mull %%edx"
                         : "=A" (u64)
                         : "a" (u32F2), "d" (u32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    u64 = __emulu(u32F1, u32F2);
#  else
    __asm
    {
        mov     edx, [u32F1]
        mov     eax, [u32F2]
        mul     edx
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif
    return u64;
# else /* generic: */
    return (uint64_t)u32F1 * u32F2;
# endif
}
#endif
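
/*
 * Editorial usage sketch, not part of the original header: shows why the
 * widening multiply matters. A plain 32-bit multiply wraps modulo 2^32,
 * while ASMMult2xU32RetU64 keeps all 64 product bits. The function name and
 * values below are hypothetical illustrations.
 */
#if 0 /* illustration only */
static void ExampleMult2xU32(void)
{
    uint32_t u32A  = UINT32_MAX;                    /* 0xffffffff */
    uint64_t u64Ok = ASMMult2xU32RetU64(u32A, 2);   /* 0x00000001fffffffe */
    uint64_t u64No = (uint64_t)(u32A * 2U);         /* 0x00000000fffffffe - wrapped! */
}
#endif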


/**
 * Multiplies two signed 32-bit values returning a signed 64-bit result.
 *
 * @returns i32F1 * i32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
#else
DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
{
# ifdef RT_ARCH_X86
    int64_t i64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("imull %%edx"
                         : "=A" (i64)
                         : "a" (i32F2), "d" (i32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    i64 = __emul(i32F1, i32F2);
#  else
    __asm
    {
        mov     edx, [i32F1]
        mov     eax, [i32F2]
        imul    edx
        mov     dword ptr [i64], eax
        mov     dword ptr [i64 + 4], edx
    }
#  endif
    return i64;
# else /* generic: */
    return (int64_t)i32F1 * i32F2;
# endif
}
#endif


#if ARCH_BITS == 64
/**
 * Multiplies two unsigned 64-bit values, returning the 128-bit product as
 * two unsigned 64-bit halves.
 *
 * @returns The lower 64 bits of the product.
 * @param   u64F1       The first factor.
 * @param   u64F2       The second factor.
 * @param   pu64ProdHi  Where to store the upper 64 bits of the product.
 */
DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)
{
# if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE || RT_INLINE_ASM_USES_INTRIN)
#  if RT_INLINE_ASM_GNU_STYLE
    uint64_t u64Low, u64High;
    __asm__ __volatile__("mulq %%rdx"
                         : "=a" (u64Low), "=d" (u64High)
                         : "0" (u64F1), "1" (u64F2));
    *pu64ProdHi = u64High;
    return u64Low;
#  elif RT_INLINE_ASM_USES_INTRIN
    return _umul128(u64F1, u64F2, pu64ProdHi);
#  else
#   error "hmm"
#  endif
# else /* generic: */
    /*
     * Schoolbook long multiplication:
     *
     *      F1 * F2 = Prod
     *      --   --
     *      ab * cd = b*d + a*d*10 + b*c*10 + a*c*100
     *
     * where a, b, c and d are 'digits', and 10 is max digit + 1.
     *
     * Our digits are 32 bits wide, so instead of 10 we multiply by 4G.
     *      Prod = F1.s.Lo*F2.s.Lo     + F1.s.Hi*F2.s.Lo*4G
     *           + F1.s.Lo*F2.s.Hi*4G  + F1.s.Hi*F2.s.Hi*4G*4G
     */
    RTUINT128U  Prod;
    RTUINT64U   Tmp1;
    uint64_t    u64Tmp;
    RTUINT64U   F1, F2;
    F1.u = u64F1;
    F2.u = u64F2;

    Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);

    Tmp1.u = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    Prod.s.Hi = Tmp1.s.Hi;
    Prod.s.Hi += u64Tmp >> 32; /* carry */

    Tmp1.u = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    u64Tmp >>= 32; /* carry */
    u64Tmp += Prod.DWords.dw2;
    u64Tmp += Tmp1.s.Hi;
    Prod.DWords.dw2 = (uint32_t)u64Tmp;
    Prod.DWords.dw3 += u64Tmp >> 32; /* carry */

    Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);
    *pu64ProdHi = Prod.s.Hi;
    return Prod.s.Lo;
# endif
}
#endif
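
/*
 * Editorial usage sketch, not part of the original header: retrieving both
 * halves of the full 64 x 64 -> 128-bit product on a 64-bit build. The
 * expected values follow from (2^64 - 1)^2 = 2^128 - 2^65 + 1.
 */
#if 0 /* illustration only */
static void ExampleMult2xU64(void)
{
    uint64_t u64Hi;
    uint64_t u64Lo = ASMMult2xU64Ret2xU64(UINT64_MAX, UINT64_MAX, &u64Hi);
    /* u64Hi == 0xfffffffffffffffe, u64Lo == 0x0000000000000001 */
}
#endif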


/**
 * Divides a 64-bit unsigned value by a 32-bit unsigned value, returning an
 * unsigned 32-bit result.
 *
 * @returns u64 / u32.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (u32), "=d" (uDummy)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], eax
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 / u32);
# endif
}
#endif
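
/*
 * Editorial usage sketch, not part of the original header: on the x86
 * assembly path the DIV instruction raises a hardware exception (#DE) when
 * the divisor is zero or the quotient exceeds 32 bits, so callers must
 * guarantee both. The Assert macro is assumed to come from iprt/assert.h.
 */
#if 0 /* illustration only */
static uint32_t ExampleDivU64ByU32(uint64_t u64, uint32_t u32)
{
    Assert(u32 != 0);                   /* a zero divisor faults */
    Assert(u64 / u32 <= UINT32_MAX);    /* the quotient must fit in 32 bits */
    return ASMDivU64ByU32RetU32(u64, u32);
}
#endif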


/**
 * Divides a 64-bit signed value by a 32-bit signed value, returning a signed
 * 32-bit result.
 *
 * @returns i64 / i32.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (i32), "=d" (iDummy)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], eax
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 / i32);
# endif
}
#endif


/**
 * Performs a 64-bit unsigned by 32-bit unsigned division, returning the
 * 32-bit unsigned remainder.
 *
 * @returns u64 % u32.
 *
 * @remarks It is important that the quotient (u64 / u32) fits in 32 bits, or
 *          the hardware division will raise an exception and crash.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (uDummy), "=d" (u32)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], edx
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 % u32);
# endif
}
#endif
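
/*
 * Editorial usage sketch, not part of the original header: splitting a
 * nanosecond count into whole seconds and a sub-second remainder. The
 * quotient here is 10, which easily fits in 32 bits as the remark requires.
 */
#if 0 /* illustration only */
static void ExampleModU64ByU32(void)
{
    uint64_t cNanoSecs = UINT64_C(10000000123);
    uint32_t cSecs     = ASMDivU64ByU32RetU32(cNanoSecs, UINT32_C(1000000000)); /* 10 */
    uint32_t cNsRest   = ASMModU64ByU32RetU32(cNanoSecs, UINT32_C(1000000000)); /* 123 */
}
#endif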


/**
 * Performs a 64-bit signed by 32-bit signed division, returning the 32-bit
 * signed remainder.
 *
 * @returns i64 % i32.
 *
 * @remarks It is important that the quotient (i64 / i32) fits in 32 bits, or
 *          the hardware division will raise an exception and crash.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (iDummy), "=d" (i32)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], edx
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 % i32);
# endif
}
#endif


/**
 * Multiplies a 32-bit integer by a 32-bit integer and divides the result by
 * a 32-bit integer, using a 64-bit intermediate result.
 *
 * @returns (u32A * u32B) / u32C.
 * @param   u32A    The 32-bit value (A).
 * @param   u32B    The 32-bit value to multiply by A.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 32-bit, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit built-in
 *          arithmetic helper functions.
 */
#if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
    uint32_t u32Result, u32Spill;
    __asm__ __volatile__("mull %2\n\t"
                         "divl %3\n\t"
                         : "=&a" (u32Result),
                           "=&d" (u32Spill)
                         : "r" (u32B),
                           "r" (u32C),
                           "0" (u32A),
                           "1" (0));
    return u32Result;
# else
    return (uint32_t)(((uint64_t)u32A * u32B) / u32C);
# endif
}
#endif
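
/*
 * Editorial usage sketch, not part of the original header: rescaling a tick
 * count from one hypothetical frequency to another. 3000 ticks at 1000 Hz
 * equal 300 ticks at 100 Hz; the intermediate 3000 * 100 = 300000 is held in
 * 64 bits, and the result fits in 32 bits as the remarks require.
 */
#if 0 /* illustration only */
static void ExampleMultU32ByU32DivByU32(void)
{
    uint32_t cDstTicks = ASMMultU32ByU32DivByU32(3000 /* src ticks */,
                                                 100  /* dst Hz */,
                                                 1000 /* src Hz */); /* = 300 */
}
#endif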


/**
 * Multiplies a 64-bit integer by a 32-bit integer and divides the result by
 * a 32-bit integer, using a 96-bit intermediate result.
 *
 * @returns (u64A * u32B) / u32C.
 * @param   u64A    The 64-bit value.
 * @param   u32B    The 32-bit value to multiply by A.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 64-bit, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit built-in
 *          arithmetic helper functions.
 */
#if RT_INLINE_ASM_EXTERNAL || !defined(__GNUC__) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE
#  ifdef RT_ARCH_AMD64
    uint64_t u64Result, u64Spill;
    __asm__ __volatile__("mulq %2\n\t"
                         "divq %3\n\t"
                         : "=&a" (u64Result),
                           "=&d" (u64Spill)
                         : "r" ((uint64_t)u32B),
                           "r" ((uint64_t)u32C),
                           "0" (u64A),
                           "1" (0));
    return u64Result;
#  else
    uint32_t u32Dummy;
    uint64_t u64Result;
    __asm__ __volatile__("mull %%ecx       \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo
                                                    edx = u64Lo.hi = (u64A.lo * u32B).hi */
                         "xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo
                                                    eax = u64A.hi */
                         "xchg %%edx,%%edi \n\t" /* edi = u64Lo.hi
                                                    edx = u32C */
                         "xchg %%edx,%%ecx \n\t" /* ecx = u32C
                                                    edx = u32B */
                         "mull %%edx       \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo
                                                    edx = u64Hi.hi = (u64A.hi * u32B).hi */
                         "addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */
                         "adcl $0,%%edx    \n\t" /* u64Hi.hi += carry */
                         "divl %%ecx       \n\t" /* eax = u64Hi / u32C
                                                    edx = u64Hi % u32C */
                         "movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */
                         "movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */
                         "divl %%ecx       \n\t" /* u64Result.lo */
                         "movl %%edi,%%edx \n\t" /* u64Result.hi */
                         : "=A" (u64Result), "=c" (u32Dummy),
                           "=S" (u32Dummy), "=D" (u32Dummy)
                         : "a" ((uint32_t)u64A),
                           "S" ((uint32_t)(u64A >> 32)),
                           "c" (u32B),
                           "D" (u32C));
    return u64Result;
#  endif
# else
    RTUINT64U   u;
    uint64_t    u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B;
    uint64_t    u64Hi = (uint64_t)(u64A >> 32) * u32B;
    u64Hi += (u64Lo >> 32);
    u.s.Hi = (uint32_t)(u64Hi / u32C);
    u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C);
    return u.u;
# endif
}
#endif
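
/*
 * Editorial usage sketch, not part of the original header: converting a tick
 * count at a hypothetical 1 MHz frequency into nanoseconds. With 2^40 ticks
 * the intermediate product 2^40 * 10^9 no longer fits in 64 bits, which is
 * exactly what the 96-bit intermediate handles; the final result must still
 * fit in 64 bits, as the remarks require.
 */
#if 0 /* illustration only */
static void ExampleMultU64ByU32DivByU32(void)
{
    uint64_t cTicks    = UINT64_C(1) << 40;
    uint64_t cNanoSecs = ASMMultU64ByU32DivByU32(cTicks,
                                                 UINT32_C(1000000000) /* ns per second */,
                                                 UINT32_C(1000000)    /* ticks per second */);
    /* cNanoSecs == (1 << 40) * 1000 == 1099511627776000 */
}
#endif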

/** @} */
#endif