VirtualBox

source: vbox/trunk/include/iprt/asm-math.h@52455

Last change on this file since 52455 was 52455, checked in by vboxsync, 11 years ago

Updated ASMMultU32ByU32DivByU32 and ASMMultU64ByU32DivByU32 documentation. (Don't put unscoped implementation details in the function description, or at least not immediately below the function brief!)

/** @file
 * IPRT - Assembly Routines for Optimizing some Integer Math Operations.
 */

/*
 * Copyright (C) 2006-2010 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */
#ifndef ___iprt_asm_math_h
#define ___iprt_asm_math_h

#include <iprt/types.h>

#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
# include <intrin.h>
  /* Emit the intrinsics at all optimization levels. */
# pragma intrinsic(__emul)
# pragma intrinsic(__emulu)
# ifdef RT_ARCH_AMD64
#  pragma intrinsic(_mul128)
#  pragma intrinsic(_umul128)
# endif
#endif


/** @defgroup grp_rt_asm_math Integer Math Optimizations
 * @ingroup grp_rt_asm
 * @{ */

/**
 * Multiplies two unsigned 32-bit values returning an unsigned 64-bit result.
 *
 * @returns u32F1 * u32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
#else
DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
{
# ifdef RT_ARCH_X86
    uint64_t u64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("mull %%edx"
                         : "=A" (u64)
                         : "a" (u32F2), "d" (u32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    u64 = __emulu(u32F1, u32F2);
#  else
    __asm
    {
        mov     edx, [u32F1]
        mov     eax, [u32F2]
        mul     edx
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif
    return u64;
# else /* generic: */
    return (uint64_t)u32F1 * u32F2;
# endif
}
#endif
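
/*
 * Editorial usage sketch, not part of the original header: shows why the
 * widening multiply matters. A plain 32-bit multiply wraps modulo 2^32,
 * while ASMMult2xU32RetU64 keeps all 64 product bits. The function name and
 * values below are hypothetical illustrations.
 */
#if 0 /* illustration only */
static void ExampleMult2xU32(void)
{
    uint32_t u32A  = UINT32_MAX;                    /* 0xffffffff */
    uint64_t u64Ok = ASMMult2xU32RetU64(u32A, 2);   /* 0x00000001fffffffe */
    uint64_t u64No = (uint64_t)(u32A * 2U);         /* 0x00000000fffffffe - wrapped! */
}
#endif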


/**
 * Multiplies two signed 32-bit values returning a signed 64-bit result.
 *
 * @returns i32F1 * i32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
#else
DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
{
# ifdef RT_ARCH_X86
    int64_t i64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("imull %%edx"
                         : "=A" (i64)
                         : "a" (i32F2), "d" (i32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    i64 = __emul(i32F1, i32F2);
#  else
    __asm
    {
        mov     edx, [i32F1]
        mov     eax, [i32F2]
        imul    edx
        mov     dword ptr [i64], eax
        mov     dword ptr [i64 + 4], edx
    }
#  endif
    return i64;
# else /* generic: */
    return (int64_t)i32F1 * i32F2;
# endif
}
#endif


#if ARCH_BITS == 64
/**
 * Multiplies two unsigned 64-bit values, returning the 128-bit product as
 * two unsigned 64-bit halves.
 *
 * @returns The lower 64 bits of the product.
 * @param   u64F1       The first factor.
 * @param   u64F2       The second factor.
 * @param   pu64ProdHi  Where to store the upper 64 bits of the product.
 */
DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)
{
# if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE || RT_INLINE_ASM_USES_INTRIN)
#  if RT_INLINE_ASM_GNU_STYLE
    uint64_t u64Low, u64High;
    __asm__ __volatile__("mulq %%rdx"
                         : "=a" (u64Low), "=d" (u64High)
                         : "0" (u64F1), "1" (u64F2));
    *pu64ProdHi = u64High;
    return u64Low;
#  elif RT_INLINE_ASM_USES_INTRIN
    return _umul128(u64F1, u64F2, pu64ProdHi);
#  else
#   error "hmm"
#  endif
# else /* generic: */
    /*
     * Schoolbook long multiplication:
     *
     *      F1 * F2 = Prod
     *      --   --
     *      ab * cd = b*d + a*d*10 + b*c*10 + a*c*100
     *
     * where a, b, c and d are 'digits', and 10 is max digit + 1.
     *
     * Our digits are 32 bits wide, so instead of 10 we multiply by 4G.
     *      Prod = F1.s.Lo*F2.s.Lo     + F1.s.Hi*F2.s.Lo*4G
     *           + F1.s.Lo*F2.s.Hi*4G  + F1.s.Hi*F2.s.Hi*4G*4G
     */
    RTUINT128U  Prod;
    RTUINT64U   Tmp1;
    uint64_t    u64Tmp;
    RTUINT64U   F1, F2;
    F1.u = u64F1;
    F2.u = u64F2;

    Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);

    Tmp1.u = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    Prod.s.Hi = Tmp1.s.Hi;
    Prod.s.Hi += u64Tmp >> 32; /* carry */

    Tmp1.u = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    u64Tmp >>= 32; /* carry */
    u64Tmp += Prod.DWords.dw2;
    u64Tmp += Tmp1.s.Hi;
    Prod.DWords.dw2 = (uint32_t)u64Tmp;
    Prod.DWords.dw3 += u64Tmp >> 32; /* carry */

    Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);
    *pu64ProdHi = Prod.s.Hi;
    return Prod.s.Lo;
# endif
}
#endif
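
/*
 * Editorial usage sketch, not part of the original header: retrieving both
 * halves of the full 64 x 64 -> 128-bit product on a 64-bit build. The
 * expected values follow from (2^64 - 1)^2 = 2^128 - 2^65 + 1.
 */
#if 0 /* illustration only */
static void ExampleMult2xU64(void)
{
    uint64_t u64Hi;
    uint64_t u64Lo = ASMMult2xU64Ret2xU64(UINT64_MAX, UINT64_MAX, &u64Hi);
    /* u64Hi == 0xfffffffffffffffe, u64Lo == 0x0000000000000001 */
}
#endif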


/**
 * Divides a 64-bit unsigned value by a 32-bit unsigned value, returning an
 * unsigned 32-bit result.
 *
 * @returns u64 / u32.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (u32), "=d" (uDummy)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], eax
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 / u32);
# endif
}
#endif
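
/*
 * Editorial usage sketch, not part of the original header: on the x86
 * assembly path the DIV instruction raises a hardware exception (#DE) when
 * the divisor is zero or the quotient exceeds 32 bits, so callers must
 * guarantee both. The Assert macro is assumed to come from iprt/assert.h.
 */
#if 0 /* illustration only */
static uint32_t ExampleDivU64ByU32(uint64_t u64, uint32_t u32)
{
    Assert(u32 != 0);                   /* a zero divisor faults */
    Assert(u64 / u32 <= UINT32_MAX);    /* the quotient must fit in 32 bits */
    return ASMDivU64ByU32RetU32(u64, u32);
}
#endif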


/**
 * Divides a 64-bit signed value by a 32-bit signed value, returning a signed
 * 32-bit result.
 *
 * @returns i64 / i32.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (i32), "=d" (iDummy)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], eax
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 / i32);
# endif
}
#endif


/**
 * Performs a 64-bit unsigned by 32-bit unsigned division, returning the
 * 32-bit unsigned remainder.
 *
 * @returns u64 % u32.
 *
 * @remarks It is important that the quotient (u64 / u32) fits in 32 bits, or
 *          the hardware division will raise an exception and crash.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (uDummy), "=d" (u32)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], edx
    }
#  endif
    return u32;
# else /* generic: */
    return (uint32_t)(u64 % u32);
# endif
}
#endif
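
/*
 * Editorial usage sketch, not part of the original header: splitting a
 * nanosecond count into whole seconds and a sub-second remainder. The
 * quotient here is 10, which easily fits in 32 bits as the remark requires.
 */
#if 0 /* illustration only */
static void ExampleModU64ByU32(void)
{
    uint64_t cNanoSecs = UINT64_C(10000000123);
    uint32_t cSecs     = ASMDivU64ByU32RetU32(cNanoSecs, UINT32_C(1000000000)); /* 10 */
    uint32_t cNsRest   = ASMModU64ByU32RetU32(cNanoSecs, UINT32_C(1000000000)); /* 123 */
}
#endif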


/**
 * Performs a 64-bit signed by 32-bit signed division, returning the 32-bit
 * signed remainder.
 *
 * @returns i64 % i32.
 *
 * @remarks It is important that the quotient (i64 / i32) fits in 32 bits, or
 *          the hardware division will raise an exception and crash.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (iDummy), "=d" (i32)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], edx
    }
#  endif
    return i32;
# else /* generic: */
    return (int32_t)(i64 % i32);
# endif
}
#endif


/**
 * Multiplies a 32-bit integer by a 32-bit integer and divides the result by
 * a 32-bit integer, using a 64-bit intermediate result.
 *
 * @returns (u32A * u32B) / u32C.
 * @param   u32A    The 32-bit value (A).
 * @param   u32B    The 32-bit value to multiply by A.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 32-bit, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit built-in
 *          arithmetic helper functions.
 */
#if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
    uint32_t u32Result, u32Spill;
    __asm__ __volatile__("mull %2\n\t"
                         "divl %3\n\t"
                         : "=&a" (u32Result),
                           "=&d" (u32Spill)
                         : "r" (u32B),
                           "r" (u32C),
                           "0" (u32A),
                           "1" (0));
    return u32Result;
# else
    return (uint32_t)(((uint64_t)u32A * u32B) / u32C);
# endif
}
#endif
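
/*
 * Editorial usage sketch, not part of the original header: rescaling a tick
 * count from one hypothetical frequency to another. 3000 ticks at 1000 Hz
 * equal 300 ticks at 100 Hz; the intermediate 3000 * 100 = 300000 is held in
 * 64 bits, and the result fits in 32 bits as the remarks require.
 */
#if 0 /* illustration only */
static void ExampleMultU32ByU32DivByU32(void)
{
    uint32_t cDstTicks = ASMMultU32ByU32DivByU32(3000 /* src ticks */,
                                                 100  /* dst Hz */,
                                                 1000 /* src Hz */); /* = 300 */
}
#endif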


/**
 * Multiplies a 64-bit integer by a 32-bit integer and divides the result by
 * a 32-bit integer, using a 96-bit intermediate result.
 *
 * @returns (u64A * u32B) / u32C.
 * @param   u64A    The 64-bit value.
 * @param   u32B    The 32-bit value to multiply by A.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 64-bit, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit built-in
 *          arithmetic helper functions.
 */
#if RT_INLINE_ASM_EXTERNAL || !defined(__GNUC__) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE
#  ifdef RT_ARCH_AMD64
    uint64_t u64Result, u64Spill;
    __asm__ __volatile__("mulq %2\n\t"
                         "divq %3\n\t"
                         : "=&a" (u64Result),
                           "=&d" (u64Spill)
                         : "r" ((uint64_t)u32B),
                           "r" ((uint64_t)u32C),
                           "0" (u64A),
                           "1" (0));
    return u64Result;
#  else
    uint32_t u32Dummy;
    uint64_t u64Result;
    __asm__ __volatile__("mull %%ecx       \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo
                                                    edx = u64Lo.hi = (u64A.lo * u32B).hi */
                         "xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo
                                                    eax = u64A.hi */
                         "xchg %%edx,%%edi \n\t" /* edi = u64Lo.hi
                                                    edx = u32C */
                         "xchg %%edx,%%ecx \n\t" /* ecx = u32C
                                                    edx = u32B */
                         "mull %%edx       \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo
                                                    edx = u64Hi.hi = (u64A.hi * u32B).hi */
                         "addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */
                         "adcl $0,%%edx    \n\t" /* u64Hi.hi += carry */
                         "divl %%ecx       \n\t" /* eax = u64Hi / u32C
                                                    edx = u64Hi % u32C */
                         "movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */
                         "movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */
                         "divl %%ecx       \n\t" /* u64Result.lo */
                         "movl %%edi,%%edx \n\t" /* u64Result.hi */
                         : "=A" (u64Result), "=c" (u32Dummy),
                           "=S" (u32Dummy), "=D" (u32Dummy)
                         : "a" ((uint32_t)u64A),
                           "S" ((uint32_t)(u64A >> 32)),
                           "c" (u32B),
                           "D" (u32C));
    return u64Result;
#  endif
# else
    RTUINT64U   u;
    uint64_t    u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B;
    uint64_t    u64Hi = (uint64_t)(u64A >> 32) * u32B;
    u64Hi += (u64Lo >> 32);
    u.s.Hi = (uint32_t)(u64Hi / u32C);
    u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C);
    return u.u;
# endif
}
#endif
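
/*
 * Editorial usage sketch, not part of the original header: converting a tick
 * count at a hypothetical 1 MHz frequency into nanoseconds. With 2^40 ticks
 * the intermediate product 2^40 * 10^9 no longer fits in 64 bits, which is
 * exactly what the 96-bit intermediate handles; the final result must still
 * fit in 64 bits, as the remarks require.
 */
#if 0 /* illustration only */
static void ExampleMultU64ByU32DivByU32(void)
{
    uint64_t cTicks    = UINT64_C(1) << 40;
    uint64_t cNanoSecs = ASMMultU64ByU32DivByU32(cTicks,
                                                 UINT32_C(1000000000) /* ns per second */,
                                                 UINT32_C(1000000)    /* ticks per second */);
    /* cNanoSecs == (1 << 40) * 1000 == 1099511627776000 */
}
#endif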

/** @} */
#endif