strcache2.c@ 1943

Last change on this file since 1943 was 1943, checked in by bird, 17 years ago
strcache2.c: fixed the length check.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 38.3 KB

Line
1	/* $Id: strcache2.c 1943 2008-10-26 02:19:06Z bird $ */
2	/** @file
3	* strcache2 - New string cache.
4	*/
5
6	/*
7	* Copyright (c) 2008 knut st. osmundsen <bird-src-spam@anduin.net>
8	*
9	* This file is part of kBuild.
10	*
11	* kBuild is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* kBuild is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with kBuild; if not, write to the Free Software
23	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24	*
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include "make.h"
32	#include "strcache2.h"
33
34	#include <assert.h>
35
36	#include "debug.h"
37
38	#ifdef _MSC_VER
39	typedef unsigned char uint8_t;
40	typedef unsigned short uint16_t;
41	typedef unsigned int uint32_t;
42	typedef signed char int8_t;
43	typedef signed short int16_t;
44	typedef signed int int32_t;
45	#else
46	# include <stdint.h>
47	#endif
48
49	#ifdef WINDOWS32
50	# include <io.h>
51	# include <process.h>
52	# include <Windows.h>
53	# define PARSE_IN_WORKER
54	#endif
55
56	#ifdef __OS2__
57	# include <sys/fmutex.h>
58	#endif
59
60	#ifdef HAVE_PTHREAD
61	# include <pthread.h>
62	#endif
63
64
65	/*******************************************************************************
66	* Defined Constants And Macros *
67	*******************************************************************************/
68	/* The default size of a memory segment (1MB). */
69	#define STRCACHE2_SEG_SIZE (1024U*1024U)
70	/* The default hash table shift (hash size give as a power of two). */
71	#define STRCACHE2_HASH_SHIFT 16
72	/** Does the modding / masking of a hash number into an index. */
73	#ifdef STRCACHE2_USE_MASK
74	# define STRCACHE2_MOD_IT(cache, hash) ((hash) & (cache)->hash_mask)
75	#else
76	# define STRCACHE2_MOD_IT(cache, hash) ((hash) % (cache)->hash_div)
77	#endif
78
79	# if defined(__amd64__) \|\| defined(__x86_64__) \|\| defined(__AMD64__) \|\| defined(_M_X64) \|\| defined(__amd64) \
80	\|\| defined(__i386__) \|\| defined(__x86__) \|\| defined(__X86__) \|\| defined(_M_IX86) \|\| defined(__i386)
81	# define strcache2_get_unaligned_16bits(ptr) ( ((const uint16_t )(ptr)))
82	# else
83	/* (endian doesn't matter) */
84	# define strcache2_get_unaligned_16bits(ptr) ( (((const uint8_t *)(ptr))[0] << 8) \
85	\| (((const uint8_t *)(ptr))[1]) )
86	# endif
87
88
89	/*******************************************************************************
90	* Global Variables *
91	*******************************************************************************/
92	/* List of initialized string caches. */
93	static struct strcache2 *strcache_head;
94
95
96	/** Finds the closest primary number for power of two value (or something else
97	* useful if not support). */
98	MY_INLINE unsigned int strcache2_find_prime(unsigned int shift)
99	{
100	switch (shift)
101	{
102	case 5: return 31;
103	case 6: return 61;
104	case 7: return 127;
105	case 8: return 251;
106	case 9: return 509;
107	case 10: return 1021;
108	case 11: return 2039;
109	case 12: return 4093;
110	case 13: return 8191;
111	case 14: return 16381;
112	case 15: return 32749;
113	case 16: return 65521;
114	case 17: return 131063;
115	case 18: return 262139;
116	case 19: return 524269;
117	case 20: return 1048573;
118	case 21: return 2097143;
119	case 22: return 4194301;
120	case 23: return 8388593;
121
122	default:
123	assert (0);
124	return (1 << shift) - 1;
125	}
126	}
127
128	/* The following is a bit experiment. It produces longer chains, i.e. worse
129	distribution of the strings in the table, however the actual make
130	performances is better (<time). The explanation is probably that the
131	collisions only really increase for entries that aren't looked up that
132	much and that it actually improoves the situation for those that is. Or
133	that we spend so much less time hashing that it makes up (and more) for
134	the pentalty we suffer from the longer chains and worse distribution.
135
136	XXX: Check how this works out with different path lengths. I suspect it
137	might depend on the length of PATH_ROOT and the depth of the files
138	in the project as well. If it does, this might make matters worse
139	for some and better for others which isn't very cool... */
140
141	#if 0
142	# define BIG_HASH_SIZE 32 /* kinda fast */
143	# define BIG_HASH_HEAD 16
144	# define BIG_HASH_TAIL 12
145	#elif 0
146	# define BIG_HASH_SIZE 68 /* kinda safe */
147	# define BIG_HASH_HEAD 24
148	# define BIG_HASH_TAIL 24
149	#elif 0
150	# define BIG_HASH_SIZE 128 /* safe */
151	# define BIG_HASH_HEAD 32
152	# define BIG_HASH_TAIL 32
153	#endif
154
155	#ifdef BIG_HASH_SIZE
156	/* long string: hash head and tail, drop the middle. */
157	MY_INLINE unsigned int
158	strcache2_case_sensitive_hash_big (const char *str, unsigned int len)
159	{
160	uint32_t hash = len;
161	uint32_t tmp;
162	unsigned int head;
163
164	/* head BIG_HASH_HEAD bytes */
165	head = (BIG_HASH_HEAD >> 2);
166	while (head-- > 0)
167	{
168	hash += strcache2_get_unaligned_16bits (str);
169	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
170	hash = (hash << 16) ^ tmp;
171	str += 2 * sizeof (uint16_t);
172	hash += hash >> 11;
173	}
174
175	/* tail BIG_HASH_TAIL bytes (minus the odd ones) */
176	str += (len - BIG_HASH_HEAD - BIG_HASH_TAIL) & ~3U;
177	head = (BIG_HASH_TAIL >> 2);
178	while (head-- > 0)
179	{
180	hash += strcache2_get_unaligned_16bits (str);
181	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
182	hash = (hash << 16) ^ tmp;
183	str += 2 * sizeof (uint16_t);
184	hash += hash >> 11;
185	}
186
187	/* force "avalanching" of final 127 bits. */
188	hash ^= hash << 3;
189	hash += hash >> 5;
190	hash ^= hash << 4;
191	hash += hash >> 17;
192	hash ^= hash << 25;
193	hash += hash >> 6;
194
195	return hash;
196	}
197	#endif /* BIG_HASH_SIZE */
198
199	MY_INLINE unsigned int
200	strcache2_case_sensitive_hash (const char *str, unsigned int len)
201	{
202	#if 1
203	/* Paul Hsieh hash SuperFast function:
204	http://www.azillionmonkeys.com/qed/hash.html
205
206	This performs very good and as a sligtly better distribution than
207	STRING_N_HASH_1 on a typical kBuild run.
208
209	It is also 37% faster than return_STRING_N_HASH_1 when running the
210	two 100 times over typical kBuild strings that end up here (did a
211	fprintf here and built kBuild). Compiler was 32-bit gcc 4.0.1, darwin,
212	with -O2.
213
214	FIXME: A path for well aligned data should be added to speed up
215	execution on alignment sensitive systems. */
216	unsigned int rem;
217	uint32_t hash;
218	uint32_t tmp;
219
220	assert (sizeof (uint8_t) == sizeof (char));
221
222	# ifdef BIG_HASH_SIZE
223	/* long string? */
224	# if 0 /BIG_HASH_SIZE > 128/
225	if (MY_PREDICT_FALSE(len >= BIG_HASH_SIZE))
226	# else
227	if (len >= BIG_HASH_SIZE)
228	# endif
229	return strcache2_case_sensitive_hash_big (str, len);
230	# endif
231
232	/* short string: main loop, walking on 2 x uint16_t */
233	hash = len;
234	rem = len & 3;
235	len >>= 2;
236	while (len > 0)
237	{
238	hash += strcache2_get_unaligned_16bits (str);
239	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
240	hash = (hash << 16) ^ tmp;
241	str += 2 * sizeof (uint16_t);
242	hash += hash >> 11;
243	len--;
244	}
245
246	/* the remainder */
247	switch (rem)
248	{
249	case 3:
250	hash += strcache2_get_unaligned_16bits (str);
251	hash ^= hash << 16;
252	hash ^= str[sizeof (uint16_t)] << 18;
253	hash += hash >> 11;
254	break;
255	case 2:
256	hash += strcache2_get_unaligned_16bits (str);
257	hash ^= hash << 11;
258	hash += hash >> 17;
259	break;
260	case 1:
261	hash += *str;
262	hash ^= hash << 10;
263	hash += hash >> 1;
264	break;
265	}
266
267	/* force "avalanching" of final 127 bits. */
268	hash ^= hash << 3;
269	hash += hash >> 5;
270	hash ^= hash << 4;
271	hash += hash >> 17;
272	hash ^= hash << 25;
273	hash += hash >> 6;
274
275	return hash;
276
277	#elif 1
278	/* Note! This implementation is 18% faster than return_STRING_N_HASH_1
279	when running the two 100 times over typical kBuild strings that
280	end up here (did a fprintf here and built kBuild).
281	Compiler was 32-bit gcc 4.0.1, darwin, with -O2. */
282
283	unsigned int hash = 0;
284	if (MY_PREDICT_TRUE(len >= 2))
285	{
286	unsigned int ch0 = *str++;
287	hash = 0;
288	len--;
289	while (len >= 2)
290	{
291	unsigned int ch1 = *str++;
292	hash += ch0 << (ch1 & 0xf);
293
294	ch0 = *str++;
295	hash += ch1 << (ch0 & 0xf);
296
297	len -= 2;
298	}
299	if (len == 1)
300	{
301	unsigned ch1 = *str;
302	hash += ch0 << (ch1 & 0xf);
303
304	hash += ch1;
305	}
306	else
307	hash += ch0;
308	}
309	else if (len)
310	{
311	hash = *str;
312	hash += hash << (hash & 0xf);
313	}
314	else
315	hash = 0;
316	return hash;
317
318	#elif 1
319	# if 0
320	/* This is SDBM. This specific form/unroll was benchmarked to be 28% faster
321	than return_STRING_N_HASH_1. (Now the weird thing is that putting the (ch)
322	first in the assignment made it noticably slower.)
323
324	However, it is noticably slower in practice, most likely because of more
325	collisions. Hrmpf. */
326
327	# define UPDATE_HASH(ch) hash = (hash << 6) + (hash << 16) - hash + (ch)
328	unsigned int hash = 0;
329
330	# else
331	/* This is DJB2. This specific form/unroll was benchmarked to be 27%
332	fast than return_STRING_N_HASH_1.
333
334	Ditto. */
335
336	# define UPDATE_HASH(ch) hash = (hash << 5) + hash + (ch)
337	unsigned int hash = 5381;
338	# endif
339
340
341	while (len >= 4)
342	{
343	UPDATE_HASH (str[0]);
344	UPDATE_HASH (str[1]);
345	UPDATE_HASH (str[2]);
346	UPDATE_HASH (str[3]);
347	str += 4;
348	len -= 4;
349	}
350	switch (len)
351	{
352	default:
353	case 0:
354	return hash;
355	case 1:
356	UPDATE_HASH (str[0]);
357	return hash;
358	case 2:
359	UPDATE_HASH (str[0]);
360	UPDATE_HASH (str[1]);
361	return hash;
362	case 3:
363	UPDATE_HASH (str[0]);
364	UPDATE_HASH (str[1]);
365	UPDATE_HASH (str[2]);
366	return hash;
367	}
368	#endif
369	}
370
371	MY_INLINE unsigned int
372	strcache2_case_insensitive_hash (const char *str, unsigned int len)
373	{
374	unsigned int hash = 0;
375	if (MY_PREDICT_TRUE(len >= 2))
376	{
377	unsigned int ch0 = *str++;
378	ch0 = tolower (ch0);
379	hash = 0;
380	len--;
381	while (len >= 2)
382	{
383	unsigned int ch1 = *str++;
384	ch1 = tolower (ch1);
385	hash += ch0 << (ch1 & 0xf);
386
387	ch0 = *str++;
388	ch0 = tolower (ch0);
389	hash += ch1 << (ch0 & 0xf);
390
391	len -= 2;
392	}
393	if (len == 1)
394	{
395	unsigned ch1 = *str;
396	ch1 = tolower (ch1);
397	hash += ch0 << (ch1 & 0xf);
398
399	hash += ch1;
400	}
401	else
402	hash += ch0;
403	}
404	else if (len)
405	{
406	hash = *str;
407	hash += hash << (hash & 0xf);
408	}
409	else
410	hash = 0;
411	return hash;
412	}
413
414	MY_INLINE int
415	strcache2_memcmp_inline_short (const char xs, const char ys, unsigned int length)
416	{
417	if (length <= 8)
418	{
419	/* short string compare - ~50% of the kBuild calls. */
420	assert ( !((size_t)ys & 3) );
421	if (!((size_t)xs & 3))
422	{
423	/* aligned */
424	int result;
425	switch (length)
426	{
427	default: /* memcmp for longer strings */
428	return memcmp (xs, ys, length);
429	case 8:
430	result = (int32_t)(xs + 4) - (int32_t)(ys + 4);
431	result \|= (int32_t)xs - (int32_t)ys;
432	return result;
433	case 7:
434	result = xs[6] - ys[6];
435	result \|= xs[5] - ys[5];
436	result \|= xs[4] - ys[4];
437	result \|= (int32_t)xs - (int32_t)ys;
438	return result;
439	case 6:
440	result = xs[5] - ys[5];
441	result \|= xs[4] - ys[4];
442	result \|= (int32_t)xs - (int32_t)ys;
443	return result;
444	case 5:
445	result = xs[4] - ys[4];
446	result \|= (int32_t)xs - (int32_t)ys;
447	return result;
448	case 4:
449	return (int32_t)xs - (int32_t)ys;
450	case 3:
451	result = xs[2] - ys[2];
452	result \|= xs[1] - ys[1];
453	result \|= xs[0] - ys[0];
454	return result;
455	case 2:
456	result = xs[1] - ys[1];
457	result \|= xs[0] - ys[0];
458	return result;
459	case 1:
460	return xs - ys;
461	case 0:
462	return 0;
463	}
464	}
465	else
466	{
467	/* unaligned */
468	int result = 0;
469	switch (length)
470	{
471	case 8: result \|= xs[7] - ys[7];
472	case 7: result \|= xs[6] - ys[6];
473	case 6: result \|= xs[5] - ys[5];
474	case 5: result \|= xs[4] - ys[4];
475	case 4: result \|= xs[3] - ys[3];
476	case 3: result \|= xs[2] - ys[2];
477	case 2: result \|= xs[1] - ys[1];
478	case 1: result \|= xs[0] - ys[0];
479	case 0:
480	return result;
481	}
482	}
483	}
484
485	/* memcmp for longer strings */
486	return memcmp (xs, ys, length);
487	}
488
489	MY_INLINE int
490	strcache2_memcmp_inlined (const char xs, const char ys, unsigned int length)
491	{
492	#ifndef ELECTRIC_HEAP
493	assert ( !((size_t)ys & 3) );
494	#endif
495	if (!((size_t)xs & 3))
496	{
497	int result;
498	/* aligned */
499	while (length >= 8)
500	{
501	result = (int32_t)xs - (int32_t)ys;
502	result \|= (int32_t)(xs + 4) - (int32_t)(ys + 4);
503	if (MY_PREDICT_FALSE(result))
504	return result;
505	xs += 8;
506	ys += 8;
507	length -= 8;
508	}
509	switch (length)
510	{
511	case 7:
512	result = (int32_t)xs - (int32_t)ys;
513	result \|= xs[6] - ys[6];
514	result \|= xs[5] - ys[5];
515	result \|= xs[4] - ys[4];
516	return result;
517	case 6:
518	result = (int32_t)xs - (int32_t)ys;
519	result \|= xs[5] - ys[5];
520	result \|= xs[4] - ys[4];
521	return result;
522	case 5:
523	result = (int32_t)xs - (int32_t)ys;
524	result \|= xs[4] - ys[4];
525	return result;
526	case 4:
527	return (int32_t)xs - (int32_t)ys;
528	case 3:
529	result = xs[2] - ys[2];
530	result \|= xs[1] - ys[1];
531	result \|= xs[0] - ys[0];
532	return result;
533	case 2:
534	result = xs[1] - ys[1];
535	result \|= xs[0] - ys[0];
536	return result;
537	case 1:
538	return xs - ys;
539	default:
540	case 0:
541	return 0;
542	}
543	}
544	else
545	{
546	/* unaligned */
547	int result;
548	while (length >= 8)
549	{
550	#if defined(__i386__) \|\| defined(__x86_64__)
551	result = ( ((int32_t)xs[3] << 24)
552	\| ((int32_t)xs[2] << 16)
553	\| ((int32_t)xs[1] << 8)
554	\| xs[0] )
555	- (int32_t)ys;
556	result \|= ( ((int32_t)xs[7] << 24)
557	\| ((int32_t)xs[6] << 16)
558	\| ((int32_t)xs[5] << 8)
559	\| xs[4] )
560	- (int32_t)(ys + 4);
561	#else
562	result = xs[3] - ys[3];
563	result \|= xs[2] - ys[2];
564	result \|= xs[1] - ys[1];
565	result \|= xs[0] - ys[0];
566	result \|= xs[7] - ys[7];
567	result \|= xs[6] - ys[6];
568	result \|= xs[5] - ys[5];
569	result \|= xs[4] - ys[4];
570	#endif
571	if (MY_PREDICT_FALSE(result))
572	return result;
573	xs += 8;
574	ys += 8;
575	length -= 8;
576	}
577	result = 0;
578	switch (length)
579	{
580	case 7: result \|= xs[6] - ys[6];
581	case 6: result \|= xs[5] - ys[5];
582	case 5: result \|= xs[4] - ys[4];
583	case 4: result \|= xs[3] - ys[3];
584	case 3: result \|= xs[2] - ys[2];
585	case 2: result \|= xs[1] - ys[1];
586	case 1: result \|= xs[0] - ys[0];
587	return result;
588	default:
589	case 0:
590	return 0;
591	}
592	}
593	}
594
595	MY_INLINE int
596	strcache2_is_equal (struct strcache2 cache, struct strcache2_entry const entry,
597	const char *str, unsigned int length, unsigned int hash)
598	{
599	assert (!cache->case_insensitive);
600
601	/* the simple stuff first. */
602	if ( entry->hash != hash
603	\|\| entry->length != length)
604	return 0;
605
606	#if 0
607	return memcmp (str, entry + 1, length) == 0;
608	#elif 1
609	return strcache2_memcmp_inlined (str, (const char *)(entry + 1), length) == 0;
610	#else
611	return strcache2_memcmp_inline_short (str, (const char *)(entry + 1), length) == 0;
612	#endif
613	}
614
615	MY_INLINE int
616	strcache2_is_iequal (struct strcache2 cache, struct strcache2_entry const entry,
617	const char *str, unsigned int length, unsigned int hash)
618	{
619	assert (cache->case_insensitive);
620
621	/* the simple stuff first. */
622	if ( entry->hash != hash
623	\|\| entry->length != length)
624	return 0;
625
626	#if defined(_MSC_VER) \|\| defined(__OS2__)
627	return _memicmp (entry + 1, str, length) == 0;
628	#else
629	return strncasecmp ((const char *)(entry + 1), str, length) == 0;
630	#endif
631	}
632
633	static void
634	strcache2_rehash (struct strcache2 *cache)
635	{
636	unsigned int src = cache->hash_size;
637	struct strcache2_entry **src_tab = cache->hash_tab;
638	struct strcache2_entry **dst_tab;
639	#ifndef STRCACHE2_USE_MASK
640	unsigned int hash_shift;
641	#endif
642
643	/* Allocate a new hash table twice the size of the current. */
644	cache->hash_size <<= 1;
645	#ifdef STRCACHE2_USE_MASK
646	cache->hash_mask <<= 1;
647	cache->hash_mask \|= 1;
648	#else
649	for (hash_shift = 1; (1U << hash_shift) < cache->hash_size; hash_shift++)
650	/* nothing */;
651	cache->hash_div = strcache2_find_prime (hash_shift);
652	#endif
653	cache->rehash_count <<= 1;
654	cache->hash_tab = dst_tab = (struct strcache2_entry **)
655	xmalloc (cache->hash_size * sizeof (struct strcache2_entry *));
656	memset (dst_tab, '\0', cache->hash_size * sizeof (struct strcache2_entry *));
657
658	/* Copy the entries from the old to the new hash table. */
659	cache->collision_count = 0;
660	while (src-- > 0)
661	{
662	struct strcache2_entry *entry = src_tab[src];
663	while (entry)
664	{
665	struct strcache2_entry *next = entry->next;
666	unsigned int dst = STRCACHE2_MOD_IT (cache, entry->hash);
667	if ((entry->next = dst_tab[dst]) != 0)
668	cache->collision_count++;
669	dst_tab[dst] = entry;
670
671	entry = next;
672	}
673	}
674
675	/* That's it, just free the old table and we're done. */
676	free (src_tab);
677	}
678
679	static struct strcache2_seg *
680	strcache2_new_seg (struct strcache2 *cache, unsigned int minlen)
681	{
682	struct strcache2_seg *seg;
683	size_t size;
684	size_t off;
685
686	size = cache->def_seg_size;
687	if (size < (size_t)minlen + sizeof (struct strcache2_seg) + STRCACHE2_ENTRY_ALIGNMENT)
688	{
689	size = (size_t)minlen * 2;
690	size = (size + 0xfff) & ~(size_t)0xfff;
691	}
692
693	seg = xmalloc (size);
694	seg->start = (char *)(seg + 1);
695	seg->size = size - sizeof (struct strcache2_seg);
696	off = (size_t)seg->start & (STRCACHE2_ENTRY_ALIGNMENT - 1);
697	if (off)
698	{
699	off = STRCACHE2_ENTRY_ALIGNMENT - off;
700	seg->start += off;
701	seg->size -= off;
702	}
703	assert (seg->size > minlen);
704	seg->cursor = seg->start;
705	seg->avail = seg->size;
706
707	seg->next = cache->seg_head;
708	cache->seg_head = seg;
709
710	return seg;
711	}
712
713	/* Internal worker that enters a new string into the cache. */
714	static const char *
715	strcache2_enter_string (struct strcache2 *cache, unsigned int idx,
716	const char *str, unsigned int length,
717	unsigned int hash)
718	{
719	struct strcache2_entry *entry;
720	struct strcache2_seg *seg;
721	unsigned int size;
722	char *str_copy;
723
724	/* Allocate space for the string. */
725
726	size = length + 1 + sizeof (struct strcache2_entry);
727	size = (size + STRCACHE2_ENTRY_ALIGNMENT - 1) & ~(STRCACHE2_ENTRY_ALIGNMENT - 1U);
728
729	seg = cache->seg_head;
730	if (MY_PREDICT_FALSE(seg->avail < size))
731	{
732	do
733	seg = seg->next;
734	while (seg && seg->avail < size);
735	if (!seg)
736	seg = strcache2_new_seg (cache, size);
737	}
738
739	entry = (struct strcache2_entry *) seg->cursor;
740	assert (!((size_t)entry & (STRCACHE2_ENTRY_ALIGNMENT - 1)));
741	seg->cursor += size;
742	seg->avail -= size;
743
744	/* Setup the entry, copy the string and insert it into the hash table. */
745
746	entry->user = NULL;
747	entry->length = length;
748	entry->hash = hash;
749	str_copy = (char *) memcpy (entry + 1, str, length);
750	str_copy[length] = '\0';
751
752	if ((entry->next = cache->hash_tab[idx]) != 0)
753	cache->collision_count++;
754	cache->hash_tab[idx] = entry;
755	cache->count++;
756	if (cache->count >= cache->rehash_count)
757	strcache2_rehash (cache);
758
759	return str_copy;
760	}
761
762	/* The public add string interface. */
763	const char *
764	strcache2_add (struct strcache2 cache, const char str, unsigned int length)
765	{
766	struct strcache2_entry const *entry;
767	unsigned int hash = strcache2_case_sensitive_hash (str, length);
768	unsigned int idx;
769
770	assert (!cache->case_insensitive);
771
772	MAKE_STATS (cache->lookup_count++);
773
774	/* Lookup the entry in the hash table, hoping for an
775	early match. If not found, enter the string at IDX. */
776	idx = STRCACHE2_MOD_IT (cache, hash);
777	entry = cache->hash_tab[idx];
778	if (!entry)
779	return strcache2_enter_string (cache, idx, str, length, hash);
780	if (strcache2_is_equal (cache, entry, str, length, hash))
781	return (const char *)(entry + 1);
782	MAKE_STATS (cache->collision_1st_count++);
783
784	entry = entry->next;
785	if (!entry)
786	return strcache2_enter_string (cache, idx, str, length, hash);
787	if (strcache2_is_equal (cache, entry, str, length, hash))
788	return (const char *)(entry + 1);
789	MAKE_STATS (cache->collision_2nd_count++);
790
791	/* Loop the rest. */
792	for (;;)
793	{
794	entry = entry->next;
795	if (!entry)
796	return strcache2_enter_string (cache, idx, str, length, hash);
797	if (strcache2_is_equal (cache, entry, str, length, hash))
798	return (const char *)(entry + 1);
799	MAKE_STATS (cache->collision_3rd_count++);
800	}
801	/* not reached */
802	}
803
804	/* The public add string interface for prehashed strings.
805	Use strcache2_hash_str to calculate the hash of a string. */
806	const char *
807	strcache2_add_hashed (struct strcache2 cache, const char str,
808	unsigned int length, unsigned int hash)
809	{
810	struct strcache2_entry const *entry;
811	unsigned int idx;
812	#ifndef NDEBUG
813	unsigned correct_hash;
814
815	assert (!cache->case_insensitive);
816	correct_hash = strcache2_case_sensitive_hash (str, length);
817	MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
818	#endif /* NDEBUG */
819
820	MAKE_STATS (cache->lookup_count++);
821
822	/* Lookup the entry in the hash table, hoping for an
823	early match. If not found, enter the string at IDX. */
824	idx = STRCACHE2_MOD_IT (cache, hash);
825	entry = cache->hash_tab[idx];
826	if (!entry)
827	return strcache2_enter_string (cache, idx, str, length, hash);
828	if (strcache2_is_equal (cache, entry, str, length, hash))
829	return (const char *)(entry + 1);
830	MAKE_STATS (cache->collision_1st_count++);
831
832	entry = entry->next;
833	if (!entry)
834	return strcache2_enter_string (cache, idx, str, length, hash);
835	if (strcache2_is_equal (cache, entry, str, length, hash))
836	return (const char *)(entry + 1);
837	MAKE_STATS (cache->collision_2nd_count++);
838
839	/* Loop the rest. */
840	for (;;)
841	{
842	entry = entry->next;
843	if (!entry)
844	return strcache2_enter_string (cache, idx, str, length, hash);
845	if (strcache2_is_equal (cache, entry, str, length, hash))
846	return (const char *)(entry + 1);
847	MAKE_STATS (cache->collision_3rd_count++);
848	}
849	/* not reached */
850	}
851
852	/* The public lookup (case sensitive) string interface. */
853	const char *
854	strcache2_lookup (struct strcache2 cache, const char str, unsigned int length)
855	{
856	struct strcache2_entry const *entry;
857	unsigned int hash = strcache2_case_sensitive_hash (str, length);
858	unsigned int idx;
859
860	assert (!cache->case_insensitive);
861
862	MAKE_STATS (cache->lookup_count++);
863
864	/* Lookup the entry in the hash table, hoping for an
865	early match. */
866	idx = STRCACHE2_MOD_IT (cache, hash);
867	entry = cache->hash_tab[idx];
868	if (!entry)
869	return NULL;
870	if (strcache2_is_equal (cache, entry, str, length, hash))
871	return (const char *)(entry + 1);
872	MAKE_STATS (cache->collision_1st_count++);
873
874	entry = entry->next;
875	if (!entry)
876	return NULL;
877	if (strcache2_is_equal (cache, entry, str, length, hash))
878	return (const char *)(entry + 1);
879	MAKE_STATS (cache->collision_2nd_count++);
880
881	/* Loop the rest. */
882	for (;;)
883	{
884	entry = entry->next;
885	if (!entry)
886	return NULL;
887	if (strcache2_is_equal (cache, entry, str, length, hash))
888	return (const char *)(entry + 1);
889	MAKE_STATS (cache->collision_3rd_count++);
890	}
891	/* not reached */
892	}
893
894	#if defined(HAVE_CASE_INSENSITIVE_FS)
895
896	/* The public add string interface for case insensitive strings. */
897	const char *
898	strcache2_iadd (struct strcache2 cache, const char str, unsigned int length)
899	{
900	struct strcache2_entry const *entry;
901	unsigned int hash = strcache2_case_insensitive_hash (str, length);
902	unsigned int idx;
903
904	assert (!cache->case_insensitive);
905
906	MAKE_STATS (cache->lookup_count++);
907
908	/* Lookup the entry in the hash table, hoping for an
909	early match. If not found, enter the string at IDX. */
910	idx = STRCACHE2_MOD_IT (cache, hash);
911	entry = cache->hash_tab[idx];
912	if (!entry)
913	return strcache2_enter_string (cache, idx, str, length, hash);
914	if (strcache2_is_equal (cache, entry, str, length, hash))
915	return (const char *)(entry + 1);
916	MAKE_STATS (cache->collision_1st_count++);
917
918	entry = entry->next;
919	if (!entry)
920	return strcache2_enter_string (cache, idx, str, length, hash);
921	if (strcache2_is_equal (cache, entry, str, length, hash))
922	return (const char *)(entry + 1);
923	MAKE_STATS (cache->collision_2nd_count++);
924
925	/* Loop the rest. */
926	for (;;)
927	{
928	entry = entry->next;
929	if (!entry)
930	return strcache2_enter_string (cache, idx, str, length, hash);
931	if (strcache2_is_equal (cache, entry, str, length, hash))
932	return (const char *)(entry + 1);
933	MAKE_STATS (cache->collision_3rd_count++);
934	}
935	/* not reached */
936	}
937
938	/* The public add string interface for prehashed case insensitive strings.
939	Use strcache2_hash_istr to calculate the hash of a string. */
940	const char *
941	strcache2_iadd_hashed (struct strcache2 cache, const char str,
942	unsigned int length, unsigned int hash)
943	{
944	struct strcache2_entry const *entry;
945	unsigned int idx;
946	#ifndef NDEBUG
947	unsigned correct_hash;
948
949	assert (!cache->case_insensitive);
950	correct_hash = strcache2_case_insensitive_hash (str, length);
951	MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
952	#endif /* NDEBUG */
953
954	MAKE_STATS (cache->lookup_count++);
955
956	/* Lookup the entry in the hash table, hoping for an
957	early match. If not found, enter the string at IDX. */
958	idx = STRCACHE2_MOD_IT (cache, hash);
959	entry = cache->hash_tab[idx];
960	if (!entry)
961	return strcache2_enter_string (cache, idx, str, length, hash);
962	if (strcache2_is_equal (cache, entry, str, length, hash))
963	return (const char *)(entry + 1);
964	MAKE_STATS (cache->collision_1st_count++);
965
966	entry = entry->next;
967	if (!entry)
968	return strcache2_enter_string (cache, idx, str, length, hash);
969	if (strcache2_is_equal (cache, entry, str, length, hash))
970	return (const char *)(entry + 1);
971	MAKE_STATS (cache->collision_2nd_count++);
972
973	/* Loop the rest. */
974	for (;;)
975	{
976	entry = entry->next;
977	if (!entry)
978	return strcache2_enter_string (cache, idx, str, length, hash);
979	if (strcache2_is_equal (cache, entry, str, length, hash))
980	return (const char *)(entry + 1);
981	MAKE_STATS (cache->collision_3rd_count++);
982	}
983	/* not reached */
984	}
985
986	/* The public lookup (case insensitive) string interface. */
987	const char *
988	strcache2_ilookup (struct strcache2 cache, const char str, unsigned int length)
989	{
990	struct strcache2_entry const *entry;
991	unsigned int hash = strcache2_case_insensitive_hash (str, length);
992	unsigned int idx;
993
994	assert (!cache->case_insensitive);
995
996	MAKE_STATS (cache->lookup_count++);
997
998	/* Lookup the entry in the hash table, hoping for an
999	early match. */
1000	idx = STRCACHE2_MOD_IT (cache, hash);
1001	entry = cache->hash_tab[idx];
1002	if (!entry)
1003	return NULL;
1004	if (strcache2_is_equal (cache, entry, str, length, hash))
1005	return (const char *)(entry + 1);
1006	MAKE_STATS (cache->collision_1st_count++);
1007
1008	entry = entry->next;
1009	if (!entry)
1010	return NULL;
1011	if (strcache2_is_equal (cache, entry, str, length, hash))
1012	return (const char *)(entry + 1);
1013	MAKE_STATS (cache->collision_2nd_count++);
1014
1015	/* Loop the rest. */
1016	for (;;)
1017	{
1018	entry = entry->next;
1019	if (!entry)
1020	return NULL;
1021	if (strcache2_is_equal (cache, entry, str, length, hash))
1022	return (const char *)(entry + 1);
1023	MAKE_STATS (cache->collision_3rd_count++);
1024	}
1025	/* not reached */
1026	}
1027
1028	#endif /* HAVE_CASE_INSENSITIVE_FS */
1029
1030	/* Is the given string cached? returns 1 if it is, 0 if it isn't. */
1031	int
1032	strcache2_is_cached (struct strcache2 cache, const char str)
1033	{
1034	/* Check mandatory alignment first. */
1035	if (!((size_t)str & (sizeof (void *) - 1)))
1036	{
1037	/* Check the segment list and consider the question answered if the
1038	string is within one of them. (Could check it more thoroughly...) */
1039	struct strcache2_seg const *seg;
1040	for (seg = cache->seg_head; seg; seg = seg->next)
1041	if ((size_t)(str - seg->start) < seg->size)
1042	return 1;
1043	}
1044
1045	return 0;
1046	}
1047
1048
1049	/* Verify the integrity of the specified string, returning 0 if OK. */
1050	int
1051	strcache2_verify_entry (struct strcache2 cache, const char str)
1052	{
1053	struct strcache2_entry const *entry;
1054	unsigned int hash;
1055	unsigned int length;
1056	const char *end;
1057
1058	entry = (struct strcache2_entry const *)str - 1;
1059	if ((size_t)entry & (STRCACHE2_ENTRY_ALIGNMENT - 1))
1060	{
1061	fprintf (stderr,
1062	"strcache2[%s]: missaligned entry %p\nstring: %p=%s\n",
1063	cache->name, (void )entry, (void )str, str);
1064	return -1;
1065	}
1066
1067	end = memchr (str, '\0', entry->length + 1);
1068	length = end - str;
1069	if (length != entry->length)
1070	{
1071	fprintf (stderr,
1072	"strcache2[%s]: corrupt entry %p, length: %u, expected %u;\nstring: %s\n",
1073	cache->name, (void *)entry, length, entry->length, str);
1074	return -1;
1075	}
1076
1077	hash = cache->case_insensitive
1078	? strcache2_case_insensitive_hash (str, entry->length)
1079	: strcache2_case_sensitive_hash (str, entry->length);
1080	if (hash != entry->hash)
1081	{
1082	fprintf (stderr,
1083	"strcache2[%s]: corrupt entry %p, hash: %x, expected %x;\nstring: %s\n",
1084	cache->name, (void *)entry, hash, entry->hash, str);
1085	return -1;
1086	}
1087
1088	return 0;
1089	}
1090
1091
1092	/* Calculates the case sensitive hash values of the string.
1093	The first hash is returned, the other is put at HASH2P. */
1094	unsigned int strcache2_hash_str (const char str, unsigned int length, unsigned int hash2p)
1095	{
1096	*hash2p = 1;
1097	return strcache2_case_sensitive_hash (str, length);
1098	}
1099
1100	/* Calculates the case insensitive hash values of the string.
1101	The first hash is returned, the other is put at HASH2P. */
1102	unsigned int strcache2_hash_istr (const char str, unsigned int length, unsigned int hash2p)
1103	{
1104	*hash2p = 1;
1105	return strcache2_case_insensitive_hash (str, length);
1106	}
1107
1108
1109
1110	/* Initalizes a new cache. */
1111	void
1112	strcache2_init (struct strcache2 cache, const char name, unsigned int size,
1113	unsigned int def_seg_size, int case_insensitive, int thread_safe)
1114	{
1115	unsigned hash_shift;
1116	assert (!thread_safe);
1117
1118	/* calc the size as a power of two */
1119	if (!size)
1120	hash_shift = STRCACHE2_HASH_SHIFT;
1121	else
1122	{
1123	assert (size <= (~0U / 2 + 1));
1124	for (hash_shift = 8; (1U << hash_shift) < size; hash_shift++)
1125	/* nothing */;
1126	}
1127
1128	/* adjust the default segment size */
1129	if (!def_seg_size)
1130	def_seg_size = STRCACHE2_SEG_SIZE;
1131	else if (def_seg_size < sizeof (struct strcache2_seg) * 10)
1132	def_seg_size = sizeof (struct strcache2_seg) * 10;
1133	else if ((def_seg_size & 0xfff) < 0xf00)
1134	def_seg_size = ((def_seg_size + 0xfff) & ~0xfffU);
1135
1136
1137	/* init the structure. */
1138	cache->case_insensitive = case_insensitive;
1139	#ifdef STRCACHE2_USE_MASK
1140	cache->hash_mask = (1U << hash_shift) - 1U;
1141	#else
1142	cache->hash_div = strcache2_find_prime(hash_shift);
1143	#endif
1144	cache->count = 0;
1145	cache->collision_count = 0;
1146	cache->lookup_count = 0;
1147	cache->collision_1st_count = 0;
1148	cache->collision_2nd_count = 0;
1149	cache->collision_3rd_count = 0;
1150	cache->rehash_count = (1U << hash_shift) / 4 * 3; /* rehash at 75% */
1151	cache->init_size = 1U << hash_shift;
1152	cache->hash_size = 1U << hash_shift;
1153	cache->def_seg_size = def_seg_size;
1154	cache->lock = NULL;
1155	cache->name = name;
1156
1157	/* allocate the hash table and first segment. */
1158	cache->hash_tab = (struct strcache2_entry **)
1159	xmalloc (cache->init_size * sizeof (struct strcache2_entry *));
1160	memset (cache->hash_tab, '\0', cache->init_size * sizeof (struct strcache2_entry *));
1161	strcache2_new_seg (cache, 0);
1162
1163	/* link it */
1164	cache->next = strcache_head;
1165	strcache_head = cache;
1166	}
1167
1168
1169	/* Terminates a string cache, freeing all memory and other
1170	associated resources. */
1171	void
1172	strcache2_term (struct strcache2 *cache)
1173	{
1174	/* unlink it */
1175	if (strcache_head == cache)
1176	strcache_head = cache->next;
1177	else
1178	{
1179	struct strcache2 *prev = strcache_head;
1180	while (prev->next != cache)
1181	prev = prev->next;
1182	assert (prev);
1183	prev->next = cache->next;
1184	}
1185
1186	/* free the memory segments */
1187	do
1188	{
1189	void *free_it = cache->seg_head;
1190	cache->seg_head = cache->seg_head->next;
1191	free (free_it);
1192	}
1193	while (cache->seg_head);
1194
1195	/* free the hash and clear the structure. */
1196	free (cache->hash_tab);
1197	memset (cache, '\0', sizeof (struct strcache2));
1198	}
1199
1200	/* Print statistics a string cache. */
1201	void
1202	strcache2_print_stats (struct strcache2 cache, const char prefix)
1203	{
1204	unsigned int seg_count = 0;
1205	unsigned long seg_total_bytes = 0;
1206	unsigned long seg_avg_bytes;
1207	unsigned long seg_avail_bytes = 0;
1208	unsigned long seg_max_bytes = 0;
1209	struct strcache2_seg *seg;
1210	unsigned int str_count = 0;
1211	unsigned long str_total_len = 0;
1212	unsigned int str_avg_len;
1213	unsigned int str_min_len = ~0U;
1214	unsigned int str_max_len = 0;
1215	unsigned int idx;
1216	unsigned int rehashes;
1217	unsigned int chain_depths[32];
1218
1219	printf (_("\n%s strcache2: %s\n"), prefix, cache->name);
1220
1221	/* Segment statistics. */
1222	for (seg = cache->seg_head; seg; seg = seg->next)
1223	{
1224	seg_count++;
1225	seg_total_bytes += seg->size;
1226	seg_avail_bytes += seg->avail;
1227	if (seg->size > seg_max_bytes)
1228	seg_max_bytes = seg->size;
1229	}
1230	seg_avg_bytes = seg_total_bytes / seg_count;
1231	printf (_("%s %u segments: total = %lu / max = %lu / avg = %lu / def = %u avail = %lu\n"),
1232	prefix, seg_count, seg_total_bytes, seg_max_bytes, seg_avg_bytes,
1233	cache->def_seg_size, seg_avail_bytes);
1234
1235	/* String statistics. */
1236	memset (chain_depths, '\0', sizeof (chain_depths));
1237	idx = cache->hash_size;
1238	while (idx-- > 0)
1239	{
1240	struct strcache2_entry const *entry = cache->hash_tab[idx];
1241	unsigned int depth = 0;
1242	for (; entry != 0; entry = entry->next, depth++)
1243	{
1244	unsigned int length = entry->length;
1245	str_total_len += length;
1246	if (length > str_max_len)
1247	str_max_len = length;
1248	if (length < str_min_len)
1249	str_min_len = length;
1250	str_count++;
1251	}
1252	chain_depths[depth >= 32 ? 31 : depth]++;
1253	}
1254	str_avg_len = cache->count ? str_total_len / cache->count : 0;
1255	printf (_("%s %u strings: total len = %lu / max = %u / avg = %u / min = %u\n"),
1256	prefix, cache->count, str_total_len, str_max_len, str_avg_len, str_min_len);
1257	if (str_count != cache->count)
1258	printf (_("%s string count mismatch! cache->count=%u, actual count is %u\n"), prefix,
1259	cache->count, str_count);
1260
1261	/* Hash statistics. */
1262	idx = cache->init_size;
1263	rehashes = 0;
1264	while (idx < cache->hash_size)
1265	{
1266	idx *= 2;
1267	rehashes++;
1268	}
1269
1270	#ifdef STRCACHE2_USE_MASK
1271	printf (_("%s hash size = %u mask = %#x rehashed %u times"),
1272	prefix, cache->hash_size, cache->hash_mask, rehashes);
1273	#else
1274	printf (_("%s hash size = %u div = %#x rehashed %u times"),
1275	prefix, cache->hash_size, cache->hash_div, rehashes);
1276	#endif
1277	if (cache->lookup_count)
1278	printf (_("%s lookups = %lu\n"
1279	"%s hash collisions 1st = %lu (%u%%) 2nd = %lu (%u%%) 3rd = %lu (%u%%)"),
1280	prefix, cache->lookup_count,
1281	prefix,
1282	cache->collision_1st_count, (unsigned int)((100.0 * cache->collision_1st_count) / cache->lookup_count),
1283	cache->collision_2nd_count, (unsigned int)((100.0 * cache->collision_2nd_count) / cache->lookup_count),
1284	cache->collision_3rd_count, (unsigned int)((100.0 * cache->collision_3rd_count) / cache->lookup_count));
1285	printf (_("\n%s hash insert collisions = %u (%u%%)\n"),
1286	prefix, cache->collision_count,(unsigned int)((100.0 * cache->collision_count) / cache->count));
1287	printf (_("%s %5u (%u%%) empty hash table slots\n"),
1288	prefix, chain_depths[0], (unsigned int)((100.0 * chain_depths[0]) / cache->hash_size));
1289	printf (_("%s %5u (%u%%) occupied hash table slots\n"),
1290	prefix, chain_depths[1], (unsigned int)((100.0 * chain_depths[1]) / cache->hash_size));
1291	for (idx = 2; idx < 32; idx++)
1292	{
1293	unsigned strs_at_this_depth = chain_depths[idx];
1294	unsigned i;
1295	for (i = idx + 1; i < 32; i++)
1296	strs_at_this_depth += chain_depths[i];
1297	if (strs_at_this_depth)
1298	printf (_("%s %5u (%2u%%) with %u string%s chained on; %5u (%2u%%) strings at depth %u.\n"),
1299	prefix, chain_depths[idx], (unsigned int)((100.0 * chain_depths[idx]) / (cache->count - cache->collision_count)),
1300	idx - 1, idx == 2 ? " " : "s",
1301	strs_at_this_depth, (unsigned int)((100.0 * strs_at_this_depth) / cache->count), idx - 1);
1302	}
1303	}
1304
1305	/* Print statistics for all string caches. */
1306	void
1307	strcache2_print_stats_all (const char *prefix)
1308	{
1309	struct strcache2 *cur;
1310	for (cur = strcache_head; cur; cur = cur->next)
1311	strcache2_print_stats (cur, prefix);
1312	}
1313

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/kmk/strcache2.c@ 1943

Download in other formats: