aboutsummaryrefslogtreecommitdiff
path: root/include/pub_tool_hashtable.h
diff options
context:
space:
mode:
authorphilippe <philippe@a5019735-40e9-0310-863c-91ae7b9d1cf9>2014-06-14 16:30:09 +0000
committerphilippe <philippe@a5019735-40e9-0310-863c-91ae7b9d1cf9>2014-06-14 16:30:09 +0000
commit7293d2530f8c60c1060f9f003e214cc341d35266 (patch)
tree9ef44dfbe277a68c24a312c16401126febfe3737 /include/pub_tool_hashtable.h
parentdcef54c4c2bb59747fa83bf877490b60984c19a5 (diff)
downloadvalgrind-7293d2530f8c60c1060f9f003e214cc341d35266.tar.gz
This patch adds a 'de-duplicating memory pool allocator':
include/pub_tool_deduppoolalloc.h coregrind/pub_core_deduppoolalloc.h coregrind/m_deduppoolalloc.c and uses it (currently only) for the strings in m_debuginfo/storage.c The idea is that such ddup pool allocator will also be used for other highly duplicated information (e.g. the DiCFSI information), where significant gains can also be achieved. The dedup pool for strings also decreases significantly the memory needed by the read inline information (patch still to be committed, see bug 278972). When testing with a big executable (tacot_process), this reduces the size of the dinfo arena from trunk: 158941184/109760512 max/curr mmap'd, 156775944/107882728 max/curr, to ddup: 157892608/106614784 max/curr mmap'd, 156362160/101414712 max/curr (so 3Mb less mmap-ed once debug info is read, 1Mb less mmap-ed in peak, 6Mb less allocated once debug info is read). This is all gained due to the string which changes from: trunk: 17,434,704 in 266: di.storage.addStr.1 to ddup: 10,966,608 in 750: di.storage.addStr.1 (6.5Mb less memory used by strings) The gain in mmap-ed memory is smaller due to fragmentation. Probably one could decrease the fragmentation by using bigger size for the dedup pool, but then we would lose memory on the last allocated pool (and for small libraries, we often do not use much of a big pool block). Solution might be to increase the pool size but have a "shrink_block" operation. To be looked at in the future. In terms of performance, startup of a big executable (on an old pentium) is not influenced significantly (something like 0.1 seconds on 15 seconds startup for a big executable, on a slow pentium). The dedup pool uses a hash table. The hash function used currently is the VG_(adler32) check sum. It is reported (and visible also here) that this checksum is not a very good hash function (many collisions). To have statistics about collisions, use --stats -v -v -v As an example of the collisions, on the strings in debug info of memcheck tool on x86, one obtain: --4789-- dedupPA:di.storage.addStr.1 9983 allocs (8174 uniq) 11 pools (4820 bytes free in last pool) --4789-- nr occurences of chains of len N, N-plicated keys, N-plicated elts --4789-- N: 0 : nr chain 6975, nr keys 0, nr elts 0 --4789-- N: 1 : nr chain 3670, nr keys 6410, nr elts 8174 --4789-- N: 2 : nr chain 1070, nr keys 226, nr elts 0 --4789-- N: 3 : nr chain 304, nr keys 100, nr elts 0 --4789-- N: 4 : nr chain 104, nr keys 84, nr elts 0 --4789-- N: 5 : nr chain 72, nr keys 42, nr elts 0 --4789-- N: 6 : nr chain 44, nr keys 34, nr elts 0 --4789-- N: 7 : nr chain 18, nr keys 13, nr elts 0 --4789-- N: 8 : nr chain 17, nr keys 8, nr elts 0 --4789-- N: 9 : nr chain 4, nr keys 6, nr elts 0 --4789-- N:10 : nr chain 9, nr keys 4, nr elts 0 --4789-- N:11 : nr chain 1, nr keys 0, nr elts 0 --4789-- N:13 : nr chain 1, nr keys 1, nr elts 0 --4789-- total nr of unique chains: 12289, keys 6928, elts 8174 which shows that on 8174 different strings, we have only 6410 strings which have a unique hash value. As other examples, N:13 line shows we have 13 strings mapping to the same key. N:14 line shows we have 4 groups of 10 strings mapping to the same key, etc. So, adler32 is definitely a bad hash function. Trials have been done with another hash function, giving a much lower collision rate. So, a better (but still fast) hash function would probably be beneficial. To be looked at ... git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14029 a5019735-40e9-0310-863c-91ae7b9d1cf9
Diffstat (limited to 'include/pub_tool_hashtable.h')
-rw-r--r--include/pub_tool_hashtable.h26
1 files changed, 24 insertions, 2 deletions
diff --git a/include/pub_tool_hashtable.h b/include/pub_tool_hashtable.h
index fc721ba8f..d640c39e4 100644
--- a/include/pub_tool_hashtable.h
+++ b/include/pub_tool_hashtable.h
@@ -63,14 +63,36 @@ extern Int VG_(HT_count_nodes) ( VgHashTable table );
/* Add a node to the table. Duplicate keys are permitted. */
extern void VG_(HT_add_node) ( VgHashTable t, void* node );
-/* Looks up a VgHashNode in the table. Returns NULL if not found. If entries
+/* Looks up a VgHashNode by key in the table.
+ * Returns NULL if not found. If entries
* with duplicate keys are present, the most recently-added of the dups will
* be returned, but it's probably better to avoid dups altogether. */
extern void* VG_(HT_lookup) ( VgHashTable table, UWord key );
-/* Removes a VgHashNode from the table. Returns NULL if not found. */
+/* Removes a VgHashNode by key from the table. Returns NULL if not found. */
extern void* VG_(HT_remove) ( VgHashTable table, UWord key );
+typedef Word (*HT_Cmp_t) ( const void* node1, const void* node2 );
+
+/* Same as VG_(HT_lookup) and VG_(HT_remove), but allowing a part of or
+ the full element to be compared for equality, not only the key.
+ The typical use for the below function is to store a hash value of the
+ element in the key, and have the comparison function checking for equality
+ of the full element data.
+ Attention about the comparison function:
+ * It must *not* compare the 'next' pointer.
+ * when comparing the rest of the node, if the node data contains holes
+ between components, either the node memory should be fully initialised
+ (e.g. allocated using VG_(calloc)) or each component should be compared
+ individually. */
+extern void* VG_(HT_gen_lookup) ( VgHashTable table, void* node, HT_Cmp_t cmp );
+extern void* VG_(HT_gen_remove) ( VgHashTable table, void* node, HT_Cmp_t cmp );
+
+/* Output detailed usage/collision statistics.
+ cmp will be used to verify if 2 elements with the same key are equal.
+ Use NULL cmp if the hash table elements are only to be compared by key. */
+extern void VG_(HT_print_stats) ( VgHashTable table, HT_Cmp_t cmp );
+
/* Allocates a suitably-sized array, copies pointers to all the hashtable
elements into it, then returns both the array and the size of it. The
array must be freed with VG_(free). */