From 1169bd383421946df3a8951a7a53bdea540749db Mon Sep 17 00:00:00 2001
From: charlesgregory
Date: Wed, 26 Aug 2020 15:55:02 -0400
Subject: [PATCH 1/3] added khash and rearranged benchmarks
---
.../bench/benchmark_hashmaps.d | 15 +-
source/dklib/khashl.d | 565 ++++++++++++++++++
source/dklib/package.d | 1 +
3 files changed, 578 insertions(+), 3 deletions(-)
rename source/dklib/benchmark_khash.d => examples/bench/benchmark_hashmaps.d (81%)
create mode 100644 source/dklib/khashl.d
diff --git a/source/dklib/benchmark_khash.d b/examples/bench/benchmark_hashmaps.d
similarity index 81%
rename from source/dklib/benchmark_khash.d
rename to examples/bench/benchmark_hashmaps.d
index fd7d74d..289548c 100644
--- a/source/dklib/benchmark_khash.d
+++ b/examples/bench/benchmark_hashmaps.d
@@ -2,7 +2,8 @@
dependency "emsi_containers" version="~>0.7"
dependency "dklib" path="../.."
+/
-import khash;
+import dklib.khash;
+import dklib.khashl;
import containers;
import std.datetime.stopwatch : StopWatch, AutoStart;
@@ -17,9 +18,14 @@ int main()
enum NUMBER_OF_ITEMS = 500_000;
- void testContainerInsert(alias Container, string ContainerName)()
+ void testContainerInsert(alias Container, string ContainerName, bool cached = false)()
{
- auto c = Container!(string, int)();
+ static if(cached){
+ static assert(ContainerName == "khashl (cached)");
+ auto c = Container!(string, int,true,true,true)();
+ }else{
+ auto c = Container!(string, int)();
+ }
StopWatch sw = StopWatch(AutoStart.yes);
foreach (i; 0 .. NUMBER_OF_ITEMS)
@@ -61,9 +67,12 @@ int main()
testContainerInsert!(HashMap, "HashMap");
testContainerInsert!(khash, "khash");
+ testContainerInsert!(khashl, "khashl");
+ testContainerInsert!(khashl, "khashl (cached)",true);
testContainerLookup!(HashMap, "HashMap");
testContainerLookup!(khash, "khash");
+ testContainerLookup!(khashl, "khashl");
return 0;
}
diff --git a/source/dklib/khashl.d b/source/dklib/khashl.d
new file mode 100644
index 0000000..dfccc7e
--- /dev/null
+++ b/source/dklib/khashl.d
@@ -0,0 +1,565 @@
+/* The MIT License
+ Copyright (c) 2019 by Attractive Chaos
+ Copyright (c) 2019 James S Blachly, MD
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+module dklib.khashl;
+
+import std.traits : isNumeric, isSomeString, isSigned, hasMember;
+import core.stdc.stdint; // uint32_t, etc.
+import core.memory; // GC
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+enum AC_VERSION_KHASHL_H = "0.1";
+
+import core.stdc.stdlib;
+import core.stdc.string;
+import core.stdc.limits;
+
+/* compiler specific configuration */
+
+alias khint32_t = uint;
+
+alias khint64_t = ulong;
+
+alias khint_t = khint32_t;
+alias khiter_t = khint_t;
+
+pragma(inline, true)
+{
+ auto __kh_used(T)(const(khint32_t)* flag, T i)
+ {
+ return (flag[i >> 5] >> (i & 0x1fU) & 1U);
+ }
+ void __kh_set_used(T)(khint32_t* flag, T i)
+ {
+ (flag[i >> 5] |= 1U << (i & 0x1fU));
+ }
+ void __kh_set_unused(T)(khint32_t* flag, T i)
+ {
+ (flag[i >> 5] &= ~(1U << (i & 0x1fU)));
+ }
+
+ khint_t __kh_h2b(khint_t hash, khint_t bits)
+ {
+ return hash * 2654435769U >> (32 - bits);
+ }
+
+ auto __kh_fsize(khint_t m){
+ return ((m) < 32? 1 : (m)>>5);
+ }
+}
+
+alias kcalloc = calloc;
+
+alias kmalloc = malloc;
+
+alias krealloc = realloc;
+
+alias kfree = free;
+
+/* Straight port of khashl's generic C approach
+ Khashl is hash table that performs deletions without tombstones
+ The main benefit over khash is that it uses less memory however it is
+ also faster than khash if no deletions are involved.
+
+ Can use cached-hashes for faster comparison of string key hashes
+ Attractive chaos has this to say about caching hash values (https://attractivechaos.wordpress.com/):
+
+ When we use long strings as keys, comparing two keys may take significant time.
+ This comparison is often unnecessary. Note that the hash of a string is a good
+ summary of the string. If two strings are different, their hashes are often
+ different. We can cache the hash and only compare two keys when their hashes are
+ equal. It is possible to implement the idea with any hash table implementations.
+
+ If hash-caching is used we use compile time statements to change the bucket type to include
+ the hash member. We also change the logic to make equality statements check hash equality before
+ checking the key equalitys and the put and get methods to make sure they don't recompute the hashes.
+**/
+template khashl(KT, VT, bool kh_is_map = true, bool cached = false, bool useGC = true)
+{
+ static assert(!isSigned!KT, "Numeric key types must be unsigned -- try uint instead of int, etc.");
+
+ alias __hash_func = kh_hash!KT.kh_hash_func;
+ alias __hash_equal= kh_equal!(Bucket,cached).kh_hash_equal;
+
+ alias kh_t = khashl; /// klib uses 'kh_t' struct name
+
+ struct Bucket {
+ KT key;
+ static if(kh_is_map) VT val;
+ static if(cached) khint_t hash;
+ }
+
+ struct khashl // @suppress(dscanner.style.phobos_naming_convention)
+ {
+ khint_t bits, count;
+ khint32_t *used;
+ Bucket * keys;
+
+ ~this()
+ {
+ //kh_destroy(&this); // the free(h) at the end of kh_destroy will SIGSEGV
+ static if (useGC) {
+ GC.removeRange(this.keys);
+ }
+ kfree(cast(void*) this.keys);
+ kfree(cast(void*) this.used);
+ }
+
+ /// Lookup by key
+ ref VT opIndex(KT key)
+ {
+ Bucket ins;
+ ins.key = key;
+ static if(cached) ins.hash = __hash_func(ins.key); //cache the hash
+ auto x = kh_get(&this, ins);
+ return this.keys[x].val;
+ }
+
+ /// Assign by key
+ void opIndexAssign(VT val, KT key)
+ {
+ int absent;
+ Bucket ins;
+ ins.key = key;
+ static if(cached) ins.hash = __hash_func(ins.key); //cache the hash
+ auto x = kh_put(&this, ins, &absent);
+ this.keys[x].val = val;
+ static if(cached) this.keys[x].hash = ins.hash; //cache the hash
+ }
+
+ /// remove key/value pair
+ void remove(KT key)
+ {
+ Bucket ins;
+ ins.key = key;
+ static if(cached) ins.hash = __hash_func(ins.key); //cache the hash
+ auto x = kh_get(&this, ins);
+ kh_del(&this, x);
+ }
+
+ /// Get or create if does not exist; mirror built-in hashmap
+ /// https://dlang.org/spec/hash-map.html#inserting_if_not_present
+ ref VT require(KT key, lazy VT initval)
+ {
+ static assert (kh_is_map == true, "require() not sensible in a hash set");
+ Bucket ins;
+ ins.key = key;
+ static if(cached) ins.hash = __hash_func(ins.key); //cache the hash
+ auto x = kh_get(&this, ins);
+ if (x == kh_end(&this)) {
+ // not present
+ int absent;
+ x = kh_put(&this, ins, &absent);
+ this.keys[x].val = initval;
+ static if(cached) this.keys[x].hash = ins.hash; //cache the hash
+ }
+ return this.keys[x].val;
+ }
+
+ /// Return an InputRange over the keys.
+ /// Manipulating the hash table during iteration results in undefined behavior.
+ /// Returns: Voldemort type
+ auto byKey()
+ {
+ /** Manipulating the hash table during iteration results in undefined behavior */
+ struct KeyRange
+ {
+ kh_t* kh;
+ khint_t itr;
+ bool empty() // non-const as may call popFront
+ {
+ //return (this.itr == kh_end(this.kh));
+ if (this.itr == kh_end(this.kh)) return true;
+ // Handle the case of deleted keys
+ else if (__kh_used(this.kh.used, this.itr) == 0) {
+ while(__kh_used(this.kh.used, this.itr) == 0) {
+ this.popFront();
+ if (this.itr == kh_end(this.kh)) return true;
+ }
+ return false;
+ }
+ return false;
+ }
+ ref KT front()
+ {
+ return kh.keys[this.itr].key;
+ }
+ void popFront()
+ {
+ if(this.itr < kh_end(this.kh)) {
+ this.itr++;
+ }
+ }
+ }
+ return KeyRange(&this);
+ }
+ }
+
+ void kh_clear(kh_t* h);
+ int kh_resize(kh_t* h, khint_t new_n_buckets);
+ khint_t kh_putp(kh_t* h, Bucket * key, int* absent);
+ khint_t kh_put(kh_t* h, Bucket key, int* absent);
+ int kh_del(kh_t* h, khint_t i);
+
+ deprecated("kept for source-level homology; use D-style RAII")
+ kh_t* kh_init()
+ {
+ return cast(kh_t*) kcalloc(1, kh_t.sizeof);
+ }
+
+ deprecated("kept for source-level homology; kfree(h) will SIGSEGV when called as kh_destroy(&this)")
+ void kh_destroy(kh_t* h)
+ {
+ if (h)
+ {
+ kfree(cast(void*) h.keys);
+ kfree(cast(void*) h.used);
+ kfree(h);
+ }
+ }
+
+ void kh_clear(kh_t* h)
+ {
+ if (h && h.used)
+ {
+ uint32_t n_buckets = 1U << h.bits;
+ memset(h.used, 0, __kh_fsize(n_buckets) * khint32_t.sizeof);
+ h.count = 0;
+ }
+ }
+
+ khint_t kh_getp(const(kh_t)* h, Bucket * key)
+ {
+ khint_t i, last, n_buckets, mask;
+ if (h.keys == null) return 0;
+ n_buckets = 1U << h.bits;
+ mask = n_buckets - 1U;
+
+ /// if using caching, don't rehash key
+ static if(cached) i = last = __kh_h2b((*key).hash, h.bits);
+ else i = last = __kh_h2b(__hash_func((*key).key), h.bits);
+
+ while (__kh_used(h.used, i) && !__hash_equal(h.keys[i], *key)) {
+ i = (i + 1U) & mask;
+ if (i == last) return n_buckets;
+ }
+ return !__kh_used(h.used, i)? n_buckets : i;
+ }
+ khint_t kh_get(const(kh_t) *h, Bucket key) { return kh_getp(h, &key); }
+
+ int kh_resize(kh_t *h, khint_t new_n_buckets)
+ {
+ khint32_t * new_used = null;
+ khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask;
+ while ((x >>= 1) != 0) ++j;
+ if (new_n_buckets & (new_n_buckets - 1)) ++j;
+ new_bits = j > 2? j : 2;
+ new_n_buckets = 1U << new_bits;
+ if (h.count > (new_n_buckets>>1) + (new_n_buckets>>2)) return 0; /* requested size is too small */
+ new_used = cast(khint32_t*)kmalloc(__kh_fsize(new_n_buckets) * khint32_t.sizeof);
+ memset(new_used, 0, __kh_fsize(new_n_buckets) * khint32_t.sizeof);
+ if (!new_used) return -1; /* not enough memory */
+ n_buckets = h.keys? 1U< new_n_buckets) /* shrink the hash table */
+ h.keys = cast(Bucket*)krealloc(cast(void *)h.keys, new_n_buckets * Bucket.sizeof);
+ kfree(h.used); /* free the working space */
+ h.used = new_used, h.bits = new_bits;
+ return 0;
+ }
+
+ khint_t kh_putp(kh_t *h, Bucket * key, int *absent)
+ {
+ khint_t n_buckets, i, last, mask;
+ n_buckets = h.keys? 1U<= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */
+ if (kh_resize(h, n_buckets + 1U) < 0)
+ return n_buckets;
+ n_buckets = 1U< i && (k <= i || k > j)) || (j < i && (k <= i && k > j)))
+ h.keys[i] = h.keys[j], i = j;
+ }
+ __kh_set_unused(h.used, i);
+ --h.count;
+ return 1;
+ }
+
+ auto kh_bucket(const(kh_t)* h, khint_t x)
+ {
+ return h.keys[x];
+ }
+
+ auto kh_key(const(kh_t)* h, khint_t x)
+ {
+ return h.keys[x].key;
+ }
+
+ auto kh_val(const(kh_t)* h, khint_t x)
+ {
+ return h.keys[x].val;
+ }
+
+ auto kh_end(const(kh_t)* h)
+ {
+ return kh_capacity(h);
+ }
+
+ auto kh_size(const(kh_t)* h)
+ {
+ return h.count;
+ }
+
+ auto kh_capacity(const(kh_t)* h)
+ {
+ return h.keys ? 1U<> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+ }
+
+ auto kh_hash_func(T)(T key)
+ if (is(T == ulong) || is(T == uint64_t) || is(T == khint64_t))
+ {
+ key = ~key + (key << 21);
+ key = key ^ key >> 24;
+ key = (key + (key << 3)) + (key << 8);
+ key = key ^ key >> 14;
+ key = (key + (key << 2)) + (key << 4);
+ key = key ^ key >> 28;
+ key = key + (key << 31);
+ return cast(khint_t) key;
+ }
+
+ khint_t kh_hash_str(const(char)* s)
+ {
+ khint_t h = cast(khint_t)*s;
+ if (h) for (++s; *s; ++s) h = (h << 5) - h + cast(khint_t)*s;
+ return h;
+ }
+
+ auto kh_hash_func(T)(T* key)
+ if(is(T == char) || is(T == const(char)) || is(T == immutable(char)))
+ {
+ return kh_hash_str(key);
+ }
+
+ auto kh_hash_func(T)(T key)
+ if(isSomeString!T)
+ {
+ // rewrite __ac_X31_hash_string for D string/smart array
+ if (key.length == 0) return 0;
+ khint_t h = key[0];
+ for (int i=1; i
Date: Wed, 26 Aug 2020 15:58:33 -0400
Subject: [PATCH 2/3] removed dub line for non-existent file
---
dub.json | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/dub.json b/dub.json
index a19e755..efb4b16 100644
--- a/dub.json
+++ b/dub.json
@@ -10,6 +10,5 @@
"unittest-inline": {
"buildOptions": ["unittests", "optimize", "inline"]
}
- },
- "excludedSourceFiles": ["source/dklib/benchmark_khash.d"]
+ }
}
From f24d44252e14f407b60c8be9970fe495b2d0f84f Mon Sep 17 00:00:00 2001
From: Charles Gregory
Date: Thu, 27 Aug 2020 16:21:11 -0400
Subject: [PATCH 3/3] added more benchmarks and updated readme
---
README.md | 28 +++++++++---
examples/bench/benchmark_hashmaps.d | 68 +++++++++++++++++++++--------
2 files changed, 70 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
index 0463e16..bc65a46 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,27 @@
Library of efficient algorithms and data structures ported to templatized D
from attractivechaos' generic C approach: https://github.com/attractivechaos/klib
-# khash
+# khash & khashl
## Fast
Comparison with [emsi containers](https://github.com/dlang-community/containers) HashMap:
-Time, in msec, for n=500,000 operations benchmarked on linux VM; ldc2 -release
+Time, in msec, for n=500,000 operations with uint keys benchmarked on WSL; ldc2 -release
-| Operation | HashMap | khash |
-|-------------------|---------|-------|
-| Insert | 3573 | 2347 |
-| Retrieve (Serial) | 145 | 26 |
-| Retrieve (Random) | 282 | 84 |
+| Operation | HashMap | khash | khashl |
+|-------------------|---------|-------|--------|
+| Insert | 1168 | 411 | 401 |
+| Retrieve (Serial) | 83 | 20 | 110 |
+| Retrieve (Random) | 198 | 91 | 134 |
+
+Time, in msec, for n=500,000 operations with string keys benchmarked on WSL; ldc2 -release
+
+| Operation | HashMap | khash | khashl | khashl (cached) |
+|-------------------|---------|-------|--------|-----------------|
+| Insert | 1727 | 782 | 903 | 494 |
+| Retrieve (Serial) | 232 | 261 | 267 | 240 |
+| Retrieve (Random) | 404 | 420 | 422 | 422 |
## Notes
@@ -26,6 +34,10 @@ pass optional third template parameter `kh_is_map = false`.
By default, memory allocated by the hashmap will be scanned by the GC.
(pass optional fourth template parameter `useGC = false` to disable)
+For ```khashl```, hash caching can be enabled by passing an optional fourth
+ template parameter `cached = true`. This allows faster insertions for
+ string keys.
+
Can undergo static initialization (e.g. define as struct member
with no extra init code needed in struct ctor), unlike
[emsi containers](https://github.com/dlang-community/containers) HashMap.
@@ -36,6 +48,8 @@ with no extra init code needed in struct ctor), unlike
### Declaration
```D
auto map = khash!(keytype, valuetype);
+auto map2 = khashl!(keytype, valuetype);
+auto map3 = khashl!(string, valuetype,true,true); // for hash-caching with strings
```
### Assignment / Insert
diff --git a/examples/bench/benchmark_hashmaps.d b/examples/bench/benchmark_hashmaps.d
index 289548c..75e3ce2 100644
--- a/examples/bench/benchmark_hashmaps.d
+++ b/examples/bench/benchmark_hashmaps.d
@@ -11,6 +11,7 @@ import std.stdio;
import std.uuid : randomUUID;
string global_x; /// for benchmarking to prevent elision(?)
+uint global_y;
int main()
{
@@ -18,38 +19,55 @@ int main()
enum NUMBER_OF_ITEMS = 500_000;
- void testContainerInsert(alias Container, string ContainerName, bool cached = false)()
+ void testContainerInsert(T, alias Container, string ContainerName, bool cached = false)()
+ if(is(T == uint) || is(T == string))
{
static if(cached){
static assert(ContainerName == "khashl (cached)");
- auto c = Container!(string, int,true,true,true)();
+ static if(is(T == uint)) auto c = Container!(uint, string,true,true,true)();
+ else auto c = Container!(string, uint,true,true,true)();
}else{
- auto c = Container!(string, int)();
+ static if(is(T == uint)) auto c = Container!(uint, string)();
+ else auto c = Container!(string, uint)();
}
StopWatch sw = StopWatch(AutoStart.yes);
foreach (i; 0 .. NUMBER_OF_ITEMS)
//c.insert(i);
- c[randomUUID().toString] = i;
+ static if(is(T == uint)) c[i] = randomUUID().toString;
+ else c[randomUUID().toString] = i;
sw.stop();
- writeln("Inserts for ", ContainerName, " finished in ",
+ writeln(T.stringof~" inserts for ", ContainerName, " finished in ",
sw.peek.total!"msecs", " milliseconds.");
}
- void testContainerLookup(alias Container, string ContainerName)()
+ void testContainerLookup(T, alias Container, string ContainerName, bool cached = false)()
+ if(is(T == uint) || is(T == string))
{
import std.random : uniform;
- auto c = Container!(uint, string)();
+ static if(cached){
+ static assert(ContainerName == "khashl (cached)");
+ static if(is(T == uint)) auto c = Container!(uint, string,true,true,true)();
+ else auto c = Container!(string, uint,true,true,true)();
+ }else{
+ static if(is(T == uint)) auto c = Container!(uint, string)();
+ else auto c = Container!(string, uint)();
+ }
// untimed insert
+ string[] items = new string[NUMBER_OF_ITEMS];
foreach (i; 0 .. NUMBER_OF_ITEMS)
- c[i] = randomUUID().toString;
+ items[i] = randomUUID().toString;
+ foreach (uint i,item; items)
+ static if(is(T == uint)) c[i] = item;
+ else c[item] = i;
StopWatch sw = StopWatch(AutoStart.yes);
// serial lookups
foreach (i; 0 .. NUMBER_OF_ITEMS)
- global_x = c[i];
+ static if(is(T == uint)) global_x = c[i];
+ else global_y = c[items[i]];
sw.stop();
- writeln("Serial Lookups for ", ContainerName, " finished in ",
+ writeln("Serial "~T.stringof~" lookups for ", ContainerName, " finished in ",
sw.peek.total!"msecs", " milliseconds.");
sw.reset();
@@ -57,22 +75,34 @@ int main()
// random lookups
sw.start();
foreach(i; 0 .. NUMBER_OF_ITEMS)
- global_x = c[ uniform(0, NUMBER_OF_ITEMS) ];
+ static if(is(T == uint)) global_x = c[ uniform(0, NUMBER_OF_ITEMS) ];
+ else global_y = c[ items[uniform(0, NUMBER_OF_ITEMS)] ];
sw.stop();
- writeln("Random lookups for ", ContainerName, " finished in ",
+ writeln("Random "~ T.stringof ~" lookups for ", ContainerName, " finished in ",
sw.peek.total!"msecs", " milliseconds.");
writeln("Confirming stored value of last lookup: ", global_x);
}
- testContainerInsert!(HashMap, "HashMap");
- testContainerInsert!(khash, "khash");
- testContainerInsert!(khashl, "khashl");
- testContainerInsert!(khashl, "khashl (cached)",true);
+ testContainerInsert!(uint, HashMap, "HashMap");
+ testContainerInsert!(uint, khash, "khash");
+ testContainerInsert!(uint, khashl, "khashl");
+ // testContainerInsert!(uint, khashl, "khashl (cached)",true);
+
+ testContainerInsert!(string, HashMap, "HashMap");
+ testContainerInsert!(string, khash, "khash");
+ testContainerInsert!(string, khashl, "khashl");
+ testContainerInsert!(string, khashl, "khashl (cached)",true);
+
+ testContainerLookup!(uint, HashMap, "HashMap");
+ testContainerLookup!(uint, khash, "khash");
+ testContainerLookup!(uint, khashl, "khashl");
+ // testContainerLookup!(uint, khashl, "khashl (cached)",true);
- testContainerLookup!(HashMap, "HashMap");
- testContainerLookup!(khash, "khash");
- testContainerLookup!(khashl, "khashl");
+ testContainerLookup!(string, HashMap, "HashMap");
+ testContainerLookup!(string, khash, "khash");
+ testContainerLookup!(string, khashl, "khashl");
+ testContainerLookup!(string, khashl, "khashl (cached)",true);
return 0;
}