commit 790f63b30946aac744dd6f193f2e69785dfb63f0
Author: Tony Garnock-Jones
Date:   Mon Jun 29 20:39:49 2015 -0400

    Initial commit

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..60a451b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,5 @@
+t: *.c
+	gcc -Wall -o $@ *.c
+
+clean:
+	rm -f t
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..50197e9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,102 @@
+# TreeTrie
+
+An implementation of a trie map that can hold (patterns over) ordered
+trees in its keys.
+
+Uses djb's "critbit" structure for each branch node.
+
+There are two types to represent:
+
+    Trie = Ok
+         | Tl Trie
+         | Br Trie Branch
+
+The first `Trie` argument to `Br` is the wildcard case.
+
+    Branch = Mt
+           | Lf Trie Atom
+           | Nd Int Branch Branch
+
+The `Branch` type is the "critbit" type, extended with a value slot to
+make it into a map. We put the `Atom` last in a `Lf`, because we might
+want to use 32-bit pointers internal to our structures, but the hosted
+atoms might need 64-bit pointers.
+
+The empty `Trie`, which we'll write `empty`, is a cyclic structure (!):
+
+    empty = Br empty Mt
+
+It's the only such structure, so we might want to actually represent
+it as a distinct constant instead.
+
+Because `Ok` and `Mt` are nullary constants, we can represent them
+with special bit-patterns. Let's choose `NULL` for this. They don't
+need to be distinct, since they inhabit different types.
+
+This leaves us with four cases to represent, two in each type. While
+we could represent these using a single bit, we will instead use two
+bits, for debuggability.
+
+     33222222222211111111110000000000
+     10987654321098765432109876543210
+    |--------------------------------|
+
+    |--------------------------------|
+    | Trie pointer                 00|  Br case
+    |--------------------------------|
+    | Branch pointer               00|
+    |--------------------------------|
+
+    |--------------------------------|
+    | Trie pointer                 01|  Tl case
+    |--------------------------------|
+
+    |--------------------------------|
+    | Trie pointer                 10|  Lf case
+    |--------------------------------|
+    | Atom pointer                   |
+    | (may be 64 bits long)          |
+    |--------------------------------|
+
+    |--------------------------------|
+    | Int                          11|  Nd case
+    |--------------------------------|
+    | Branch pointer               00|
+    |--------------------------------|
+    | Branch pointer               00|
+    |--------------------------------|
+
+We use a weak hash table to index all our objects, because we need to
+hash-cons them.
+
+Perhaps it could be a Robin-Hood hashtable with backward shift
+deletion,
+.
+
+Hmm. For critbit to work, we need to be able to examine the
+bitpatterns of atoms. However, if those bitpatterns represent pointers
+to Racket-level objects, and Racket uses a moving collector, then not
+only will our atom pointers become out of date after a GC, even if we
+could update them we'd have to reindex each critbit tree.
+
+So we probably want `Atom` to instead be some index into a *different*
+table. (Another level of indirection!) While any (referenced) `Trie`
+holds an indirect reference to a given `Atom`, the underlying Racket
+object should be preserved across collections. We will need to find
+the `Atom` for a given Racket object (based on `equal?` rather than
+`eq?`), and the Racket object for a given `Atom` (an easy table
+lookup). We'll want to not hold an `Atom`'s Racket object longer than
+necessary.
+
+It might be better to have the tag bits in each pointer to an object,
+rather than in the object header: the tag bits would then identify
+which of four separate heaps (each with its own object size) is being
+referred to.
+
+Our data structures are never cyclic.
+
+Supporting direct atoms is probably a sensible thing to do, so that
+e.g. fixnums map to `Atom` without having to take up space in the
+table. In fact, the host language should probably allocate and manage
+the `Atom`-to-host-object table itself! That way, our code can be
+completely ignorant of that kind of detail.
diff --git a/fasthash.c b/fasthash.c
new file mode 100644
index 0000000..48e16ce
--- /dev/null
+++ b/fasthash.c
@@ -0,0 +1,99 @@
+/* The MIT License
+
+   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
+
+   Permission is hereby granted, free of charge, to any person
+   obtaining a copy of this software and associated documentation
+   files (the "Software"), to deal in the Software without
+   restriction, including without limitation the rights to use, copy,
+   modify, merge, publish, distribute, sublicense, and/or sell copies
+   of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include "fasthash.h"
+
+#include <string.h>
+
+/*
+ * Compression function for Merkle-Damgard construction.
+ * This function is generated using the framework provided.
+ *
+ * A function rather than a GNU statement-expression macro, so it is
+ * standard C and never modifies its argument.
+ */
+static inline uint64_t mix(uint64_t h)
+{
+    h ^= h >> 23;
+    h *= 0x2127599bf4325c37ULL;
+    h ^= h >> 47;
+    return h;
+}
+
+/*
+ * fasthash64 - hash len bytes at buf into 64 bits, starting from seed.
+ *
+ * Whole 8-byte words are read in host byte order; the 1..7 trailing
+ * bytes are folded in least-significant byte first.
+ */
+uint64_t fasthash64(const void *buf, size_t len, uint64_t seed)
+{
+    const uint64_t m = 0x880355f21e6d1965ULL;
+    const unsigned char *pos = buf;
+    const unsigned char *end = pos + (len & ~(size_t)7);
+    uint64_t h = seed ^ (len * m);
+    uint64_t v;
+
+    /* Copy each word out with memcpy: buf carries no alignment or
+       effective-type guarantee, so reading it through a uint64_t
+       pointer would be undefined behaviour for many callers. */
+    while (pos != end) {
+        memcpy(&v, pos, sizeof v);
+        pos += sizeof v;
+        h ^= mix(v);
+        h *= m;
+    }
+
+    v = 0;
+
+    switch (len & 7) {
+    case 7: v ^= (uint64_t)pos[6] << 48; /* fallthrough */
+    case 6: v ^= (uint64_t)pos[5] << 40; /* fallthrough */
+    case 5: v ^= (uint64_t)pos[4] << 32; /* fallthrough */
+    case 4: v ^= (uint64_t)pos[3] << 24; /* fallthrough */
+    case 3: v ^= (uint64_t)pos[2] << 16; /* fallthrough */
+    case 2: v ^= (uint64_t)pos[1] << 8;  /* fallthrough */
+    case 1: v ^= (uint64_t)pos[0];
+        h ^= mix(v);
+        h *= m;
+        break;
+    default: /* len is a multiple of 8: no trailing bytes */
+        break;
+    }
+
+    return mix(h);
+}
+
+/*
+ * fasthash32 - 32-bit variant: fasthash64 folded down to 32 bits.
+ */
+uint32_t fasthash32(const void *buf, size_t len, uint32_t seed)
+{
+    // the following trick converts the 64-bit hashcode to Fermat
+    // residue, which shall retain information from both the higher
+    // and lower parts of hashcode.
+    uint64_t h = fasthash64(buf, len, seed);
+    return (uint32_t)(h - (h >> 32));
+}
diff --git a/fasthash.h b/fasthash.h
new file mode 100644
index 0000000..15ac222
--- /dev/null
+++ b/fasthash.h
@@ -0,0 +1,56 @@
+/* The MIT License
+
+   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
+
+   Permission is hereby granted, free of charge, to any person
+   obtaining a copy of this software and associated documentation
+   files (the "Software"), to deal in the Software without
+   restriction, including without limitation the rights to use, copy,
+   modify, merge, publish, distribute, sublicense, and/or sell copies
+   of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef _FASTHASH_H
+#define _FASTHASH_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * fasthash32 - 32-bit implementation of fasthash
+ * @buf:  data buffer
+ * @len:  data size
+ * @seed: the seed
+ */
+    uint32_t fasthash32(const void *buf, size_t len, uint32_t seed);
+
+/**
+ * fasthash64 - 64-bit implementation of fasthash
+ * @buf:  data buffer
+ * @len:  data size
+ * @seed: the seed
+ */
+    uint64_t fasthash64(const void *buf, size_t len, uint64_t seed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..6855b81
--- /dev/null
+++ b/main.c
@@ -0,0 +1,114 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fasthash.h"
+#include "treetrie.h"
+
+/* Debug helper: prints the arena's counters, then one line per occupied
+   hash-table slot: slot number, node index, distance of the slot from the
+   node's home slot, reference count, and the node's tag-specific fields. */
+static void dump_arena(tt_arena_t *a) {
+  unsigned int i;
+
+  printf("max_probe: %u\n", a->max_probe);
+  printf("live_count: %u\n", a->live_count);
+  printf("table_length: %u\n", a->table_length);
+
+  for (i = 0; i < a->table_length; i++) {
+    tt_node_idx_t n = a->table[i];
+    unsigned int home;
+    unsigned int distance;
+
+    /* Reserved indices mark unoccupied slots. Skip them before hashing:
+       there is no in-use node behind them to hash. */
+    if (n < TT_FIRST_VALID_NODE_IDX) {
+      continue;
+    }
+
+    home = tt_hash_node(a, n) % a->table_length;
+    distance = (i + a->table_length - home) % a->table_length;
+
+    printf("%12u -> %12u: dist %d ref %d ",
+           i,
+           n,
+           (int) distance,
+           (int) a->headers[n].inuse.refcount);
+    switch (a->headers[n].inuse.tag) {
+      case TT_TAG_TAIL:
+        printf("tail %u\n", a->nodes[n].a);
+        break;
+      case TT_TAG_BRANCH:
+        printf("branch %u %u\n", a->nodes[n].a, a->nodes[n].b);
+        break;
+      case TT_TAG_LEAF:
+        printf("leaf %u %u\n", a->nodes[n].a, a->nodes[n].b);
+        break;
+      case TT_TAG_NODE:
+        printf("node index %d, %u %u\n",
+               (int) a->headers[n].inuse.index,
+               a->nodes[n].a,
+               a->nodes[n].b);
+        break;
+      default: /* unreachable: tag is a two-bit field */
+        printf("\n");
+        break;
+    }
+  }
+}
+
+/* Smoke test: repeatedly builds a ten-element chain of TT_TAG_NODE nodes
+   that all share one hash-consed leaf, dumping the arena after every cons
+   and after the head of each chain is dropped. */
+int main(int argc, char *argv[]) {
+  tt_arena_t a;
+  int i, outer;
+  tt_node_idx_t prev = TT_EMPTY;
+
+  (void) argc;
+  (void) argv;
+
+  setbuf(stdout, NULL);
+  if (tt_arena_init(&a) != 0) {
+    perror("tt_arena_init");
+    return EXIT_FAILURE;
+  }
+
+  for (outer = 0; outer < 10; outer++) {
+    /* cons hands back nodes with a zero reference count, so take a
+       reference first; dropping it again then recycles the chain head. */
+    tt_grab(&a, prev);
+    tt_drop(&a, prev);
+    printf("---------------------------------------- AFTER DROP of %d:\n", prev);
+    dump_arena(&a);
+    prev = TT_EMPTY;
+    printf("======================================== LOOP ITERATION %d\n", outer);
+    for (i = 0; i < 10; i++) {
+      tt_node_idx_t leaf = tt_arena_cons(&a,
+                                         TT_TAG_LEAF,
+                                         0,
+                                         TT_OK,
+                                         1001);
+      tt_node_idx_t curr = TT_ERROR;
+
+      if (leaf != TT_ERROR) {
+        curr = tt_arena_cons(&a,
+                             TT_TAG_NODE,
+                             0,
+                             leaf,
+                             prev);
+      }
+      if (curr == TT_ERROR) {
+        fprintf(stderr, "tt_arena_cons failed\n");
+        tt_arena_done(&a);
+        return EXIT_FAILURE;
+      }
+      dump_arena(&a);
+      prev = curr;
+    }
+  }
+
+  tt_arena_done(&a);
+  return EXIT_SUCCESS;
+}
diff --git a/treetrie.c b/treetrie.c
new file mode 100644
index 0000000..ac5f835
--- /dev/null
+++ b/treetrie.c
@@ -0,0 +1,292 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "treetrie.h"
+#include "fasthash.h"
+
+/* Hashes the four fields that identify a node, so that structurally equal
+   nodes always start probing from the same table slot. */
+static inline tt_hash_t hash(uint32_t tag,
+                             uint32_t index,
+                             tt_node_idx_t a,
+                             tt_node_idx_t b)
+{
+  uint32_t keyblock[4] = { tag,
+                           index,
+                           a,
+                           b };
+  assert(sizeof(keyblock) == 4 * sizeof(uint32_t));
+  return (tt_hash_t) fasthash32(keyblock, sizeof(keyblock), 0);
+}
+
+/* Hash of the in-use node stored at index i. Only meaningful while the node
+   is live: a node on the free chain has its header overlaid by next_free. */
+tt_hash_t tt_hash_node(tt_arena_t *a, tt_node_idx_t i) {
+  return hash(a->headers[i].inuse.tag,
+              a->headers[i].inuse.index,
+              a->nodes[i].a,
+              a->nodes[i].b);
+}
+
+/* Initialises a with a fixed capacity of 16 table slots and 16 node cells;
+   cells below TT_FIRST_VALID_NODE_IDX are reserved and never handed out.
+   Returns 0 on success. On allocation failure returns -1 with errno set to
+   ENOMEM and leaves *a zeroed, so calling tt_arena_done is still safe. */
+int tt_arena_init(tt_arena_t *a) {
+  int i;
+
+  a->max_probe = 0;
+  a->live_count = 0;
+  a->table_length = 16;
+  a->table = calloc(a->table_length, sizeof(a->table[0]));
+  a->headers = calloc(a->table_length, sizeof(a->headers[0]));
+  a->nodes = calloc(a->table_length, sizeof(a->nodes[0]));
+  a->free_chain = TT_ERROR;
+
+  if (a->table == NULL || a->headers == NULL || a->nodes == NULL) {
+    /* free(NULL) is a no-op, so no per-pointer checks are needed. */
+    free(a->table);
+    free(a->headers);
+    free(a->nodes);
+    memset(a, 0, sizeof(*a));
+    errno = ENOMEM;
+    return -1;
+  }
+
+  /* Push usable cells in descending order so that the free chain hands out
+     the lowest index first. */
+  for (i = a->table_length - 1; i >= TT_FIRST_VALID_NODE_IDX; i--) {
+    a->headers[i].next_free = a->free_chain;
+    a->free_chain = i;
+  }
+
+  return 0;
+}
+
+/* Arena growth is not implemented yet. Report failure instead of asserting
+   so that tt_arena_cons can return TT_ERROR, as its contract promises, and
+   so that this non-void function returns a value on every path even when
+   NDEBUG is defined. */
+static int tt_grow(tt_arena_t *a) {
+  (void) a;
+  errno = ENOMEM;
+  return -1;
+}
+
+/* Releases the arena's storage and zeroes *a. */
+void tt_arena_done(tt_arena_t *a) {
+  free(a->table);
+  free(a->headers);
+  free(a->nodes);
+  memset(a, 0, sizeof(*a));
+}
+
+/* Returns node ni to the free chain and removes it from the hash table using
+   backward-shift deletion. The node's children are NOT dropped here: fields
+   a and b stay in place and are dropped lazily by tt_arena_cons when the
+   cell is reused. A leaf's b field is an atom, not a node, so it is cleared
+   first to keep that later drop from treating it as a node index. */
+static void recycle_node(tt_arena_t *a, tt_node_idx_t ni) {
+  tt_hash_t h;
+  int i;
+
+  printf("++++++++++++++++++++++++++++++++++++++++ recycling %d\n", ni);
+
+  assert(ni >= TT_FIRST_VALID_NODE_IDX);
+  /* Hash before touching the header: next_free overlays tag and index. */
+  h = tt_hash_node(a, ni);
+
+  if (a->headers[ni].inuse.tag == TT_TAG_LEAF) {
+    a->nodes[ni].b = TT_ERROR;
+  }
+  a->headers[ni].next_free = a->free_chain;
+  a->free_chain = ni;
+  a->live_count--;
+
+  for (i = 0; i < a->max_probe+1; i++) {
+    unsigned int index = (h + i) % a->table_length;
+    tt_node_idx_t candidate = a->table[index];
+
+    printf("hunting i=%d index=%d ni=%d candidate=%d\n", i, index, ni, candidate);
+    assert(candidate >= TT_FIRST_VALID_NODE_IDX); /* Internal error if node not in table */
+
+    if (candidate == ni) {
+      /* We found it. Shift the following displaced entries back by one
+         slot, stopping at an empty slot or an entry already at home. */
+      while (1) {
+        unsigned int nextindex = (index + 1) % a->table_length;
+        tt_node_idx_t next_n = a->table[nextindex];
+        tt_hash_t next_h;
+        int distance;
+
+        a->table[index] = TT_ERROR;
+
+        if (next_n < TT_FIRST_VALID_NODE_IDX) {
+          break;
+        }
+
+        next_h = tt_hash_node(a, next_n);
+        distance = nextindex - (next_h % a->table_length);
+        if (distance < 0) distance += a->table_length;
+
+        if (distance == 0) {
+          break;
+        }
+
+        a->table[index] = next_n;
+        index = nextindex;
+      }
+      break;
+    }
+  }
+}
+
+/* Hash-consing constructor: returns the index of the node (tag, nindex, na,
+   nb), reusing an existing node when one is already in the table and
+   allocating a cell from the free chain otherwise. Returns TT_ERROR when no
+   cell is available. A newly allocated node starts with refcount 0 and
+   grabs na (and nb unless tag is TT_TAG_LEAF, whose b is an atom). */
+tt_node_idx_t tt_arena_cons(tt_arena_t *a,
+                            uint32_t tag,
+                            uint32_t nindex,
+                            tt_node_idx_t na,
+                            tt_node_idx_t nb)
+{
+  tt_hash_t h = hash(tag, nindex, na, nb);
+  int i;
+
+  /* tag and nindex are stored in 2- and 6-bit fields; larger values would
+     be truncated on store and then never match on lookup. */
+  assert(tag <= TT_TAG_NODE);
+  assert(nindex < 64);
+
+  for (i = 0; i < a->max_probe+1; i++) {
+    unsigned int index = (h + i) % a->table_length;
+    tt_node_idx_t candidate = a->table[index];
+
+    printf("cons at %d candidate %d\n", i, candidate);
+    /* TODO: perhaps also bail early if we detect that the hash code changes */
+    if (candidate < TT_FIRST_VALID_NODE_IDX) {
+      printf("cons empty cell\n");
+      break;
+    }
+
+    printf("tag %d %d\n", a->headers[candidate].inuse.tag, tag);
+    printf("index %d %d\n", a->headers[candidate].inuse.index, nindex);
+    printf("a %d %d\n", a->nodes[candidate].a, na);
+    printf("b %d %d\n", a->nodes[candidate].b, nb);
+
+    if (a->headers[candidate].inuse.tag == tag &&
+        a->headers[candidate].inuse.index == nindex &&
+        a->nodes[candidate].a == na &&
+        a->nodes[candidate].b == nb) {
+      printf("cons located correct candidate\n");
+      return candidate;
+    }
+  }
+
+  printf("cons needs to alloc\n");
+
+  if (a->free_chain == TT_ERROR) {
+    if (tt_grow(a) != 0) {
+      return TT_ERROR;
+    }
+  }
+
+  {
+    tt_node_idx_t node = a->free_chain;
+    tt_node_idx_t tostore = node;
+
+    /* Grab the new children before dropping the stale ones, in case a new
+       child is only being kept alive by the cell we are about to reuse. */
+    tt_grab(a, na);
+    if (tag != TT_TAG_LEAF) tt_grab(a, nb);
+
+    /* Unlink the cell first: the drops below may recycle nodes, which
+       pushes them onto the free chain. */
+    a->free_chain = a->headers[node].next_free;
+    tt_drop(a, a->nodes[node].a);
+    tt_drop(a, a->nodes[node].b);
+    a->live_count++;
+
+    a->headers[node].inuse.refcount = 0;
+    a->headers[node].inuse.tag = tag;
+    a->headers[node].inuse.index = nindex;
+    a->nodes[node].a = na;
+    a->nodes[node].b = nb;
+
+    /* Not found: Robin Hood insertion. Whenever the resident entry is closer
+       to its home slot than the entry being placed, they swap and the
+       displaced resident continues probing. The reserved cells never occupy
+       a slot, so the table always has a free one and the loop terminates. */
+    i = 0;
+    while (1) {
+      unsigned int index = (h + i) % a->table_length;
+      tt_node_idx_t candidate = a->table[index];
+
+      printf("checking robinhood at h %d i %d index %d candidate %d\n", h, i, index, candidate);
+
+      if (i > a->max_probe) {
+        a->max_probe = i;
+      }
+
+      if (candidate < TT_FIRST_VALID_NODE_IDX) {
+        /* This slot in the table is free. */
+        printf("slot free!\n");
+        a->table[index] = tostore;
+        break;
+      }
+
+      printf("slot not free.\n");
+      {
+        tt_hash_t candidate_h = tt_hash_node(a, candidate);
+        int distance = index - (candidate_h % a->table_length);
+        if (distance < 0) distance += a->table_length;
+
+        if (distance < i) {
+          a->table[index] = tostore;
+          h = candidate_h;
+          i = distance + 1;
+          tostore = candidate;
+        } else {
+          /* keep scanning. */
+          i++;
+        }
+      }
+    }
+
+    return node;
+  }
+}
+
+/* Takes a reference to node i and returns i. Reserved indices, and nodes
+   whose count has saturated at TT_REFCOUNT_LIMIT, are left untouched. */
+tt_node_idx_t tt_grab(tt_arena_t *a, tt_node_idx_t i) {
+  if (i >= TT_FIRST_VALID_NODE_IDX && a->headers[i].inuse.refcount < TT_REFCOUNT_LIMIT) {
+    a->headers[i].inuse.refcount++;
+  }
+  return i;
+}
+
+/* Releases one reference to node i and recycles the node when the count
+   reaches zero. Reserved indices, and nodes whose count has saturated at
+   TT_REFCOUNT_LIMIT, are left untouched, so a saturated node is never
+   recycled. */
+void tt_drop(tt_arena_t *a, tt_node_idx_t i) {
+  if (i >= TT_FIRST_VALID_NODE_IDX && a->headers[i].inuse.refcount < TT_REFCOUNT_LIMIT) {
+    printf("++++++++++++++++++++++++++++++ dropping %d\n", i);
+    /* Decrementing a zero count would wrap the 24-bit field to
+       TT_REFCOUNT_LIMIT and pin the node forever; treat it as a caller bug. */
+    assert(a->headers[i].inuse.refcount > 0);
+    if (a->headers[i].inuse.refcount == 0) {
+      return;
+    }
+    if (--(a->headers[i].inuse.refcount) == 0) {
+      recycle_node(a, i);
+    }
+  }
+}
diff --git a/treetrie.h b/treetrie.h
new file mode 100644
index 0000000..2174038
--- /dev/null
+++ b/treetrie.h
@@ -0,0 +1,92 @@
+#ifndef TREETRIE_H_f55a3f6d_ef43_45d3_bec3_496a196b5db1
+#define TREETRIE_H_f55a3f6d_ef43_45d3_bec3_496a196b5db1
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum tt_tag_t {
+  TT_TAG_TAIL = 0,
+  TT_TAG_BRANCH,
+  TT_TAG_LEAF, /* only case where one of node a or b points to non-node */
+  TT_TAG_NODE
+} tt_tag_t;
+
+typedef enum tt_reserved_node_idx_t {
+  TT_ERROR = 0, /* invalid node index, means "no node at all", not even empty */
+  TT_EMPTY, /* empty treetrie */
+  TT_OK, /* terminal marker */
+
+  TT_FIRST_VALID_NODE_IDX
+} tt_reserved_node_idx_t;
+
+typedef uint32_t tt_node_idx_t; /* N.B. tt_reserved_node_idx_t */
+
+typedef uint32_t tt_atom_t;
+
+/* Per-node header. A node on the free chain uses next_free; a live node
+   uses inuse. The two views share storage, so writing one clobbers the
+   other. */
+typedef union tt_header_t {
+  uint32_t next_free;
+  struct {
+    uint32_t refcount : 24;
+    uint32_t index : 6;
+    /* Holds a tt_tag_t. Declared uint32_t because whether an enum-typed
+       bit-field is signed is implementation-defined, and a signed 2-bit
+       field cannot represent TT_TAG_LEAF or TT_TAG_NODE. */
+    uint32_t tag : 2;
+  } inuse;
+} tt_header_t;
+
+#define TT_REFCOUNT_LIMIT ((1 << 24) - 1)
+
+typedef struct tt_node_t {
+  tt_node_idx_t a; /* always a real node idx */
+  tt_node_idx_t b; /* a real node idx unless corresponding tag is TT_TAG_LEAF */
+} tt_node_t;
+
+typedef struct tt_arena_t {
+  /* Fields for the Robin Hood hashset used for hashconsing of tt_nodes */
+  unsigned int max_probe;
+  unsigned int live_count;
+  unsigned int table_length;
+  tt_node_idx_t *table;
+
+  tt_header_t *headers;
+  tt_node_t *nodes;
+
+  tt_node_idx_t free_chain;
+} tt_arena_t;
+
+/* Returns 0 on success, or -1 with errno set to ENOMEM. */
+extern int tt_arena_init(tt_arena_t *a);
+extern void tt_arena_done(tt_arena_t *a);
+
+/* Returns TT_ERROR (0) if consing failed (because of out-of-memory).
+   Otherwise, returns a nonzero index.
+   Grabs na and nb (according to tag) IF it needs to allocate a new node, otherwise does not.
+   DOES NOT increase the reference count of the returned node. */
+extern tt_node_idx_t tt_arena_cons(tt_arena_t *a,
+                                   uint32_t tag,
+                                   uint32_t index,
+                                   tt_node_idx_t na,
+                                   tt_node_idx_t nb);
+
+/* tt_grab takes a reference to node i and returns i; tt_drop releases one
+   and recycles the node when its count reaches zero. Both ignore the
+   reserved indices below TT_FIRST_VALID_NODE_IDX. */
+extern tt_node_idx_t tt_grab(tt_arena_t *a, tt_node_idx_t i);
+extern void tt_drop(tt_arena_t *a, tt_node_idx_t i);
+
+/* WARNING: private, unsafe */
+typedef uint32_t tt_hash_t;
+extern tt_hash_t tt_hash_node(tt_arena_t *a, tt_node_idx_t i);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif