From cb56dbcdb9f3e3d107542844134a15af28160c6f Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 6 Mar 2013 21:08:41 +0000 Subject: [PATCH 01/37] Make buildable without ulib --- demo/customslots.pxd | 4 +-- extensibletype/extensibletype.pyx | 58 +++++++++++++++---------------- setup.py | 9 +++-- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/demo/customslots.pxd b/demo/customslots.pxd index 44140d0..2829dba 100644 --- a/demo/customslots.pxd +++ b/demo/customslots.pxd @@ -5,9 +5,9 @@ cdef extern from "customslots.h": void *pointer Py_ssize_t objoffset uintptr_t flags - + ctypedef struct PyCustomSlot: - uintrptr_t id + uintptr_t id pyx_data data int PyCustomSlots_Check(obj) diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index dd4eed2..ce910ba 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -66,33 +66,33 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): return p, table.base.r, table.base.m_f, table.base.m_g, d -cdef extern from "md5sum.h": - ctypedef struct MD5_CTX: - uint32_t i[2] - uint32_t buf[4] - unsigned char in_ "in"[64] - unsigned char digest[16] - - void MD5Init(MD5_CTX *mdContext) - void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, - unsigned int inLen) - void MD5Final(MD5_CTX *mdContext) - -cdef extern from "hash.h": - uint64_t hash_crapwow64(unsigned char *buf, uint64_t len, uint64_t seed) - -def crapwowbench(int repeat=1): - cdef int r - cdef MD5_CTX ctx - for r in range(repeat): - hash_crapwow64("asdf", 4, 0xf123456781234567) - - -def md5bench(int repeat=1): - cdef int r - cdef MD5_CTX ctx - for r in range(repeat): - MD5Init(&ctx) - MD5Update(&ctx, "asdf", 4) - MD5Final(&ctx) +#cdef extern from "md5sum.h": +# ctypedef struct MD5_CTX: +# uint32_t i[2] +# uint32_t buf[4] +# unsigned char in_ "in"[64] +# unsigned char digest[16] +# +# void MD5Init(MD5_CTX *mdContext) +# void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, +# 
unsigned int inLen) +# void MD5Final(MD5_CTX *mdContext) +# +#cdef extern from "hash.h": +# uint64_t hash_crapwow64(unsigned char *buf, uint64_t len, uint64_t seed) +# +#def crapwowbench(int repeat=1): +# cdef int r +# cdef MD5_CTX ctx +# for r in range(repeat): +# hash_crapwow64("asdf", 4, 0xf123456781234567) +# +# +#def md5bench(int repeat=1): +# cdef int r +# cdef MD5_CTX ctx +# for r in range(repeat): +# MD5Init(&ctx) +# MD5Update(&ctx, "asdf", 4) +# MD5Final(&ctx) diff --git a/setup.py b/setup.py index 435eaf0..5e12b74 100644 --- a/setup.py +++ b/setup.py @@ -3,13 +3,16 @@ from Cython.Distutils import build_ext import os -include_dirs = ['include', '../ulib/src/base'] +import numpy as np + +include_dirs = ['include', '../ulib/src/base', np.get_include()] extensions = [ Extension("extensibletype.extensibletype", [os.path.join("extensibletype", "extensibletype.pyx"), - '../ulib/src/base/md5sum.c', - '../ulib/src/base/hash.c'], + #'../ulib/src/base/md5sum.c', + #'../ulib/src/base/hash.c' + ], include_dirs=include_dirs)] setup(cmdclass={'build_ext': build_ext}, From 77f6611cb347c26df8464ac542853d0338192e69 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 6 Mar 2013 21:32:05 +0000 Subject: [PATCH 02/37] Add string interning mechanism --- extensibletype/intern.pxd | 15 +++++ extensibletype/intern.pyx | 19 +++++++ extensibletype/test/test_interning.py | 25 +++++++++ include/globalinterning.h | 80 +++++++++++++++++++++++++++ include/interning.h | 75 +++++++++++++++++++++++++ setup.py | 8 ++- 6 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 extensibletype/intern.pxd create mode 100644 extensibletype/intern.pyx create mode 100644 extensibletype/test/test_interning.py create mode 100644 include/globalinterning.h create mode 100644 include/interning.h diff --git a/extensibletype/intern.pxd b/extensibletype/intern.pxd new file mode 100644 index 0000000..e277b84 --- /dev/null +++ b/extensibletype/intern.pxd @@ -0,0 +1,15 @@ +cdef extern from 
"Python.h": + ctypedef unsigned int Py_uintptr_t + +cdef extern from *: + ctypedef char *string_t "const char *" + +cdef extern from "globalinterning.h": + ctypedef void *intern_table_t + + intern_table_t intern_create_table() except NULL + void intern_destroy_table(intern_table_t table) + string_t intern_key(intern_table_t table, string_t key) except NULL + + int PyIntern_Initialize() except -1 + string_t PyIntern_AddKey(string_t key) except NULL diff --git a/extensibletype/intern.pyx b/extensibletype/intern.pyx new file mode 100644 index 0000000..dd88a21 --- /dev/null +++ b/extensibletype/intern.pyx @@ -0,0 +1,19 @@ +def global_intern(bytes key): + return PyIntern_AddKey(key) + +def global_intern_initialize(): + PyIntern_Initialize() + +cdef class InternTable(object): + "Wrap intern tables (intern_table_t)" + + cdef intern_table_t table + + def __init__(self): + self.table = intern_create_table() + + def __dealloc__(self): + intern_destroy_table(self.table) + + def intern(self, bytes key): + return intern_key(self.table, key) diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py new file mode 100644 index 0000000..d754de1 --- /dev/null +++ b/extensibletype/test/test_interning.py @@ -0,0 +1,25 @@ +from .. 
import intern + +def test_global_interning(): + try: + intern.global_intern("hello") + except AssertionError, e: + pass + else: + raise Exception("Expects complaint about uninitialized table") + + intern.global_intern_initialize() + id1 = intern.global_intern("hello") + id2 = intern.global_intern("hello") + id3 = intern.global_intern("hallo") + assert id1 == id2 + assert id1 != id3 + +def test_interning(): + table = intern.InternTable() + + id1 = intern.global_intern("hello") + id2 = intern.global_intern("hello") + id3 = intern.global_intern("hallo") + assert id1 == id2 + assert id1 != id3 diff --git a/include/globalinterning.h b/include/globalinterning.h new file mode 100644 index 0000000..83e0352 --- /dev/null +++ b/include/globalinterning.h @@ -0,0 +1,80 @@ +#ifndef Py_GLOBAL_INTERN_H +#define Py_GLOBAL_INTERN_H_ +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "interning.h" + +static const char *_table_name = "_global_table_v1"; +static intern_table_t _global_table = NULL; + +/* Interning API */ +/* Uses functions so we can get the address (and make it + accessible from FFIs) */ + +/* Get an interned pointer to a key (a string). + Returns NULL on error with an exception set. 
*/ +static const char * +PyIntern_AddKey(const char *key) +{ + if (_global_table == NULL) { + PyErr_SetString(PyExc_AssertionError, + "Intern table not set, did you call PyIntern_Initialize()?"); + return NULL; + } + return intern_key(_global_table, key); +} + +/* Intialize global interning table */ +static int +PyIntern_Initialize(void) { + PyObject *module = NULL; + PyObject *table = NULL; + int retval; + + if (_global_table != NULL) { + return 0; + } + + module = PyImport_AddModule("_global_interning"); /* borrowed ref */ + if (!module) + goto bad; + + if (PyObject_HasAttrString(module, _table_name)) { + table = PyObject_GetAttrString(module, _table_name); + if (!table) + goto bad; + + if (!PyDict_Check(table)) + PyErr_SetString(PyExc_TypeError, "Intern table is not a dict"); + } else { + /* not found; create it */ + table = (PyObject *) intern_create_table(); + if (table == NULL) + goto bad; + + if (PyObject_SetAttrString(module, _table_name, table) < 0) + goto bad; + } + + /* Initialize the global variable used in macros */ + _global_table = table; + + retval = 0; + goto ret; + bad: + retval = -1; + ret: + /* module is borrowed */ + Py_XDECREF(table); + return retval; +} + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_GLOBAL_INTERN_H */ diff --git a/include/interning.h b/include/interning.h new file mode 100644 index 0000000..decf114 --- /dev/null +++ b/include/interning.h @@ -0,0 +1,75 @@ +#ifndef Py_INTERNING_H +#define Py_INTERNING_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Utility for interning strings */ +/* TODO: make it GIL-less and Python independent */ + +#include + +#if PY_MAJOR_VERSION < 3 + #define _PyIntern_FromString PyString_FromString + #define _PyIntern_FromStringAndSize PyString_FromStringAndSize + #define _PyIntern_AsString PyString_AsString +#else + #define _PyIntern_FromString PyBytes_FromString + #define _PyIntern_FromStringAndSize PyBytes_FromStringAndSize + #define _PyIntern_AsString PyBytes_AsString +#endif + + +typedef void 
*intern_table_t; + +static intern_table_t +intern_create_table(void) +{ + /* { string -> interned_string } */ + PyObject *table = PyDict_New(); + return (intern_table_t) table; +} + +static void +intern_destroy_table(intern_table_t table) +{ + Py_DECREF((PyObject *) table); +} + +static const char * +_intern_key(intern_table_t table, PyObject *key) +{ + PyObject *dict = (PyObject *) table; + PyObject *value; + + value = PyDict_GetItem(dict, key); + + if (value == NULL) { + /* Key not in dict */ + value = key; + PyDict_SetItem(dict, key, value); + } + + return _PyIntern_AsString(value); +} + +static const char * +intern_key(intern_table_t table, const char *key) +{ + PyObject *key_obj = _PyIntern_FromString(key); + const char *retval; + + if (key_obj == NULL) + return NULL; + + retval = _intern_key(table, key_obj); + + Py_DECREF(key_obj); + return retval; +} + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNING_H */ diff --git a/setup.py b/setup.py index 5e12b74..17a4fb6 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,13 @@ #'../ulib/src/base/md5sum.c', #'../ulib/src/base/hash.c' ], - include_dirs=include_dirs)] + include_dirs=include_dirs), + Extension("extensibletype.intern", + ["extensibletype/intern.pyx"], + include_dirs=include_dirs, + depends=["include/globalinterning.h", + "include/interning.h"]), +] setup(cmdclass={'build_ext': build_ext}, ext_modules=extensions) From 2c742df0772e985f1420b2e72a9f6b2c155fce1e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 7 Mar 2013 22:42:31 +0000 Subject: [PATCH 03/37] Add python/cython-level way to build function hashtable --- extensibletype/extensibletype.pyx | 106 +++++++++++++++++++++ extensibletype/test/test_perfecthashing.py | 30 +++--- 2 files changed, 123 insertions(+), 13 deletions(-) diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index ce910ba..69fb664 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -1,6 +1,11 @@ 
+cimport stdlib cimport numpy as cnp import numpy as np +import hashlib + +from . import intern + cdef extern from "stdint.h": ctypedef unsigned int uint32_t ctypedef unsigned long long uint64_t @@ -66,6 +71,107 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): return p, table.base.r, table.base.m_f, table.base.m_g, d +def roundup(x): + "Round up to a power of two" + x -= 1 + x |= x >> 1 + x |= x >> 2 + x |= x >> 4 + x |= x >> 8 + x |= x >> 16 + x |= x >> 32 + x += 1 + return x + +cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: + cdef PyCustomSlots_Table *table + + size = roundup(size) + + table = stdlib.malloc( + sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * size + + sizeof(PyCustomSlots_Entry) * size) + + if table == NULL: + raise MemoryError + + table.n = size + table.b = size + table.flags = 0 + + table.entries = (( &table[1]) + + size * sizeof(uint16_t)) + + return table + +cdef class PerfectHashMethodTable(object): + """ + Simple wrapper for hash-based virtual method tables. 
+ """ + + cdef PyCustomSlots_Table *table + cdef uint16_t *displacements + + def __init__(self, n, ids, flags, funcs): + cdef Py_ssize_t i + cdef cnp.ndarray[uint64_t] hashes + + self.table = allocate_hash_table(n) + self.displacements = ( self.table + + sizeof(PyCustomSlots_Table)) + + hashes = np.empty(n, dtype=np.uint64) + + intern.global_intern_initialize() + + # Initialize hash table entries, build hash ids + for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)): + id = intern.global_intern(signature) + + self.table.entries[i].id = id + self.table.entries[i].flags = flag + self.table.entries[i].ptr = func + + hashes[i] = self.hash(signature) + + # Perfect hash our table + PyCustomSlots_PerfectHash(self.table, &hashes[0]) + + def hash(self, signature): + cdef uint64_t hashvalue + cdef bytes md5 = hashlib.md5(signature).digest() + + (&hashvalue)[0] = ( md5)[0] + return hashvalue + + def find_method(self, signature): + """ + Find method of the given signature. Use from non-performance + critical code. 
+ """ + cdef uintptr_t id = intern.global_intern(signature) + cdef uint64_t prehash = self.hash(signature) + + cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ + self.displacements[prehash & self.table.m_g]) + + assert 0 <= idx < self.size + + if self.table.entries[idx].id != id: + return None + + return ( self.table.entries[idx].ptr, + self.table.entries[idx].flags) + + property table_ptr: + def __get__(self): + return self.table + + property size: + def __get__(self): + return self.table.n + + #cdef extern from "md5sum.h": # ctypedef struct MD5_CTX: # uint32_t i[2] diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index c8a11aa..bf54aab 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -8,17 +8,6 @@ def draw_hashes(rng, nitems): hashes |= rng.randint(2**32, size=nitems).astype(np.uint64) return hashes -def roundup(x): - x -= 1 - x |= x >> 1 - x |= x >> 2 - x |= x >> 4 - x |= x >> 8 - x |= x >> 16 - x |= x >> 32 - x += 1 - return x - def test_binsort(): nbins = 64 p = np.zeros(nbins, dtype=np.uint16) @@ -32,11 +21,26 @@ def test_binsort(): def test_basic(): n=64 prehashes = draw_hashes(np.random, n) - p, r, m_f, m_g, d = extensibletype.perfect_hash(prehashes, repeat=10**6) + assert len(prehashes) == len(set(prehashes)) + p, r, m_f, m_g, d = extensibletype.perfect_hash(prehashes, repeat=10**5) hashes = ((prehashes >> r) & m_f) ^ d[prehashes & m_g] print p print d hashes.sort() print hashes assert len(hashes) == len(np.unique(hashes)) - + +def test_methodtable(): + ids = ["ff->f", "dd->d", "ii->i", "ll->l", "OO->O"] + flags = range(1, len(ids) + 1) + funcs = range(len(ids)) + + methodtable = extensibletype.PerfectHashMethodTable( + len(ids), ids, flags, funcs) + + for (signature, flag, func) in zip(ids, flags, funcs): + result = methodtable.find_method(signature) + assert result is not None + got_func, got_flag = result + assert func == 
got_func, (func, got_func) + assert flag == got_flag, (flag, got_flag) From 71f542d5686d920ce64e498d9c1831b2211c14d8 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 8 Mar 2013 16:00:35 +0000 Subject: [PATCH 04/37] Add siphash --- include/siphash24.c | 243 ++++++++++++++++++++++++++++++++++++++++++++ include/siphash24.h | 12 +++ 2 files changed, 255 insertions(+) create mode 100644 include/siphash24.c create mode 100644 include/siphash24.h diff --git a/include/siphash24.c b/include/siphash24.c new file mode 100644 index 0000000..981bd31 --- /dev/null +++ b/include/siphash24.c @@ -0,0 +1,243 @@ +/* + SipHash reference C implementation + + Written in 2012 by + Jean-Philippe Aumasson + Daniel J. Bernstein + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#include +#include +#include +typedef uint64_t u64; +typedef uint32_t u32; +typedef uint8_t u8; + +#define ROTL(x,b) (u64)( ((x) << (b)) | ( (x) >> (64 - (b))) ) + +#define U32TO8_LE(p, v) \ + (p)[0] = (u8)((v) ); (p)[1] = (u8)((v) >> 8); \ + (p)[2] = (u8)((v) >> 16); (p)[3] = (u8)((v) >> 24); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (u32)((v) )); \ + U32TO8_LE((p) + 4, (u32)((v) >> 32)); + +#define U8TO64_LE(p) \ + (((u64)((p)[0]) ) | \ + ((u64)((p)[1]) << 8) | \ + ((u64)((p)[2]) << 16) | \ + ((u64)((p)[3]) << 24) | \ + ((u64)((p)[4]) << 32) | \ + ((u64)((p)[5]) << 40) | \ + ((u64)((p)[6]) << 48) | \ + ((u64)((p)[7]) << 56)) + +#define SIPROUND \ + do { \ + v0 += v1; v1=ROTL(v1,13); v1 ^= v0; v0=ROTL(v0,32); \ + v2 += v3; v3=ROTL(v3,16); v3 ^= v2; \ + v0 += v3; v3=ROTL(v3,21); v3 ^= v0; \ + v2 += v1; v1=ROTL(v1,17); v1 ^= v2; v2=ROTL(v2,32); \ + } while(0) + +/* SipHash-2-4 */ +int crypto_auth( unsigned char *out, const unsigned char *in, unsigned long long inlen, const unsigned char *k ) +{ + /* "somepseudorandomlygeneratedbytes" */ + u64 v0 = 0x736f6d6570736575ULL; + u64 v1 = 0x646f72616e646f6dULL; + u64 v2 = 0x6c7967656e657261ULL; + u64 v3 = 0x7465646279746573ULL; + u64 b; + u64 k0 = U8TO64_LE( k ); + u64 k1 = U8TO64_LE( k + 8 ); + u64 m; + const u8 *end = in + inlen - ( inlen % sizeof( u64 ) ); + const int left = inlen & 7; + b = ( ( u64 )inlen ) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for ( ; in != end; in += 8 ) + { + m = U8TO64_LE( in ); +#ifdef DEBUG + printf( "(%3d) v0 %08x %08x\n", ( int )inlen, ( u32 )( v0 >> 32 ), ( u32 )v0 ); + printf( "(%3d) v1 %08x %08x\n", ( int )inlen, ( u32 )( v1 >> 32 ), ( u32 )v1 ); + printf( "(%3d) v2 %08x %08x\n", ( int )inlen, ( u32 )( v2 >> 32 ), ( u32 )v2 ); + printf( "(%3d) v3 %08x %08x\n", ( int )inlen, ( u32 )( v3 >> 32 ), ( u32 )v3 ); + printf( "(%3d) compress %08x %08x\n", ( int )inlen, ( u32 )( m >> 32 ), ( u32 )m ); +#endif + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } + + switch( 
left ) + { + case 7: b |= ( ( u64 )in[ 6] ) << 48; + + case 6: b |= ( ( u64 )in[ 5] ) << 40; + + case 5: b |= ( ( u64 )in[ 4] ) << 32; + + case 4: b |= ( ( u64 )in[ 3] ) << 24; + + case 3: b |= ( ( u64 )in[ 2] ) << 16; + + case 2: b |= ( ( u64 )in[ 1] ) << 8; + + case 1: b |= ( ( u64 )in[ 0] ); break; + + case 0: break; + } + +#ifdef DEBUG + printf( "(%3d) v0 %08x %08x\n", ( int )inlen, ( u32 )( v0 >> 32 ), ( u32 )v0 ); + printf( "(%3d) v1 %08x %08x\n", ( int )inlen, ( u32 )( v1 >> 32 ), ( u32 )v1 ); + printf( "(%3d) v2 %08x %08x\n", ( int )inlen, ( u32 )( v2 >> 32 ), ( u32 )v2 ); + printf( "(%3d) v3 %08x %08x\n", ( int )inlen, ( u32 )( v3 >> 32 ), ( u32 )v3 ); + printf( "(%3d) padding %08x %08x\n", ( int )inlen, ( u32 )( b >> 32 ), ( u32 )b ); +#endif + v3 ^= b; + SIPROUND; + SIPROUND; + v0 ^= b; +#ifdef DEBUG + printf( "(%3d) v0 %08x %08x\n", ( int )inlen, ( u32 )( v0 >> 32 ), ( u32 )v0 ); + printf( "(%3d) v1 %08x %08x\n", ( int )inlen, ( u32 )( v1 >> 32 ), ( u32 )v1 ); + printf( "(%3d) v2 %08x %08x\n", ( int )inlen, ( u32 )( v2 >> 32 ), ( u32 )v2 ); + printf( "(%3d) v3 %08x %08x\n", ( int )inlen, ( u32 )( v3 >> 32 ), ( u32 )v3 ); +#endif + v2 ^= 0xff; + SIPROUND; + SIPROUND; + SIPROUND; + SIPROUND; + b = v0 ^ v1 ^ v2 ^ v3; + U64TO8_LE( out, b ); + return 0; +} + +/* + SipHash-2-4 output with + k = 00 01 02 ... + and + in = (empty string) + in = 00 (1 byte) + in = 00 01 (2 bytes) + in = 00 01 02 (3 bytes) + ... + in = 00 01 02 ... 
3e (63 bytes) +*/ +u8 vectors[64][8] = +{ + { 0x31, 0x0e, 0x0e, 0xdd, 0x47, 0xdb, 0x6f, 0x72, }, + { 0xfd, 0x67, 0xdc, 0x93, 0xc5, 0x39, 0xf8, 0x74, }, + { 0x5a, 0x4f, 0xa9, 0xd9, 0x09, 0x80, 0x6c, 0x0d, }, + { 0x2d, 0x7e, 0xfb, 0xd7, 0x96, 0x66, 0x67, 0x85, }, + { 0xb7, 0x87, 0x71, 0x27, 0xe0, 0x94, 0x27, 0xcf, }, + { 0x8d, 0xa6, 0x99, 0xcd, 0x64, 0x55, 0x76, 0x18, }, + { 0xce, 0xe3, 0xfe, 0x58, 0x6e, 0x46, 0xc9, 0xcb, }, + { 0x37, 0xd1, 0x01, 0x8b, 0xf5, 0x00, 0x02, 0xab, }, + { 0x62, 0x24, 0x93, 0x9a, 0x79, 0xf5, 0xf5, 0x93, }, + { 0xb0, 0xe4, 0xa9, 0x0b, 0xdf, 0x82, 0x00, 0x9e, }, + { 0xf3, 0xb9, 0xdd, 0x94, 0xc5, 0xbb, 0x5d, 0x7a, }, + { 0xa7, 0xad, 0x6b, 0x22, 0x46, 0x2f, 0xb3, 0xf4, }, + { 0xfb, 0xe5, 0x0e, 0x86, 0xbc, 0x8f, 0x1e, 0x75, }, + { 0x90, 0x3d, 0x84, 0xc0, 0x27, 0x56, 0xea, 0x14, }, + { 0xee, 0xf2, 0x7a, 0x8e, 0x90, 0xca, 0x23, 0xf7, }, + { 0xe5, 0x45, 0xbe, 0x49, 0x61, 0xca, 0x29, 0xa1, }, + { 0xdb, 0x9b, 0xc2, 0x57, 0x7f, 0xcc, 0x2a, 0x3f, }, + { 0x94, 0x47, 0xbe, 0x2c, 0xf5, 0xe9, 0x9a, 0x69, }, + { 0x9c, 0xd3, 0x8d, 0x96, 0xf0, 0xb3, 0xc1, 0x4b, }, + { 0xbd, 0x61, 0x79, 0xa7, 0x1d, 0xc9, 0x6d, 0xbb, }, + { 0x98, 0xee, 0xa2, 0x1a, 0xf2, 0x5c, 0xd6, 0xbe, }, + { 0xc7, 0x67, 0x3b, 0x2e, 0xb0, 0xcb, 0xf2, 0xd0, }, + { 0x88, 0x3e, 0xa3, 0xe3, 0x95, 0x67, 0x53, 0x93, }, + { 0xc8, 0xce, 0x5c, 0xcd, 0x8c, 0x03, 0x0c, 0xa8, }, + { 0x94, 0xaf, 0x49, 0xf6, 0xc6, 0x50, 0xad, 0xb8, }, + { 0xea, 0xb8, 0x85, 0x8a, 0xde, 0x92, 0xe1, 0xbc, }, + { 0xf3, 0x15, 0xbb, 0x5b, 0xb8, 0x35, 0xd8, 0x17, }, + { 0xad, 0xcf, 0x6b, 0x07, 0x63, 0x61, 0x2e, 0x2f, }, + { 0xa5, 0xc9, 0x1d, 0xa7, 0xac, 0xaa, 0x4d, 0xde, }, + { 0x71, 0x65, 0x95, 0x87, 0x66, 0x50, 0xa2, 0xa6, }, + { 0x28, 0xef, 0x49, 0x5c, 0x53, 0xa3, 0x87, 0xad, }, + { 0x42, 0xc3, 0x41, 0xd8, 0xfa, 0x92, 0xd8, 0x32, }, + { 0xce, 0x7c, 0xf2, 0x72, 0x2f, 0x51, 0x27, 0x71, }, + { 0xe3, 0x78, 0x59, 0xf9, 0x46, 0x23, 0xf3, 0xa7, }, + { 0x38, 0x12, 0x05, 0xbb, 0x1a, 0xb0, 0xe0, 0x12, }, + { 0xae, 0x97, 0xa1, 0x0f, 
0xd4, 0x34, 0xe0, 0x15, }, + { 0xb4, 0xa3, 0x15, 0x08, 0xbe, 0xff, 0x4d, 0x31, }, + { 0x81, 0x39, 0x62, 0x29, 0xf0, 0x90, 0x79, 0x02, }, + { 0x4d, 0x0c, 0xf4, 0x9e, 0xe5, 0xd4, 0xdc, 0xca, }, + { 0x5c, 0x73, 0x33, 0x6a, 0x76, 0xd8, 0xbf, 0x9a, }, + { 0xd0, 0xa7, 0x04, 0x53, 0x6b, 0xa9, 0x3e, 0x0e, }, + { 0x92, 0x59, 0x58, 0xfc, 0xd6, 0x42, 0x0c, 0xad, }, + { 0xa9, 0x15, 0xc2, 0x9b, 0xc8, 0x06, 0x73, 0x18, }, + { 0x95, 0x2b, 0x79, 0xf3, 0xbc, 0x0a, 0xa6, 0xd4, }, + { 0xf2, 0x1d, 0xf2, 0xe4, 0x1d, 0x45, 0x35, 0xf9, }, + { 0x87, 0x57, 0x75, 0x19, 0x04, 0x8f, 0x53, 0xa9, }, + { 0x10, 0xa5, 0x6c, 0xf5, 0xdf, 0xcd, 0x9a, 0xdb, }, + { 0xeb, 0x75, 0x09, 0x5c, 0xcd, 0x98, 0x6c, 0xd0, }, + { 0x51, 0xa9, 0xcb, 0x9e, 0xcb, 0xa3, 0x12, 0xe6, }, + { 0x96, 0xaf, 0xad, 0xfc, 0x2c, 0xe6, 0x66, 0xc7, }, + { 0x72, 0xfe, 0x52, 0x97, 0x5a, 0x43, 0x64, 0xee, }, + { 0x5a, 0x16, 0x45, 0xb2, 0x76, 0xd5, 0x92, 0xa1, }, + { 0xb2, 0x74, 0xcb, 0x8e, 0xbf, 0x87, 0x87, 0x0a, }, + { 0x6f, 0x9b, 0xb4, 0x20, 0x3d, 0xe7, 0xb3, 0x81, }, + { 0xea, 0xec, 0xb2, 0xa3, 0x0b, 0x22, 0xa8, 0x7f, }, + { 0x99, 0x24, 0xa4, 0x3c, 0xc1, 0x31, 0x57, 0x24, }, + { 0xbd, 0x83, 0x8d, 0x3a, 0xaf, 0xbf, 0x8d, 0xb7, }, + { 0x0b, 0x1a, 0x2a, 0x32, 0x65, 0xd5, 0x1a, 0xea, }, + { 0x13, 0x50, 0x79, 0xa3, 0x23, 0x1c, 0xe6, 0x60, }, + { 0x93, 0x2b, 0x28, 0x46, 0xe4, 0xd7, 0x06, 0x66, }, + { 0xe1, 0x91, 0x5f, 0x5c, 0xb1, 0xec, 0xa4, 0x6c, }, + { 0xf3, 0x25, 0x96, 0x5c, 0xa1, 0x6d, 0x62, 0x9f, }, + { 0x57, 0x5f, 0xf2, 0x8e, 0x60, 0x38, 0x1b, 0xe5, }, + { 0x72, 0x45, 0x06, 0xeb, 0x4c, 0x32, 0x8a, 0x95, } +}; + + +/* +int test_vectors() +{ +#define MAXLEN 64 + u8 in[MAXLEN], out[8], k[16]; + int i; + int ok = 1; + + for( i = 0; i < 16; ++i ) k[i] = i; + + for( i = 0; i < MAXLEN; ++i ) + { + in[i] = i; + crypto_auth( out, in, i, k ); + + if ( memcmp( out, vectors[i], 8 ) ) + { + printf( "test vector failed for %d bytes\n", i ); + ok = 0; + } + } + + return ok; +} + +int main() +{ + if ( test_vectors() ) printf( "test vectors ok\n" 
); + + return 0; +} +*/ diff --git a/include/siphash24.h b/include/siphash24.h new file mode 100644 index 0000000..627d46e --- /dev/null +++ b/include/siphash24.h @@ -0,0 +1,12 @@ +#ifndef SIPHASH_H +#define SIPHASH_H +#ifdef __cplusplus +extern "C" { +#endif + +#include "siphash24.c" + +#ifdef __cplusplus +} +#endif +#endif /* !SIPHASH_H */ From 9d41ebe7cc86b4bac75fd11391d6d238ef1f288d Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 12:37:03 +0000 Subject: [PATCH 05/37] Use siphash key interning scheme --- extensibletype/extensibletype.pxd | 28 +++++ extensibletype/extensibletype.pyx | 26 ---- extensibletype/intern.pxd | 10 +- extensibletype/intern.pyx | 7 +- include/globalinterning.h | 64 +++++++--- include/interning.h | 200 +++++++++++++++++++++++++++--- setup.py | 6 +- 7 files changed, 271 insertions(+), 70 deletions(-) create mode 100644 extensibletype/extensibletype.pxd diff --git a/extensibletype/extensibletype.pxd b/extensibletype/extensibletype.pxd new file mode 100644 index 0000000..6a4a4f4 --- /dev/null +++ b/extensibletype/extensibletype.pxd @@ -0,0 +1,28 @@ +cdef extern from "stdint.h": + ctypedef unsigned int uint32_t + ctypedef unsigned long long uint64_t + ctypedef unsigned short uint16_t + ctypedef unsigned char uint8_t + ctypedef uint64_t uintptr_t + +cdef extern from "perfecthash.h": + ctypedef struct PyCustomSlots_Entry: + char *id + uintptr_t flags + void *ptr + + ctypedef struct PyCustomSlots_Table: + uint64_t flags + uint64_t m_f, m_g + PyCustomSlots_Entry *entries + uint16_t n, b + uint8_t r + + ctypedef struct PyCustomSlots_Table_64_64: + PyCustomSlots_Table base + uint16_t d[64] + PyCustomSlots_Entry entries_mem[64] + + + int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) + diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index 69fb664..3942457 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -6,33 +6,7 @@ import hashlib 
from . import intern -cdef extern from "stdint.h": - ctypedef unsigned int uint32_t - ctypedef unsigned long long uint64_t - ctypedef unsigned short uint16_t - ctypedef unsigned char uint8_t - ctypedef uint64_t uintptr_t - cdef extern from "perfecthash.h": - ctypedef struct PyCustomSlots_Entry: - char *id - uintptr_t flags - void *ptr - - ctypedef struct PyCustomSlots_Table: - uint64_t flags - uint64_t m_f, m_g - PyCustomSlots_Entry *entries - uint16_t n, b - uint8_t r - - ctypedef struct PyCustomSlots_Table_64_64: - PyCustomSlots_Table base - uint16_t d[64] - PyCustomSlots_Entry entries_mem[64] - - - int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) void _PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes, uint8_t *number_of_bins_by_size) diff --git a/extensibletype/intern.pxd b/extensibletype/intern.pxd index e277b84..2128b68 100644 --- a/extensibletype/intern.pxd +++ b/extensibletype/intern.pxd @@ -1,3 +1,5 @@ +from extensibletype cimport * + cdef extern from "Python.h": ctypedef unsigned int Py_uintptr_t @@ -7,9 +9,9 @@ cdef extern from *: cdef extern from "globalinterning.h": ctypedef void *intern_table_t - intern_table_t intern_create_table() except NULL - void intern_destroy_table(intern_table_t table) - string_t intern_key(intern_table_t table, string_t key) except NULL + intern_table_t *intern_create_table(intern_table_t *table) except NULL + void intern_destroy_table(intern_table_t *table) + uint64_t intern_key(intern_table_t *table, string_t key) except? 0 int PyIntern_Initialize() except -1 - string_t PyIntern_AddKey(string_t key) except NULL + uint64_t PyIntern_AddKey(string_t key) except? 
0 diff --git a/extensibletype/intern.pyx b/extensibletype/intern.pyx index dd88a21..fad7d0a 100644 --- a/extensibletype/intern.pyx +++ b/extensibletype/intern.pyx @@ -1,5 +1,5 @@ def global_intern(bytes key): - return PyIntern_AddKey(key) + return PyIntern_AddKey(key) def global_intern_initialize(): PyIntern_Initialize() @@ -7,10 +7,11 @@ def global_intern_initialize(): cdef class InternTable(object): "Wrap intern tables (intern_table_t)" - cdef intern_table_t table + cdef intern_table_t _table + cdef intern_table_t *table def __init__(self): - self.table = intern_create_table() + self.table = intern_create_table(&self._table) def __dealloc__(self): intern_destroy_table(self.table) diff --git a/include/globalinterning.h b/include/globalinterning.h index 83e0352..c1d499e 100644 --- a/include/globalinterning.h +++ b/include/globalinterning.h @@ -9,30 +9,62 @@ extern "C" { #include "interning.h" static const char *_table_name = "_global_table_v1"; -static intern_table_t _global_table = NULL; +static intern_table_t _global_intern_table; +static intern_table_t *_global_table = NULL; /* Interning API */ + /* Uses functions so we can get the address (and make it accessible from FFIs) */ -/* Get an interned pointer to a key (a string). - Returns NULL on error with an exception set. */ -static const char * +/* Get a unique prehash for a signature string. + Returns 0 on error with an exception set ('except? 0'). 
*/ +static uint64_t PyIntern_AddKey(const char *key) { if (_global_table == NULL) { PyErr_SetString(PyExc_AssertionError, "Intern table not set, did you call PyIntern_Initialize()?"); - return NULL; + return 0; } + return intern_key(_global_table, key); } +static PyObject * +capsule_create(void *p, const char *sig) +{ + PyObject *capsule; + +#if PY_VERSION_HEX >= 0x02070000 && !(PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION == 0) + capsule = PyCapsule_New(p, sig, NULL); +#else + capsule = PyCObject_FromVoidPtr(p, NULL); +#endif + + return capsule; +} + +static void * +capsule_getpointer(PyObject *capsule, const char *sig) +{ + void *cobj; + +#if PY_VERSION_HEX >= 0x02070000 && !(PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION == 0) + cobj = PyCapsule_GetPointer(capsule, sig); +#else + cobj = PyCObject_AsVoidPtr(p); +#endif + + return cobj; +} + /* Intialize global interning table */ static int PyIntern_Initialize(void) { PyObject *module = NULL; - PyObject *table = NULL; + intern_table_t *table = NULL; + PyObject *capsule = NULL; int retval; if (_global_table != NULL) { @@ -44,19 +76,21 @@ PyIntern_Initialize(void) { goto bad; if (PyObject_HasAttrString(module, _table_name)) { - table = PyObject_GetAttrString(module, _table_name); - if (!table) + capsule = PyObject_GetAttrString(module, _table_name); + if (!capsule) goto bad; - if (!PyDict_Check(table)) - PyErr_SetString(PyExc_TypeError, "Intern table is not a dict"); + table = capsule_getpointer(capsule, "_intern_table"); + if (!table) + goto bad; } else { /* not found; create it */ - table = (PyObject *) intern_create_table(); + table = intern_create_table(&_global_intern_table); if (table == NULL) goto bad; - if (PyObject_SetAttrString(module, _table_name, table) < 0) + capsule = capsule_create(table, "_intern_table"); + if (PyObject_SetAttrString(module, _table_name, capsule) < 0) goto bad; } @@ -65,11 +99,11 @@ PyIntern_Initialize(void) { retval = 0; goto ret; - bad: +bad: retval = -1; - ret: +ret: /* module is borrowed 
*/ - Py_XDECREF(table); + Py_XDECREF(capsule); return retval; } diff --git a/include/interning.h b/include/interning.h index decf114..90b254f 100644 --- a/include/interning.h +++ b/include/interning.h @@ -8,61 +8,221 @@ extern "C" { /* TODO: make it GIL-less and Python independent */ #include +#include +#include "siphash24.h" #if PY_MAJOR_VERSION < 3 #define _PyIntern_FromString PyString_FromString #define _PyIntern_FromStringAndSize PyString_FromStringAndSize #define _PyIntern_AsString PyString_AsString + #define _PyIntern_Size PyString_Size #else #define _PyIntern_FromString PyBytes_FromString #define _PyIntern_FromStringAndSize PyBytes_FromStringAndSize #define _PyIntern_AsString PyBytes_AsString + #define _PyIntern_Size PyBytes_Size #endif +/* Data types */ -typedef void *intern_table_t; +typedef struct _intern_table_t { + PyObject *signature_to_key; + PyObject *key_to_signature; + char secrets[16*4]; /* 4 secret keys, which we try in succession */ +} intern_table_t; -static intern_table_t -intern_create_table(void) +/* Prototypes */ +static void intern_destroy_table(intern_table_t *table); + +/* API */ + +/* Create an intern table from preallocated memory. + Returns NULL on failure with an appropriate exception set. 
*/ +static intern_table_t * +intern_create_table(intern_table_t *table) { - /* { string -> interned_string } */ - PyObject *table = PyDict_New(); - return (intern_table_t) table; + int i, randval; + + table->signature_to_key = NULL; + table->key_to_signature = NULL; + + table->signature_to_key = PyDict_New(); + table->key_to_signature = PyDict_New(); + + if (!table->signature_to_key || !table->key_to_signature) + goto bad; + + for (i = 0; i < 16 * 4; i+=2) { + randval = rand(); /* TODO: use a better prng */ + + /* Take the lower two bytes from the random value, since + RAND_MAX is at least 2**16 */ + table->secrets[i + 0] = ((char *) &randval)[sizeof(int) - 2]; + table->secrets[i + 1] = ((char *) &randval)[sizeof(int) - 1]; + } + + return table; +bad: + intern_destroy_table(table); + return NULL; } static void -intern_destroy_table(intern_table_t table) +intern_destroy_table(intern_table_t *table) +{ + Py_CLEAR(table->signature_to_key); + Py_CLEAR(table->key_to_signature); +} + +/* + Update table with a prehash candidate. + + Returns -1 on error, 0 on duplicate prehash, 1 on success. + */ +static int +update_table(intern_table_t *table, PyObject *key_obj, uint64_t prehash) +{ + PyObject *value; + int retcode; + int result; + + /* TODO: Py_LONG_LONG may not be 64 bits... 
*/ + #if PY_ULLONG_MAX < 0xffffffffffffffffULL + #error "sizeof(unsigned PY_LONG_LONG) must be at least 8 bytes" + #endif + + value = PyLong_FromUnsignedLongLong(prehash); + if (!value) + goto bad; + + /* See whether we already have this hash for a different signature string */ + result = PyDict_Contains(table->key_to_signature, value); + if (result != 0) { + if (result == -1) + goto bad; + else + goto duplicate; + } + + if (PyDict_SetItem(table->signature_to_key, key_obj, value) < 0) + goto bad; + + if (PyDict_SetItem(table->key_to_signature, value, key_obj) < 0) { + PyDict_DelItem(table->signature_to_key, key_obj); + goto bad; + } + + retcode = 1; + goto done; + +bad: + retcode = -1; + +duplicate: + retcode = 0; + +done: + Py_XDECREF(value); + return retcode; +} + +/* Build prehash using siphash given the signature string and a secret key */ +static uint64_t +_intern_build_key(PyObject *key_obj, const char *key, const char *secret) { - Py_DECREF((PyObject *) table); + Py_ssize_t len = _PyIntern_Size(key_obj); + uint64_t prehash; + (void) crypto_auth((unsigned char *) &prehash, + (const unsigned char *) key, + len, + (const unsigned char *) secret); + return prehash; } -static const char * -_intern_key(intern_table_t table, PyObject *key) +/* Make a prehash for a signature string, trying different secret keys in + succession. 
*/ +static int +make_prehash(intern_table_t *table, PyObject *key_obj, const char *key, + uint64_t *prehash_out) +{ + const char *secret = table->secrets; + int tries = 0; + uint64_t prehash; + int result; + + while (1) { + int result; + prehash = _intern_build_key(key_obj, key, secret); + result = update_table(table, key_obj, prehash); + if (result < 0) { + goto bad; + } else if (result == 0) { + /* Duplicate, keep going */ + secret += 16; + if (++tries == 4) { + PyErr_SetString(PyExc_ValueError, + "Failed to create unique prehash"); + goto bad; + } + } else { + /* We have a unique prehash */ + break; + } + } + + *prehash_out = prehash; + return 0; +bad: + return -1; +} + +static uint64_t +_intern_key(intern_table_t *table, PyObject *key_obj, const char *key) { - PyObject *dict = (PyObject *) table; PyObject *value; + PyObject *tmp = NULL; + uint64_t prehash; - value = PyDict_GetItem(dict, key); + value = PyDict_GetItem(table->signature_to_key, key_obj); if (value == NULL) { /* Key not in dict */ - value = key; - PyDict_SetItem(dict, key, value); + if (make_prehash(table, key_obj, key, &prehash) < 0) + goto bad; + } else { + prehash = PyLong_AsUnsignedLongLong(value); + if (PyErr_Occurred()) + goto bad; } - return _PyIntern_AsString(value); + goto done; + +bad: + prehash = 0; + +done: + Py_XDECREF(tmp); + return prehash; } -static const char * -intern_key(intern_table_t table, const char *key) +/* + + Intern a signature string and return a unique prehash, to be used to + compute the final hash in a perfect hashing vtable. + + Callers should check for errors using PyErr_Occurred() when this function + returns 0. 
+*/ +static uint64_t +intern_key(intern_table_t *table, const char *key) { PyObject *key_obj = _PyIntern_FromString(key); - const char *retval; + uint64_t retval; if (key_obj == NULL) - return NULL; + return 0; - retval = _intern_key(table, key_obj); + retval = _intern_key(table, key_obj, key); Py_DECREF(key_obj); return retval; diff --git a/setup.py b/setup.py index 17a4fb6..9552156 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,14 @@ #'../ulib/src/base/md5sum.c', #'../ulib/src/base/hash.c' ], - include_dirs=include_dirs), + include_dirs=include_dirs, + depends=["include/perfecthash.h"]), Extension("extensibletype.intern", ["extensibletype/intern.pyx"], include_dirs=include_dirs, depends=["include/globalinterning.h", - "include/interning.h"]), + "include/interning.h", + "include/perfecthash.h"]), ] setup(cmdclass={'build_ext': build_ext}, From 3b9dff9b99dec6f23fc66f339670aea7a90a877f Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 12:45:58 +0000 Subject: [PATCH 06/37] Create methodtable module --- extensibletype/extensibletype.pyx | 104 -------------------- extensibletype/methodtable.pyx | 109 +++++++++++++++++++++ extensibletype/test/test_perfecthashing.py | 6 +- setup.py | 8 +- 4 files changed, 119 insertions(+), 108 deletions(-) create mode 100644 extensibletype/methodtable.pyx diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index 3942457..a2f280c 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -1,11 +1,8 @@ -cimport stdlib cimport numpy as cnp import numpy as np import hashlib -from . 
import intern - cdef extern from "perfecthash.h": void _PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes, uint8_t *number_of_bins_by_size) @@ -45,107 +42,6 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): return p, table.base.r, table.base.m_f, table.base.m_g, d -def roundup(x): - "Round up to a power of two" - x -= 1 - x |= x >> 1 - x |= x >> 2 - x |= x >> 4 - x |= x >> 8 - x |= x >> 16 - x |= x >> 32 - x += 1 - return x - -cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: - cdef PyCustomSlots_Table *table - - size = roundup(size) - - table = stdlib.malloc( - sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * size + - sizeof(PyCustomSlots_Entry) * size) - - if table == NULL: - raise MemoryError - - table.n = size - table.b = size - table.flags = 0 - - table.entries = (( &table[1]) + - size * sizeof(uint16_t)) - - return table - -cdef class PerfectHashMethodTable(object): - """ - Simple wrapper for hash-based virtual method tables. - """ - - cdef PyCustomSlots_Table *table - cdef uint16_t *displacements - - def __init__(self, n, ids, flags, funcs): - cdef Py_ssize_t i - cdef cnp.ndarray[uint64_t] hashes - - self.table = allocate_hash_table(n) - self.displacements = ( self.table + - sizeof(PyCustomSlots_Table)) - - hashes = np.empty(n, dtype=np.uint64) - - intern.global_intern_initialize() - - # Initialize hash table entries, build hash ids - for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)): - id = intern.global_intern(signature) - - self.table.entries[i].id = id - self.table.entries[i].flags = flag - self.table.entries[i].ptr = func - - hashes[i] = self.hash(signature) - - # Perfect hash our table - PyCustomSlots_PerfectHash(self.table, &hashes[0]) - - def hash(self, signature): - cdef uint64_t hashvalue - cdef bytes md5 = hashlib.md5(signature).digest() - - (&hashvalue)[0] = ( md5)[0] - return hashvalue - - def find_method(self, signature): - """ - Find method of the given signature. 
Use from non-performance - critical code. - """ - cdef uintptr_t id = intern.global_intern(signature) - cdef uint64_t prehash = self.hash(signature) - - cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ - self.displacements[prehash & self.table.m_g]) - - assert 0 <= idx < self.size - - if self.table.entries[idx].id != id: - return None - - return ( self.table.entries[idx].ptr, - self.table.entries[idx].flags) - - property table_ptr: - def __get__(self): - return self.table - - property size: - def __get__(self): - return self.table.n - - #cdef extern from "md5sum.h": # ctypedef struct MD5_CTX: # uint32_t i[2] diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx new file mode 100644 index 0000000..5e5099d --- /dev/null +++ b/extensibletype/methodtable.pyx @@ -0,0 +1,109 @@ +from libc cimport stdlib +cimport numpy as cnp +import numpy as np + +from extensibletype cimport * + +import intern + +def roundup(x): + "Round up to a power of two" + x -= 1 + x |= x >> 1 + x |= x >> 2 + x |= x >> 4 + x |= x >> 8 + x |= x >> 16 + x |= x >> 32 + x += 1 + return x + +cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: + cdef PyCustomSlots_Table *table + + size = roundup(size) + + table = stdlib.malloc( + sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * size + + sizeof(PyCustomSlots_Entry) * size) + + if table == NULL: + raise MemoryError + + table.n = size + table.b = size + table.flags = 0 + + table.entries = (( &table[1]) + + size * sizeof(uint16_t)) + + return table + +cdef class PerfectHashMethodTable(object): + """ + Simple wrapper for hash-based virtual method tables. 
+ """ + + cdef PyCustomSlots_Table *table + cdef uint16_t *displacements + + def __init__(self, n, ids, flags, funcs): + cdef Py_ssize_t i + cdef cnp.ndarray[uint64_t] hashes + + self.table = allocate_hash_table(n) + self.displacements = ( self.table + + sizeof(PyCustomSlots_Table)) + + hashes = np.empty(n, dtype=np.uint64) + + intern.global_intern_initialize() + + # Initialize hash table entries, build hash ids + for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)): + id = intern.global_intern(signature) + + self.table.entries[i].id = id + self.table.entries[i].flags = flag + self.table.entries[i].ptr = func + + hashes[i] = self.hash(signature) + + # Perfect hash our table + PyCustomSlots_PerfectHash(self.table, &hashes[0]) + + def hash(self, signature): + cdef uint64_t hashvalue + # cdef bytes md5 = hashlib.md5(signature).digest() + # (&hashvalue)[0] = ( md5)[0] + hashvalue = intern.global_intern(signature) + + return hashvalue + + def find_method(self, signature): + """ + Find method of the given signature. Use from non-performance + critical code. + """ + cdef uintptr_t id = intern.global_intern(signature) + cdef uint64_t prehash = self.hash(signature) + + cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ + self.displacements[prehash & self.table.m_g]) + + assert 0 <= idx < self.size + + if self.table.entries[idx].id != id: + return None + + return ( self.table.entries[idx].ptr, + self.table.entries[idx].flags) + + property table_ptr: + def __get__(self): + return self.table + + property size: + def __get__(self): + return self.table.n + diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index bf54aab..5fd61da 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -1,6 +1,6 @@ from nose.tools import eq_, ok_ import numpy as np -from .. import extensibletype +from .. 
import extensibletype, methodtable def draw_hashes(rng, nitems): hashes = rng.randint(2**32, size=nitems).astype(np.uint64) @@ -35,11 +35,11 @@ def test_methodtable(): flags = range(1, len(ids) + 1) funcs = range(len(ids)) - methodtable = extensibletype.PerfectHashMethodTable( + table = methodtable.PerfectHashMethodTable( len(ids), ids, flags, funcs) for (signature, flag, func) in zip(ids, flags, funcs): - result = methodtable.find_method(signature) + result = table.find_method(signature) assert result is not None got_func, got_flag = result assert func == got_func, (func, got_func) diff --git a/setup.py b/setup.py index 9552156..14d51e6 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,8 @@ include_dirs = ['include', '../ulib/src/base', np.get_include()] +perfecthash_deps = ["include/perfecthash.h"] + extensions = [ Extension("extensibletype.extensibletype", [os.path.join("extensibletype", "extensibletype.pyx"), @@ -14,13 +16,17 @@ #'../ulib/src/base/hash.c' ], include_dirs=include_dirs, - depends=["include/perfecthash.h"]), + depends=perfecthash_deps), Extension("extensibletype.intern", ["extensibletype/intern.pyx"], include_dirs=include_dirs, depends=["include/globalinterning.h", "include/interning.h", "include/perfecthash.h"]), + Extension("extensibletype.methodtable", + [os.path.join("extensibletype", "methodtable.pyx")], + include_dirs=include_dirs, + depends=perfecthash_deps), ] setup(cmdclass={'build_ext': build_ext}, From 4edb5f4f0546a86a7e3434af0b52e08eee4ba4c8 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 12:48:24 +0000 Subject: [PATCH 07/37] Deallocate vtable when going out of scope --- extensibletype/methodtable.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 5e5099d..2383986 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -99,6 +99,10 @@ cdef class PerfectHashMethodTable(object): return ( self.table.entries[idx].ptr, 
self.table.entries[idx].flags) + def __dealloc__(self): + stdlib.free(self.table) + self.table = NULL + property table_ptr: def __get__(self): return self.table From fdbe2a500623fbcc316a68a9f8553666818b1160 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 14:47:39 +0000 Subject: [PATCH 08/37] Make extension module sources and names configurable --- setup.py | 30 ++------------------------ setupconfig.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 28 deletions(-) create mode 100644 setupconfig.py diff --git a/setup.py b/setup.py index 14d51e6..39545e3 100644 --- a/setup.py +++ b/setup.py @@ -1,33 +1,7 @@ from distutils.core import setup -from distutils.extension import Extension from Cython.Distutils import build_ext -import os -import numpy as np - -include_dirs = ['include', '../ulib/src/base', np.get_include()] - -perfecthash_deps = ["include/perfecthash.h"] - -extensions = [ - Extension("extensibletype.extensibletype", - [os.path.join("extensibletype", "extensibletype.pyx"), - #'../ulib/src/base/md5sum.c', - #'../ulib/src/base/hash.c' - ], - include_dirs=include_dirs, - depends=perfecthash_deps), - Extension("extensibletype.intern", - ["extensibletype/intern.pyx"], - include_dirs=include_dirs, - depends=["include/globalinterning.h", - "include/interning.h", - "include/perfecthash.h"]), - Extension("extensibletype.methodtable", - [os.path.join("extensibletype", "methodtable.pyx")], - include_dirs=include_dirs, - depends=perfecthash_deps), -] +from setupconfig import get_extensions setup(cmdclass={'build_ext': build_ext}, - ext_modules=extensions) + ext_modules=get_extensions(prefix="")) diff --git a/setupconfig.py b/setupconfig.py new file mode 100644 index 0000000..b110c24 --- /dev/null +++ b/setupconfig.py @@ -0,0 +1,57 @@ +import os +import functools +from distutils.extension import Extension + +import numpy as np + +def prefix_module(prefix, module_name): + if prefix: + return 
".".join(prefix.split("/") + [module_name]) + return module_name + +def prefix_path(prefix, path): + if prefix: + return "%s/%s" % (prefix.rstrip("/"), path.lstrip("/")) + return path + +def make_extension(prefix, modname, sources, depends, **kwds): + _prefix_path = functools.partial(prefix_path, prefix) + + return Extension( + prefix_module(prefix, modname), + sources=map(_prefix_path, sources), + depends=map(_prefix_path, depends), + **kwds + ) + +def get_extensions(prefix): + include_dirs = [prefix_path(prefix, 'include'), + np.get_include()] + + perfecthash_deps = ["include/perfecthash.h"] + + Extension = functools.partial(make_extension, prefix) + + extensions = [ + Extension("extensibletype.extensibletype", + ["extensibletype/extensibletype.pyx", + #'../ulib/src/base/md5sum.c', + #'../ulib/src/base/hash.c' + ], + include_dirs=include_dirs, + depends=perfecthash_deps), + + Extension("extensibletype.intern", + ["extensibletype/intern.pyx"], + include_dirs=include_dirs, + depends=["include/globalinterning.h", + "include/interning.h", + "include/perfecthash.h"]), + + Extension("extensibletype.methodtable", + ["extensibletype/methodtable.pyx"], + include_dirs=include_dirs, + depends=perfecthash_deps), + ] + + return extensions From 94b1ff43732805c4da07af8128299f7698d8303f Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 15:11:31 +0000 Subject: [PATCH 09/37] Allow building from different working directory --- setup.py | 4 +++- setupconfig.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 39545e3..13ee259 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,9 @@ +import os from distutils.core import setup from Cython.Distutils import build_ext from setupconfig import get_extensions +root = os.path.dirname(os.path.abspath(__file__)) setup(cmdclass={'build_ext': build_ext}, - ext_modules=get_extensions(prefix="")) + ext_modules=get_extensions(path_prefix=root)) diff --git a/setupconfig.py 
b/setupconfig.py index b110c24..38de28e 100644 --- a/setupconfig.py +++ b/setupconfig.py @@ -6,31 +6,33 @@ def prefix_module(prefix, module_name): if prefix: - return ".".join(prefix.split("/") + [module_name]) + return "%s.%s" % (prefix, module_name) + return module_name def prefix_path(prefix, path): if prefix: return "%s/%s" % (prefix.rstrip("/"), path.lstrip("/")) + return path -def make_extension(prefix, modname, sources, depends, **kwds): - _prefix_path = functools.partial(prefix_path, prefix) +def make_extension(path_prefix, module_prefix, modname, sources, depends, **kwds): + _prefix_path = functools.partial(prefix_path, path_prefix) return Extension( - prefix_module(prefix, modname), + prefix_module(module_prefix, modname), sources=map(_prefix_path, sources), depends=map(_prefix_path, depends), **kwds ) -def get_extensions(prefix): - include_dirs = [prefix_path(prefix, 'include'), +def get_extensions(path_prefix, module_prefix=""): + include_dirs = [prefix_path(path_prefix, 'include'), np.get_include()] perfecthash_deps = ["include/perfecthash.h"] - Extension = functools.partial(make_extension, prefix) + Extension = functools.partial(make_extension, path_prefix, module_prefix) extensions = [ Extension("extensibletype.extensibletype", From e6524a072cf622fcddf5b2ba4fb7ced59c092192 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 15:33:27 +0000 Subject: [PATCH 10/37] Add packages to setup.py --- setup.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 13ee259..104dc79 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,28 @@ import os +from fnmatch import fnmatchcase +from distutils.util import convert_path from distutils.core import setup from Cython.Distutils import build_ext from setupconfig import get_extensions +def find_packages(where='.', exclude=()): + out = [] + stack=[(convert_path(where), '')] + while stack: + where, prefix = stack.pop(0) + for name in 
os.listdir(where): + fn = os.path.join(where,name) + if ('.' not in name and os.path.isdir(fn) and + os.path.isfile(os.path.join(fn, '__init__.py')) + ): + out.append(prefix+name) + stack.append((fn, prefix+name+'.')) + for pat in list(exclude) + ['ez_setup', 'distribute_setup']: + out = [item for item in out if not fnmatchcase(item, pat)] + return out + root = os.path.dirname(os.path.abspath(__file__)) setup(cmdclass={'build_ext': build_ext}, - ext_modules=get_extensions(path_prefix=root)) + ext_modules=get_extensions(path_prefix=root), + packages=find_packages()) From d48b7f41b1127f94c0399e6d8f54a85e7278fff2 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 11 Mar 2013 16:44:00 +0000 Subject: [PATCH 11/37] Fix typo in capsule_getpointer --- include/globalinterning.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/globalinterning.h b/include/globalinterning.h index c1d499e..438f79f 100644 --- a/include/globalinterning.h +++ b/include/globalinterning.h @@ -53,7 +53,7 @@ capsule_getpointer(PyObject *capsule, const char *sig) #if PY_VERSION_HEX >= 0x02070000 && !(PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION == 0) cobj = PyCapsule_GetPointer(capsule, sig); #else - cobj = PyCObject_AsVoidPtr(p); + cobj = PyCObject_AsVoidPtr(capsule); #endif return cobj; From 29f7b0f79a3b8266124293733f9ded1a5f2c9d68 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 14 Mar 2013 14:53:41 +0000 Subject: [PATCH 12/37] Use separate hasher and add table generation method to method table --- extensibletype/methodtable.pyx | 33 ++++++++++++++-------- extensibletype/test/test_perfecthashing.py | 4 +-- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 2383986..5cebc4d 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -39,6 +39,21 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: return table + +cdef class 
Hasher(object): + """ + Generate a globally unique hashes for signature strings. + """ + + def hash_signature(self, signature): + cdef uint64_t hashvalue + # cdef bytes md5 = hashlib.md5(signature).digest() + # (&hashvalue)[0] = ( md5)[0] + hashvalue = intern.global_intern(signature) + + return hashvalue + + cdef class PerfectHashMethodTable(object): """ Simple wrapper for hash-based virtual method tables. @@ -46,8 +61,12 @@ cdef class PerfectHashMethodTable(object): cdef PyCustomSlots_Table *table cdef uint16_t *displacements + cdef Hasher hasher + + def __init__(self, hasher): + self.hasher = hasher - def __init__(self, n, ids, flags, funcs): + def generate_table(self, n, ids, flags, funcs): cdef Py_ssize_t i cdef cnp.ndarray[uint64_t] hashes @@ -67,26 +86,18 @@ cdef class PerfectHashMethodTable(object): self.table.entries[i].flags = flag self.table.entries[i].ptr = func - hashes[i] = self.hash(signature) + hashes[i] = self.hasher.hash_signature(signature) # Perfect hash our table PyCustomSlots_PerfectHash(self.table, &hashes[0]) - def hash(self, signature): - cdef uint64_t hashvalue - # cdef bytes md5 = hashlib.md5(signature).digest() - # (&hashvalue)[0] = ( md5)[0] - hashvalue = intern.global_intern(signature) - - return hashvalue - def find_method(self, signature): """ Find method of the given signature. Use from non-performance critical code. 
""" cdef uintptr_t id = intern.global_intern(signature) - cdef uint64_t prehash = self.hash(signature) + cdef uint64_t prehash = self.hasher.hash_signature(signature) cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ self.displacements[prehash & self.table.m_g]) diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index 5fd61da..de49433 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -35,8 +35,8 @@ def test_methodtable(): flags = range(1, len(ids) + 1) funcs = range(len(ids)) - table = methodtable.PerfectHashMethodTable( - len(ids), ids, flags, funcs) + table = methodtable.PerfectHashMethodTable(methodtable.Hasher()) + table.generate_table(len(ids), ids, flags, funcs) for (signature, flag, func) in zip(ids, flags, funcs): result = table.find_method(signature) From 5f0ffba8039a405a0f10e52b0140e43c8dbb9cf2 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 14 Mar 2013 22:44:09 +0000 Subject: [PATCH 13/37] Use prehash as id --- extensibletype/extensibletype.pxd | 3 +- extensibletype/extensibletype.pyx | 11 ++++++-- extensibletype/intern.pyx | 2 +- extensibletype/methodtable.pyx | 32 ++++++++++++---------- extensibletype/test/test_perfecthashing.py | 10 ++----- include/perfecthash.h | 3 +- 6 files changed, 32 insertions(+), 29 deletions(-) diff --git a/extensibletype/extensibletype.pxd b/extensibletype/extensibletype.pxd index 6a4a4f4..f353a11 100644 --- a/extensibletype/extensibletype.pxd +++ b/extensibletype/extensibletype.pxd @@ -7,8 +7,7 @@ cdef extern from "stdint.h": cdef extern from "perfecthash.h": ctypedef struct PyCustomSlots_Entry: - char *id - uintptr_t flags + uint64_t id void *ptr ctypedef struct PyCustomSlots_Table: diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index a2f280c..653a9cb 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -13,6 +13,12 @@ def 
bucket_argsort(cnp.ndarray[uint16_t, mode='c'] p, _PyCustomSlots_bucket_argsort(&p[0], &binsizes[0], &number_of_bins_by_size[0]) +def draw_hashes(rng, nitems): + hashes = rng.randint(2**32, size=nitems).astype(np.uint64) + hashes <<= 32 + hashes |= rng.randint(2**32, size=nitems).astype(np.uint64) + return hashes + def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): """Used for testing. Takes the hashes as input, and returns a permutation array and hash parameters: @@ -25,8 +31,7 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): table.base.b = 64 table.base.entries = &table.entries_mem[0] for i in range(64): - table.entries_mem[i].id = NULL - table.entries_mem[i].flags = i + table.entries_mem[i].id = hashes[i] table.entries_mem[i].ptr = NULL cdef int r @@ -37,7 +42,7 @@ def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): p = np.zeros(64, dtype=np.uint16) for i in range(64): - p[i] = table.entries_mem[i].flags + p[i] = table.entries_mem[i].id & 0xFF d[i] = table.d[i] return p, table.base.r, table.base.m_f, table.base.m_g, d diff --git a/extensibletype/intern.pyx b/extensibletype/intern.pyx index fad7d0a..76aeeff 100644 --- a/extensibletype/intern.pyx +++ b/extensibletype/intern.pyx @@ -17,4 +17,4 @@ cdef class InternTable(object): intern_destroy_table(self.table) def intern(self, bytes key): - return intern_key(self.table, key) + return intern_key(self.table, key) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 5cebc4d..64c36fd 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -3,6 +3,7 @@ cimport numpy as cnp import numpy as np from extensibletype cimport * +from . 
import extensibletype import intern @@ -23,9 +24,9 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: size = roundup(size) - table = stdlib.malloc( - sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * size + - sizeof(PyCustomSlots_Entry) * size) + table = stdlib.calloc( + 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * size + + sizeof(PyCustomSlots_Entry) * size) if table == NULL: raise MemoryError @@ -74,45 +75,48 @@ cdef class PerfectHashMethodTable(object): self.displacements = ( self.table + sizeof(PyCustomSlots_Table)) - hashes = np.empty(n, dtype=np.uint64) + hashes = np.empty(self.table.n, dtype=np.uint64) intern.global_intern_initialize() # Initialize hash table entries, build hash ids for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)): - id = intern.global_intern(signature) - - self.table.entries[i].id = id - self.table.entries[i].flags = flag + self.table.entries[i].id = self.hasher.hash_signature(signature) self.table.entries[i].ptr = func hashes[i] = self.hasher.hash_signature(signature) + hashes[n:self.table.n] = extensibletype.draw_hashes(np.random, + self.table.n - n) + # Perfect hash our table PyCustomSlots_PerfectHash(self.table, &hashes[0]) + for signature in ids: + assert self.find_method(signature) + def find_method(self, signature): """ Find method of the given signature. Use from non-performance critical code. 
""" - cdef uintptr_t id = intern.global_intern(signature) - cdef uint64_t prehash = self.hasher.hash_signature(signature) + cdef uint64_t prehash = intern.global_intern(signature) cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ self.displacements[prehash & self.table.m_g]) assert 0 <= idx < self.size - if self.table.entries[idx].id != id: + if self.table.entries[idx].id != prehash: return None return ( self.table.entries[idx].ptr, - self.table.entries[idx].flags) + self.table.entries[idx].id & 0xFF) def __dealloc__(self): - stdlib.free(self.table) - self.table = NULL + # stdlib.free(self.table) + # self.table = NULL + pass property table_ptr: def __get__(self): diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index de49433..cb720e7 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -2,11 +2,6 @@ import numpy as np from .. import extensibletype, methodtable -def draw_hashes(rng, nitems): - hashes = rng.randint(2**32, size=nitems).astype(np.uint64) - hashes <<= 32 - hashes |= rng.randint(2**32, size=nitems).astype(np.uint64) - return hashes def test_binsort(): nbins = 64 @@ -20,7 +15,7 @@ def test_binsort(): def test_basic(): n=64 - prehashes = draw_hashes(np.random, n) + prehashes = extensibletype.draw_hashes(np.random, n) assert len(prehashes) == len(set(prehashes)) p, r, m_f, m_g, d = extensibletype.perfect_hash(prehashes, repeat=10**5) hashes = ((prehashes >> r) & m_f) ^ d[prehashes & m_g] @@ -43,4 +38,5 @@ def test_methodtable(): assert result is not None got_func, got_flag = result assert func == got_func, (func, got_func) - assert flag == got_flag, (flag, got_flag) + # assert flag == got_flag, (flag, got_flag) + diff --git a/include/perfecthash.h b/include/perfecthash.h index aa4e042..1212dc1 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -2,8 +2,7 @@ #include typedef struct { - char *id; - uintptr_t flags; + uint64_t 
id; void *ptr; } PyCustomSlots_Entry; From 836be99771306a3e91c7fc83a58fe3c44e554850 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 21 Mar 2013 11:52:31 +0000 Subject: [PATCH 14/37] Make sure the extensions builds on python 3 --- setupconfig.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setupconfig.py b/setupconfig.py index 38de28e..11274af 100644 --- a/setupconfig.py +++ b/setupconfig.py @@ -21,8 +21,8 @@ def make_extension(path_prefix, module_prefix, modname, sources, depends, **kwds return Extension( prefix_module(module_prefix, modname), - sources=map(_prefix_path, sources), - depends=map(_prefix_path, depends), + sources=list(map(_prefix_path, sources)), + depends=list(map(_prefix_path, depends)), **kwds ) From 7bc2663d20f0c8f39fbdbc1f85bebde682e67c2a Mon Sep 17 00:00:00 2001 From: Jon Riehl Date: Tue, 26 Mar 2013 17:26:59 -0500 Subject: [PATCH 15/37] More Python 3 fixes. --- extensibletype/test/test_interning.py | 2 +- extensibletype/test/test_perfecthashing.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py index d754de1..9395778 100644 --- a/extensibletype/test/test_interning.py +++ b/extensibletype/test/test_interning.py @@ -3,7 +3,7 @@ def test_global_interning(): try: intern.global_intern("hello") - except AssertionError, e: + except AssertionError as e: pass else: raise Exception("Expects complaint about uninitialized table") diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index cb720e7..edf145f 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -19,10 +19,10 @@ def test_basic(): assert len(prehashes) == len(set(prehashes)) p, r, m_f, m_g, d = extensibletype.perfect_hash(prehashes, repeat=10**5) hashes = ((prehashes >> r) & m_f) ^ d[prehashes & m_g] - print p - print d + print(p) + print(d) hashes.sort() - 
print hashes + print(hashes) assert len(hashes) == len(np.unique(hashes)) def test_methodtable(): From 9e83a792345ce53c99a627e6a20ba22bae23218e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 3 Apr 2013 17:25:39 +0100 Subject: [PATCH 16/37] Add string method to perfect hashing vtable --- extensibletype/methodtable.pyx | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 64c36fd..3f207d4 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -64,10 +64,14 @@ cdef class PerfectHashMethodTable(object): cdef uint16_t *displacements cdef Hasher hasher + cdef object id_to_signature, signatures + def __init__(self, hasher): self.hasher = hasher + # For debugging + self.id_to_signature = {} - def generate_table(self, n, ids, flags, funcs): + def generate_table(self, n, ids, flags, funcs, method_names=None): cdef Py_ssize_t i cdef cnp.ndarray[uint64_t] hashes @@ -81,10 +85,13 @@ cdef class PerfectHashMethodTable(object): # Initialize hash table entries, build hash ids for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)): - self.table.entries[i].id = self.hasher.hash_signature(signature) + id = self.hasher.hash_signature(signature) + + self.table.entries[i].id = id self.table.entries[i].ptr = func - hashes[i] = self.hasher.hash_signature(signature) + hashes[i] = id + self.id_to_signature[id] = signature hashes[n:self.table.n] = extensibletype.draw_hashes(np.random, self.table.n - n) @@ -95,6 +102,9 @@ cdef class PerfectHashMethodTable(object): for signature in ids: assert self.find_method(signature) + # For debugging + self.signatures = ids + def find_method(self, signature): """ Find method of the given signature. 
Use from non-performance @@ -113,6 +123,19 @@ cdef class PerfectHashMethodTable(object): return ( self.table.entries[idx].ptr, self.table.entries[idx].id & 0xFF) + def __str__(self): + buf = ["PerfectHashMethodTable("] + for i in range(self.table.n): + id = self.table.entries[i].id + ptr = self.table.entries[i].ptr + sig = self.id_to_signature.get(id, "") + s = " id: %20d funcptr: %20d signature: %s" % (id, ptr, sig) + buf.append(s) + + buf.append(")") + + return "\n".join(buf) + def __dealloc__(self): # stdlib.free(self.table) # self.table = NULL From 31ca4a04410975f00a6ef1922f931749caa00f3f Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 3 Apr 2013 18:42:28 +0100 Subject: [PATCH 17/37] Some py3 compatability --- extensibletype/methodtable.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 3f207d4..24b9377 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -50,6 +50,9 @@ cdef class Hasher(object): cdef uint64_t hashvalue # cdef bytes md5 = hashlib.md5(signature).digest() # (&hashvalue)[0] = ( md5)[0] + if isinstance(signature, str): + # Python 3 + signature = signature.encode("ascii") hashvalue = intern.global_intern(signature) return hashvalue From c7078fd5fb2cd86dca69415125d21c4870660aed Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 3 Apr 2013 20:18:05 +0100 Subject: [PATCH 18/37] Some more str -> bytes conversion for py3 --- extensibletype/methodtable.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 24b9377..0562628 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -40,6 +40,12 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: return table +def make_bytes(s): + if isinstance(s, str): + # Python 3 + s = s.encode("ascii") + + return s cdef class Hasher(object): """ @@ 
-50,11 +56,8 @@ cdef class Hasher(object): cdef uint64_t hashvalue # cdef bytes md5 = hashlib.md5(signature).digest() # (&hashvalue)[0] = ( md5)[0] - if isinstance(signature, str): - # Python 3 - signature = signature.encode("ascii") - hashvalue = intern.global_intern(signature) + hashvalue = intern.global_intern(make_bytes(signature)) return hashvalue @@ -113,7 +116,7 @@ cdef class PerfectHashMethodTable(object): Find method of the given signature. Use from non-performance critical code. """ - cdef uint64_t prehash = intern.global_intern(signature) + cdef uint64_t prehash = intern.global_intern(make_bytes(signature)) cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ self.displacements[prehash & self.table.m_g]) From e4d9c4f2ec88585a899d02325cccf571c431b806 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 8 Apr 2013 16:20:21 +0100 Subject: [PATCH 19/37] Add some error checking to see whether we succeeded building hash table --- extensibletype/methodtable.pyx | 23 ++++++++++++++++------- include/perfecthash.h | 24 +++++++++++++++++++++--- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 0562628..62c7732 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -19,6 +19,11 @@ def roundup(x): x += 1 return x +class HashingError(Exception): + """ + Raised when we can't create a perfect hash-based function table. 
+ """ + cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: cdef PyCustomSlots_Table *table @@ -70,7 +75,7 @@ cdef class PerfectHashMethodTable(object): cdef uint16_t *displacements cdef Hasher hasher - cdef object id_to_signature, signatures + cdef object id_to_signature, signatures, fs, gs def __init__(self, hasher): self.hasher = hasher @@ -90,6 +95,8 @@ cdef class PerfectHashMethodTable(object): intern.global_intern_initialize() # Initialize hash table entries, build hash ids + assert len(ids) == len(flags) == len(funcs) + for i, (signature, flag, func) in enumerate(zip(ids, flags, funcs)): id = self.hasher.hash_signature(signature) @@ -103,10 +110,12 @@ cdef class PerfectHashMethodTable(object): self.table.n - n) # Perfect hash our table - PyCustomSlots_PerfectHash(self.table, &hashes[0]) + if PyCustomSlots_PerfectHash(self.table, &hashes[0]) < 0: + # TODO: sensible error messages + raise HashingError("Unable to create perfect hash table") - for signature in ids: - assert self.find_method(signature) + for i, signature in enumerate(ids): + assert self.find_method(signature), (i, signature) # For debugging self.signatures = ids @@ -118,8 +127,8 @@ cdef class PerfectHashMethodTable(object): """ cdef uint64_t prehash = intern.global_intern(make_bytes(signature)) - cdef int idx = (((prehash >> self.table.r) & self.table.m_f) ^ - self.displacements[prehash & self.table.m_g]) + cdef uint64_t idx = (((prehash >> self.table.r) & self.table.m_f) ^ + self.displacements[prehash & self.table.m_g]) assert 0 <= idx < self.size @@ -135,7 +144,7 @@ cdef class PerfectHashMethodTable(object): id = self.table.entries[i].id ptr = self.table.entries[i].ptr sig = self.id_to_signature.get(id, "") - s = " id: %20d funcptr: %20d signature: %s" % (id, ptr, sig) + s = " id: 0x%-16x funcptr: %20d signature: %s" % (id, ptr, sig) buf.append(s) buf.append(")") diff --git a/include/perfecthash.h b/include/perfecthash.h index 1212dc1..83e6c68 100644 --- 
a/include/perfecthash.h +++ b/include/perfecthash.h @@ -133,6 +133,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, } int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { + int result; uint16_t bin, j; uint8_t binsize; uint16_t i, n = table->n, b = table->b; @@ -145,10 +146,13 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { uint8_t number_of_bins_by_size[BIN_LIMIT]; PyCustomSlots_Entry *entries_copy = malloc(sizeof(PyCustomSlots_Entry) * n); + if (!bins || !binsizes || !p || !taken || !entries_copy) + goto error; + for (i = 0; i != n; ++i) { entries_copy[i] = table->entries[i]; } - + /* Bin the n hashes into b bins based on the g hash. Also count the number of bins of each size. */ for (bin = 0; bin != b; ++bin) { @@ -163,7 +167,7 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { binsize = ++binsizes[bin]; if (binsize == BIN_LIMIT) { printf("ERROR 1\n"); - return -1; + goto error; } bins[BIN_LIMIT * bin + binsize - 1] = i; number_of_bins_by_size[binsize - 1]--; @@ -187,6 +191,20 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { } } + if (retcode != 0) { + printf("no suitable table found\n"); + goto error; + } + + result = 0; + goto cleanup; + +error: + + result = -1; + +cleanup: + /*TODO does not free on error... 
*/ free(bins); free(binsizes); @@ -194,5 +212,5 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { free(taken); free(entries_copy); - return 0; + return result; } From 8d0c82421e9158a9724cd92814614ad502b40ee6 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 8 Apr 2013 18:32:42 +0100 Subject: [PATCH 20/37] Add utility to print secret table keys --- extensibletype/methodtable.pyx | 10 ++++++---- include/interning.h | 33 +++++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 62c7732..6d24aca 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -26,22 +26,23 @@ class HashingError(Exception): cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: cdef PyCustomSlots_Table *table + cdef int nbins = size * 2 size = roundup(size) table = stdlib.calloc( - 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * size + + 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * nbins + sizeof(PyCustomSlots_Entry) * size) if table == NULL: raise MemoryError table.n = size - table.b = size + table.b = nbins table.flags = 0 table.entries = (( &table[1]) + - size * sizeof(uint16_t)) + table.b * sizeof(uint16_t)) return table @@ -90,7 +91,7 @@ cdef class PerfectHashMethodTable(object): self.displacements = ( self.table + sizeof(PyCustomSlots_Table)) - hashes = np.empty(self.table.n, dtype=np.uint64) + hashes = np.zeros(self.table.n, dtype=np.uint64) intern.global_intern_initialize() @@ -108,6 +109,7 @@ cdef class PerfectHashMethodTable(object): hashes[n:self.table.n] = extensibletype.draw_hashes(np.random, self.table.n - n) + assert len(np.unique(hashes)) == len(hashes) # Perfect hash our table if PyCustomSlots_PerfectHash(self.table, &hashes[0]) < 0: diff --git a/include/interning.h b/include/interning.h index 90b254f..f18e9b0 100644 --- a/include/interning.h +++ b/include/interning.h @@ -9,6 
+9,7 @@ extern "C" { #include #include +#include #include "siphash24.h" #if PY_MAJOR_VERSION < 3 @@ -28,7 +29,7 @@ extern "C" { typedef struct _intern_table_t { PyObject *signature_to_key; PyObject *key_to_signature; - char secrets[16*4]; /* 4 secret keys, which we try in succession */ + char secrets[16*4]; /* 4 secret keys, which we try in succession */ } intern_table_t; /* Prototypes */ @@ -36,12 +37,26 @@ static void intern_destroy_table(intern_table_t *table); /* API */ +static void +_print_secrets(intern_table_t *table) +{ + int i, j; + + for (i = 0; i < 4; i++) { + printf("secret key[%d] = {", i); + for (j = 0; j < 16; j += 4) { + printf(" %-8x, ", *(int32_t *) &table->secrets[i * 16 + j]); + } + printf("}\n"); + } +} + /* Create an intern table from preallocated memory. Returns NULL on failure with an appropriate exception set. */ static intern_table_t * intern_create_table(intern_table_t *table) { - int i, randval; + int i; table->signature_to_key = NULL; table->key_to_signature = NULL; @@ -52,15 +67,17 @@ intern_create_table(intern_table_t *table) if (!table->signature_to_key || !table->key_to_signature) goto bad; - for (i = 0; i < 16 * 4; i+=2) { - randval = rand(); /* TODO: use a better prng */ - + for (i = 0; i < 16 * 4; i += 2) { /* Take the lower two bytes from the random value, since - RAND_MAX is at least 2**16 */ - table->secrets[i + 0] = ((char *) &randval)[sizeof(int) - 2]; - table->secrets[i + 1] = ((char *) &randval)[sizeof(int) - 1]; + RAND_MAX is at least 2**16 */ + short randval = (short) rand(); /* TODO: use a better prng */ + + table->secrets[i + 0] = ((char *) &randval)[0]; + table->secrets[i + 1] = ((char *) &randval)[1]; } + _print_secrets(table); + return table; bad: intern_destroy_table(table); From 5851fd1ec33a63d971a4064ed28d198c3c31a884 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 9 Apr 2013 10:15:10 +0100 Subject: [PATCH 21/37] Add more thorough test to build hash-based function table --- extensibletype/methodtable.pyx 
| 5 +++-- extensibletype/test/test_perfecthashing.py | 18 ++++++++++++++++-- include/interning.h | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 6d24aca..c1265fd 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -26,9 +26,10 @@ class HashingError(Exception): cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: cdef PyCustomSlots_Table *table - cdef int nbins = size * 2 + cdef int nbins size = roundup(size) + nbins = size #* 2 table = stdlib.calloc( 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * nbins + @@ -76,7 +77,7 @@ cdef class PerfectHashMethodTable(object): cdef uint16_t *displacements cdef Hasher hasher - cdef object id_to_signature, signatures, fs, gs + cdef object id_to_signature, signatures def __init__(self, hasher): self.hasher = hasher diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index edf145f..c8d65dc 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -1,3 +1,5 @@ +import itertools + from nose.tools import eq_, ok_ import numpy as np from .. 
import extensibletype, methodtable @@ -25,8 +27,20 @@ def test_basic(): print(hashes) assert len(hashes) == len(np.unique(hashes)) +# --- +# Test methodtable + +def make_signature(type_permutation): + return "".join(type_permutation[:-1]) + '->' + type_permutation[-1] + +def make_ids(): + types = ['f', 'd', 'i', 'l', 'O'] + power = 6 + return map(make_signature, itertools.product(*(types,) * power)) + def test_methodtable(): - ids = ["ff->f", "dd->d", "ii->i", "ll->l", "OO->O"] + # ids = ["ff->f", "dd->d", "ii->i", "ll->l", "OO->O"] + ids = make_ids()[:31] flags = range(1, len(ids) + 1) funcs = range(len(ids)) @@ -36,7 +50,7 @@ def test_methodtable(): for (signature, flag, func) in zip(ids, flags, funcs): result = table.find_method(signature) assert result is not None + got_func, got_flag = result assert func == got_func, (func, got_func) # assert flag == got_flag, (flag, got_flag) - diff --git a/include/interning.h b/include/interning.h index f18e9b0..591bb6b 100644 --- a/include/interning.h +++ b/include/interning.h @@ -76,7 +76,7 @@ intern_create_table(intern_table_t *table) table->secrets[i + 1] = ((char *) &randval)[1]; } - _print_secrets(table); + /* _print_secrets(table); */ return table; bad: From 4cb4abe7a65e38503a4a1a5cedad02ed1a574ff8 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 9 Apr 2013 10:58:40 +0100 Subject: [PATCH 22/37] Add and use pstdint.h --- extensibletype/extensibletype.pxd | 2 +- extensibletype/methodtable.pyx | 2 +- include/customslots.h | 17 +- include/interning.h | 2 +- include/perfecthash.h | 2 +- include/pstdint.h | 800 ++++++++++++++++++++++++++++++ include/siphash24.c | 4 +- 7 files changed, 820 insertions(+), 9 deletions(-) create mode 100644 include/pstdint.h diff --git a/extensibletype/extensibletype.pxd b/extensibletype/extensibletype.pxd index f353a11..6599e76 100644 --- a/extensibletype/extensibletype.pxd +++ b/extensibletype/extensibletype.pxd @@ -1,4 +1,4 @@ -cdef extern from "stdint.h": +cdef extern from 
"pstdint.h": ctypedef unsigned int uint32_t ctypedef unsigned long long uint64_t ctypedef unsigned short uint16_t diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index c1265fd..f16e2cf 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -29,7 +29,7 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: cdef int nbins size = roundup(size) - nbins = size #* 2 + nbins = size table = stdlib.calloc( 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * nbins + diff --git a/include/customslots.h b/include/customslots.h index b86a4da..2ec3d33 100644 --- a/include/customslots.h +++ b/include/customslots.h @@ -6,11 +6,20 @@ extern "C" { #include #include -#include -/* Some stdint.h implementations: -Portable: http://www.azillionmonkeys.com/qed/pstdint.h -MSVC: http://msinttypes.googlecode.com/svn/trunk/stdint.h + +/* +Make this work by default on all platforms using pstdint: + + Portable: http://www.azillionmonkeys.com/qed/pstdint.h + +There is also: + + MSVC: http://msinttypes.googlecode.com/svn/trunk/stdint.h + +We could conditionally include it, but we would need to use something +like autoconf... 
*/ +#include #if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 95)) #define PY_CUSTOMSLOTS_LIKELY(x) __builtin_expect(!!(x), 1) diff --git a/include/interning.h b/include/interning.h index 591bb6b..3557ed4 100644 --- a/include/interning.h +++ b/include/interning.h @@ -9,7 +9,7 @@ extern "C" { #include #include -#include +#include "pstdint.h" #include "siphash24.h" #if PY_MAJOR_VERSION < 3 diff --git a/include/perfecthash.h b/include/perfecthash.h index 83e6c68..b770cc8 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -1,5 +1,5 @@ -#include #include +#include "pstdint.h" typedef struct { uint64_t id; diff --git a/include/pstdint.h b/include/pstdint.h new file mode 100644 index 0000000..2e4dbff --- /dev/null +++ b/include/pstdint.h @@ -0,0 +1,800 @@ +/* A portable stdint.h + **************************************************************************** + * BSD License: + **************************************************************************** + * + * Copyright (c) 2005-2011 Paul Hsieh + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************** + * + * Version 0.1.12 + * + * The ANSI C standard committee, for the C99 standard, specified the + * inclusion of a new standard include file called stdint.h. This is + * a very useful and long desired include file which contains several + * very precise definitions for integer scalar types that is + * critically important for making portable several classes of + * applications including cryptography, hashing, variable length + * integer libraries and so on. But for most developers its likely + * useful just for programming sanity. + * + * The problem is that most compiler vendors have decided not to + * implement the C99 standard, and the next C++ language standard + * (which has a lot more mindshare these days) will be a long time in + * coming and its unknown whether or not it will include stdint.h or + * how much adoption it will have. Either way, it will be a long time + * before all compilers come with a stdint.h and it also does nothing + * for the extremely large number of compilers available today which + * do not include this file, or anything comparable to it. + * + * So that's what this file is all about. Its an attempt to build a + * single universal include file that works on as many platforms as + * possible to deliver what stdint.h is supposed to. 
A few things + * that should be noted about this file: + * + * 1) It is not guaranteed to be portable and/or present an identical + * interface on all platforms. The extreme variability of the + * ANSI C standard makes this an impossibility right from the + * very get go. It's really only meant to be useful for the vast + * majority of platforms that possess the capability of + * implementing usefully and precisely defined, standard sized + * integer scalars. Systems which are not intrinsically 2s + * complement may produce invalid constants. + * + * 2) There is an unavoidable use of non-reserved symbols. + * + * 3) Other standard include files are invoked. + * + * 4) This file may come in conflict with future platforms that do + * include stdint.h. The hope is that one or the other can be + * used with no real difference. + * + * 5) In the current version, if your platform can't represent + * int32_t, int16_t and int8_t, it just dumps out with a compiler + * error. + * + * 6) 64 bit integers may or may not be defined. Test for their + * presence with the test: #ifdef INT64_MAX or #ifdef UINT64_MAX. + * Note that this is different from the C99 specification which + * requires the existence of 64 bit support in the compiler. If + * this is not defined for your platform, yet it is capable of + * dealing with 64 bits then it is because this file has not yet + * been extended to cover all of your system's capabilities. + * + * 7) (u)intptr_t may or may not be defined. Test for its presence + * with the test: #ifdef PTRDIFF_MAX. If this is not defined + * for your platform, then it is because this file has not yet + * been extended to cover all of your system's capabilities, not + * because it's optional. 
+ * + * 8) The following might not be defined even if your platform is + * capable of defining it: + * + * WCHAR_MIN + * WCHAR_MAX + * (u)int64_t + * PTRDIFF_MIN + * PTRDIFF_MAX + * (u)intptr_t + * + * 9) The following have not been defined: + * + * WINT_MIN + * WINT_MAX + * + * 10) The criteria for defining (u)int_least(*)_t isn't clear, + * except for systems which don't have a type that precisely + * defined 8, 16, or 32 bit types (which this include file does + * not support anyways). Default definitions have been given. + * + * 11) The criteria for defining (u)int_fast(*)_t isn't something I + * would trust to any particular compiler vendor or the ANSI C + * committee. It is well known that "compatible systems" are + * commonly created that have very different performance + * characteristics from the systems they are compatible with, + * especially those whose vendors make both the compiler and the + * system. Default definitions have been given, but it's strongly + * recommended that users never use these definitions for any + * reason (they do *NOT* deliver any serious guarantee of + * improved performance -- not in this file, nor any vendor's + * stdint.h). + * + * 12) The following macros: + * + * PRINTF_INTMAX_MODIFIER + * PRINTF_INT64_MODIFIER + * PRINTF_INT32_MODIFIER + * PRINTF_INT16_MODIFIER + * PRINTF_LEAST64_MODIFIER + * PRINTF_LEAST32_MODIFIER + * PRINTF_LEAST16_MODIFIER + * PRINTF_INTPTR_MODIFIER + * + * are strings which have been defined as the modifiers required + * for the "d", "u" and "x" printf formats to correctly output + * (u)intmax_t, (u)int64_t, (u)int32_t, (u)int16_t, (u)least64_t, + * (u)least32_t, (u)least16_t and (u)intptr_t types respectively. + * PRINTF_INTPTR_MODIFIER is not defined for some systems which + * provide their own stdint.h. PRINTF_INT64_MODIFIER is not + * defined if INT64_MAX is not defined. These are an extension + * beyond what C99 specifies must be in stdint.h. 
+ * + * In addition, the following macros are defined: + * + * PRINTF_INTMAX_HEX_WIDTH + * PRINTF_INT64_HEX_WIDTH + * PRINTF_INT32_HEX_WIDTH + * PRINTF_INT16_HEX_WIDTH + * PRINTF_INT8_HEX_WIDTH + * PRINTF_INTMAX_DEC_WIDTH + * PRINTF_INT64_DEC_WIDTH + * PRINTF_INT32_DEC_WIDTH + * PRINTF_INT16_DEC_WIDTH + * PRINTF_INT8_DEC_WIDTH + * + * Which specifies the maximum number of characters required to + * print the number of that type in either hexadecimal or decimal. + * These are an extension beyond what C99 specifies must be in + * stdint.h. + * + * Compilers tested (all with 0 warnings at their highest respective + * settings): Borland Turbo C 2.0, WATCOM C/C++ 11.0 (16 bits and 32 + * bits), Microsoft Visual C++ 6.0 (32 bit), Microsoft Visual Studio + * .net (VC7), Intel C++ 4.0, GNU gcc v3.3.3 + * + * This file should be considered a work in progress. Suggestions for + * improvements, especially those which increase coverage are strongly + * encouraged. + * + * Acknowledgements + * + * The following people have made significant contributions to the + * development and testing of this file: + * + * Chris Howie + * John Steele Scott + * Dave Thorup + * John Dill + * + */ + +#include +#include +#include + +/* + * For gcc with _STDINT_H, fill in the PRINTF_INT*_MODIFIER macros, and + * do nothing else. On the Mac OS X version of gcc this is _STDINT_H_. 
+ */ + +#if ((defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined (__WATCOMC__) && (defined (_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_) || defined (__UINT_FAST64_TYPE__)) )) && !defined (_PSTDINT_H_INCLUDED) +#include +#define _PSTDINT_H_INCLUDED +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER +# endif +# ifndef PRINTF_INT64_HEX_WIDTH +# define PRINTF_INT64_HEX_WIDTH "16" +# endif +# ifndef PRINTF_INT32_HEX_WIDTH +# define PRINTF_INT32_HEX_WIDTH "8" +# endif +# ifndef PRINTF_INT16_HEX_WIDTH +# define PRINTF_INT16_HEX_WIDTH "4" +# endif +# ifndef PRINTF_INT8_HEX_WIDTH +# define PRINTF_INT8_HEX_WIDTH "2" +# endif +# ifndef PRINTF_INT64_DEC_WIDTH +# define PRINTF_INT64_DEC_WIDTH "20" +# endif +# ifndef PRINTF_INT32_DEC_WIDTH +# define PRINTF_INT32_DEC_WIDTH "10" +# endif +# ifndef PRINTF_INT16_DEC_WIDTH +# define PRINTF_INT16_DEC_WIDTH "5" +# endif +# ifndef PRINTF_INT8_DEC_WIDTH +# define PRINTF_INT8_DEC_WIDTH "3" +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH +# endif + +/* + * Something really weird is going on with Open Watcom. Just pull some of + * these duplicated definitions from Open Watcom's stdint.h file for now. 
+ */ + +# if defined (__WATCOMC__) && __WATCOMC__ >= 1250 +# if !defined (INT64_C) +# define INT64_C(x) (x + (INT64_MAX - INT64_MAX)) +# endif +# if !defined (UINT64_C) +# define UINT64_C(x) (x + (UINT64_MAX - UINT64_MAX)) +# endif +# if !defined (INT32_C) +# define INT32_C(x) (x + (INT32_MAX - INT32_MAX)) +# endif +# if !defined (UINT32_C) +# define UINT32_C(x) (x + (UINT32_MAX - UINT32_MAX)) +# endif +# if !defined (INT16_C) +# define INT16_C(x) (x) +# endif +# if !defined (UINT16_C) +# define UINT16_C(x) (x) +# endif +# if !defined (INT8_C) +# define INT8_C(x) (x) +# endif +# if !defined (UINT8_C) +# define UINT8_C(x) (x) +# endif +# if !defined (UINT64_MAX) +# define UINT64_MAX 18446744073709551615ULL +# endif +# if !defined (INT64_MAX) +# define INT64_MAX 9223372036854775807LL +# endif +# if !defined (UINT32_MAX) +# define UINT32_MAX 4294967295UL +# endif +# if !defined (INT32_MAX) +# define INT32_MAX 2147483647L +# endif +# if !defined (INTMAX_MAX) +# define INTMAX_MAX INT64_MAX +# endif +# if !defined (INTMAX_MIN) +# define INTMAX_MIN INT64_MIN +# endif +# endif +#endif + +#ifndef _PSTDINT_H_INCLUDED +#define _PSTDINT_H_INCLUDED + +#ifndef SIZE_MAX +# define SIZE_MAX (~(size_t)0) +#endif + +/* + * Deduce the type assignments from limits.h under the assumption that + * integer sizes in bits are powers of 2, and follow the ANSI + * definitions. 
+ */ + +#ifndef UINT8_MAX +# define UINT8_MAX 0xff +#endif +#ifndef uint8_t +# if (UCHAR_MAX == UINT8_MAX) || defined (S_SPLINT_S) + typedef unsigned char uint8_t; +# define UINT8_C(v) ((uint8_t) v) +# else +# error "Platform not supported" +# endif +#endif + +#ifndef INT8_MAX +# define INT8_MAX 0x7f +#endif +#ifndef INT8_MIN +# define INT8_MIN INT8_C(0x80) +#endif +#ifndef int8_t +# if (SCHAR_MAX == INT8_MAX) || defined (S_SPLINT_S) + typedef signed char int8_t; +# define INT8_C(v) ((int8_t) v) +# else +# error "Platform not supported" +# endif +#endif + +#ifndef UINT16_MAX +# define UINT16_MAX 0xffff +#endif +#ifndef uint16_t +#if (UINT_MAX == UINT16_MAX) || defined (S_SPLINT_S) + typedef unsigned int uint16_t; +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "" +# endif +# define UINT16_C(v) ((uint16_t) (v)) +#elif (USHRT_MAX == UINT16_MAX) + typedef unsigned short uint16_t; +# define UINT16_C(v) ((uint16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef INT16_MAX +# define INT16_MAX 0x7fff +#endif +#ifndef INT16_MIN +# define INT16_MIN INT16_C(0x8000) +#endif +#ifndef int16_t +#if (INT_MAX == INT16_MAX) || defined (S_SPLINT_S) + typedef signed int int16_t; +# define INT16_C(v) ((int16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "" +# endif +#elif (SHRT_MAX == INT16_MAX) + typedef signed short int16_t; +# define INT16_C(v) ((int16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef UINT32_MAX +# define UINT32_MAX (0xffffffffUL) +#endif +#ifndef uint32_t +#if (ULONG_MAX == UINT32_MAX) || defined (S_SPLINT_S) + typedef unsigned long uint32_t; +# define UINT32_C(v) v ## UL +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +#elif (UINT_MAX == UINT32_MAX) + typedef unsigned int uint32_t; +# ifndef 
PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +# define UINT32_C(v) v ## U +#elif (USHRT_MAX == UINT32_MAX) + typedef unsigned short uint32_t; +# define UINT32_C(v) ((unsigned short) (v)) +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef INT32_MAX +# define INT32_MAX (0x7fffffffL) +#endif +#ifndef INT32_MIN +# define INT32_MIN INT32_C(0x80000000) +#endif +#ifndef int32_t +#if (LONG_MAX == INT32_MAX) || defined (S_SPLINT_S) + typedef signed long int32_t; +# define INT32_C(v) v ## L +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +#elif (INT_MAX == INT32_MAX) + typedef signed int int32_t; +# define INT32_C(v) v +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#elif (SHRT_MAX == INT32_MAX) + typedef signed short int32_t; +# define INT32_C(v) ((short) (v)) +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#else +#error "Platform not supported" +#endif +#endif + +/* + * The macro stdint_int64_defined is temporarily used to record + * whether or not 64 integer support is available. It must be + * defined for any 64 integer extensions for new platforms that are + * added. 
+ */ + +#undef stdint_int64_defined +#if (defined(__STDC__) && defined(__STDC_VERSION__)) || defined (S_SPLINT_S) +# if (__STDC__ && __STDC_VERSION__ >= 199901L) || defined (S_SPLINT_S) +# define stdint_int64_defined + typedef long long int64_t; + typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# endif +#endif + +#if !defined (stdint_int64_defined) +# if defined(__GNUC__) +# define stdint_int64_defined + __extension__ typedef long long int64_t; + __extension__ typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# elif defined(__MWERKS__) || defined (__SUNPRO_C) || defined (__SUNPRO_CC) || defined (__APPLE_CC__) || defined (_LONG_LONG) || defined (_CRAYC) || defined (S_SPLINT_S) +# define stdint_int64_defined + typedef long long int64_t; + typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined (__BORLANDC__) && __BORLANDC__ > 0x460) || defined (__alpha) || defined (__DECC) +# define stdint_int64_defined + typedef __int64 int64_t; + typedef unsigned __int64 uint64_t; +# define UINT64_C(v) v ## UI64 +# define INT64_C(v) v ## I64 +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "I64" +# endif +# endif +#endif + +#if !defined (LONG_LONG_MAX) && defined (INT64_C) +# define LONG_LONG_MAX INT64_C (9223372036854775807) +#endif +#ifndef ULONG_LONG_MAX +# define ULONG_LONG_MAX UINT64_C (18446744073709551615) +#endif + +#if !defined (INT64_MAX) && defined (INT64_C) +# define INT64_MAX INT64_C (9223372036854775807) +#endif +#if !defined (INT64_MIN) && defined (INT64_C) +# 
define INT64_MIN INT64_C (-9223372036854775808) +#endif +#if !defined (UINT64_MAX) && defined (INT64_C) +# define UINT64_MAX UINT64_C (18446744073709551615) +#endif + +/* + * Width of hexadecimal for number field. + */ + +#ifndef PRINTF_INT64_HEX_WIDTH +# define PRINTF_INT64_HEX_WIDTH "16" +#endif +#ifndef PRINTF_INT32_HEX_WIDTH +# define PRINTF_INT32_HEX_WIDTH "8" +#endif +#ifndef PRINTF_INT16_HEX_WIDTH +# define PRINTF_INT16_HEX_WIDTH "4" +#endif +#ifndef PRINTF_INT8_HEX_WIDTH +# define PRINTF_INT8_HEX_WIDTH "2" +#endif + +#ifndef PRINTF_INT64_DEC_WIDTH +# define PRINTF_INT64_DEC_WIDTH "20" +#endif +#ifndef PRINTF_INT32_DEC_WIDTH +# define PRINTF_INT32_DEC_WIDTH "10" +#endif +#ifndef PRINTF_INT16_DEC_WIDTH +# define PRINTF_INT16_DEC_WIDTH "5" +#endif +#ifndef PRINTF_INT8_DEC_WIDTH +# define PRINTF_INT8_DEC_WIDTH "3" +#endif + +/* + * Ok, lets not worry about 128 bit integers for now. Moore's law says + * we don't need to worry about that until about 2040 at which point + * we'll have bigger things to worry about. 
+ */ + +#ifdef stdint_int64_defined + typedef int64_t intmax_t; + typedef uint64_t uintmax_t; +# define INTMAX_MAX INT64_MAX +# define INTMAX_MIN INT64_MIN +# define UINTMAX_MAX UINT64_MAX +# define UINTMAX_C(v) UINT64_C(v) +# define INTMAX_C(v) INT64_C(v) +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH +# endif +#else + typedef int32_t intmax_t; + typedef uint32_t uintmax_t; +# define INTMAX_MAX INT32_MAX +# define UINTMAX_MAX UINT32_MAX +# define UINTMAX_C(v) UINT32_C(v) +# define INTMAX_C(v) INT32_C(v) +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT32_MODIFIER +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT32_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT32_DEC_WIDTH +# endif +#endif + +/* + * Because this file currently only supports platforms which have + * precise powers of 2 as bit sizes for the default integers, the + * least definitions are all trivial. Its possible that a future + * version of this file could have different definitions. 
+ */ + +#ifndef stdint_least_defined + typedef int8_t int_least8_t; + typedef uint8_t uint_least8_t; + typedef int16_t int_least16_t; + typedef uint16_t uint_least16_t; + typedef int32_t int_least32_t; + typedef uint32_t uint_least32_t; +# define PRINTF_LEAST32_MODIFIER PRINTF_INT32_MODIFIER +# define PRINTF_LEAST16_MODIFIER PRINTF_INT16_MODIFIER +# define UINT_LEAST8_MAX UINT8_MAX +# define INT_LEAST8_MAX INT8_MAX +# define UINT_LEAST16_MAX UINT16_MAX +# define INT_LEAST16_MAX INT16_MAX +# define UINT_LEAST32_MAX UINT32_MAX +# define INT_LEAST32_MAX INT32_MAX +# define INT_LEAST8_MIN INT8_MIN +# define INT_LEAST16_MIN INT16_MIN +# define INT_LEAST32_MIN INT32_MIN +# ifdef stdint_int64_defined + typedef int64_t int_least64_t; + typedef uint64_t uint_least64_t; +# define PRINTF_LEAST64_MODIFIER PRINTF_INT64_MODIFIER +# define UINT_LEAST64_MAX UINT64_MAX +# define INT_LEAST64_MAX INT64_MAX +# define INT_LEAST64_MIN INT64_MIN +# endif +#endif +#undef stdint_least_defined + +/* + * The ANSI C committee pretending to know or specify anything about + * performance is the epitome of misguided arrogance. The mandate of + * this file is to *ONLY* ever support that absolute minimum + * definition of the fast integer types, for compatibility purposes. + * No extensions, and no attempt to suggest what may or may not be a + * faster integer type will ever be made in this file. Developers are + * warned to stay away from these types when using this or any other + * stdint.h. 
+ */ + +typedef int_least8_t int_fast8_t; +typedef uint_least8_t uint_fast8_t; +typedef int_least16_t int_fast16_t; +typedef uint_least16_t uint_fast16_t; +typedef int_least32_t int_fast32_t; +typedef uint_least32_t uint_fast32_t; +#define UINT_FAST8_MAX UINT_LEAST8_MAX +#define INT_FAST8_MAX INT_LEAST8_MAX +#define UINT_FAST16_MAX UINT_LEAST16_MAX +#define INT_FAST16_MAX INT_LEAST16_MAX +#define UINT_FAST32_MAX UINT_LEAST32_MAX +#define INT_FAST32_MAX INT_LEAST32_MAX +#define INT_FAST8_MIN INT_LEAST8_MIN +#define INT_FAST16_MIN INT_LEAST16_MIN +#define INT_FAST32_MIN INT_LEAST32_MIN +#ifdef stdint_int64_defined + typedef int_least64_t int_fast64_t; + typedef uint_least64_t uint_fast64_t; +# define UINT_FAST64_MAX UINT_LEAST64_MAX +# define INT_FAST64_MAX INT_LEAST64_MAX +# define INT_FAST64_MIN INT_LEAST64_MIN +#endif + +#undef stdint_int64_defined + +/* + * Whatever piecemeal, per compiler thing we can do about the wchar_t + * type limits. + */ + +#if defined(__WATCOMC__) || defined(_MSC_VER) || defined (__GNUC__) +# include +# ifndef WCHAR_MIN +# define WCHAR_MIN 0 +# endif +# ifndef WCHAR_MAX +# define WCHAR_MAX ((wchar_t)-1) +# endif +#endif + +/* + * Whatever piecemeal, per compiler/platform thing we can do about the + * (u)intptr_t types and limits. + */ + +#if defined (_MSC_VER) && defined (_UINTPTR_T_DEFINED) +# define STDINT_H_UINTPTR_T_DEFINED +#endif + +#ifndef STDINT_H_UINTPTR_T_DEFINED +# if defined (__alpha__) || defined (__ia64__) || defined (__x86_64__) || defined (_WIN64) +# define stdint_intptr_bits 64 +# elif defined (__WATCOMC__) || defined (__TURBOC__) +# if defined(__TINY__) || defined(__SMALL__) || defined(__MEDIUM__) +# define stdint_intptr_bits 16 +# else +# define stdint_intptr_bits 32 +# endif +# elif defined (__i386__) || defined (_WIN32) || defined (WIN32) +# define stdint_intptr_bits 32 +# elif defined (__INTEL_COMPILER) +/* TODO -- what did Intel do about x86-64? 
*/ +# endif + +# ifdef stdint_intptr_bits +# define stdint_intptr_glue3_i(a,b,c) a##b##c +# define stdint_intptr_glue3(a,b,c) stdint_intptr_glue3_i(a,b,c) +# ifndef PRINTF_INTPTR_MODIFIER +# define PRINTF_INTPTR_MODIFIER stdint_intptr_glue3(PRINTF_INT,stdint_intptr_bits,_MODIFIER) +# endif +# ifndef PTRDIFF_MAX +# define PTRDIFF_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) +# endif +# ifndef PTRDIFF_MIN +# define PTRDIFF_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) +# endif +# ifndef UINTPTR_MAX +# define UINTPTR_MAX stdint_intptr_glue3(UINT,stdint_intptr_bits,_MAX) +# endif +# ifndef INTPTR_MAX +# define INTPTR_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) +# endif +# ifndef INTPTR_MIN +# define INTPTR_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) +# endif +# ifndef INTPTR_C +# define INTPTR_C(x) stdint_intptr_glue3(INT,stdint_intptr_bits,_C)(x) +# endif +# ifndef UINTPTR_C +# define UINTPTR_C(x) stdint_intptr_glue3(UINT,stdint_intptr_bits,_C)(x) +# endif + typedef stdint_intptr_glue3(uint,stdint_intptr_bits,_t) uintptr_t; + typedef stdint_intptr_glue3( int,stdint_intptr_bits,_t) intptr_t; +# else +/* TODO -- This following is likely wrong for some platforms, and does + nothing for the definition of uintptr_t. */ + typedef ptrdiff_t intptr_t; +# endif +# define STDINT_H_UINTPTR_T_DEFINED +#endif + +/* + * Assumes sig_atomic_t is signed and we have a 2s complement machine. + */ + +#ifndef SIG_ATOMIC_MAX +# define SIG_ATOMIC_MAX ((((sig_atomic_t) 1) << (sizeof (sig_atomic_t)*CHAR_BIT-1)) - 1) +#endif + +#endif + +#if defined (__TEST_PSTDINT_FOR_CORRECTNESS) + +/* + * Please compile with the maximum warning settings to make sure macros are not + * defined more than once. 
+ */ + +#include +#include +#include + +#define glue3_aux(x,y,z) x ## y ## z +#define glue3(x,y,z) glue3_aux(x,y,z) + +#define DECLU(bits) glue3(uint,bits,_t) glue3(u,bits,=) glue3(UINT,bits,_C) (0); +#define DECLI(bits) glue3(int,bits,_t) glue3(i,bits,=) glue3(INT,bits,_C) (0); + +#define DECL(us,bits) glue3(DECL,us,) (bits) + +#define TESTUMAX(bits) glue3(u,bits,=) glue3(~,u,bits); if (glue3(UINT,bits,_MAX) glue3(!=,u,bits)) printf ("Something wrong with UINT%d_MAX\n", bits) + +int main () { + DECL(I,8) + DECL(U,8) + DECL(I,16) + DECL(U,16) + DECL(I,32) + DECL(U,32) +#ifdef INT64_MAX + DECL(I,64) + DECL(U,64) +#endif + intmax_t imax = INTMAX_C(0); + uintmax_t umax = UINTMAX_C(0); + char str0[256], str1[256]; + + sprintf (str0, "%d %x\n", 0, ~0); + + sprintf (str1, "%d %x\n", i8, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i8 : %s\n", str1); + sprintf (str1, "%u %x\n", u8, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with u8 : %s\n", str1); + sprintf (str1, "%d %x\n", i16, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i16 : %s\n", str1); + sprintf (str1, "%u %x\n", u16, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with u16 : %s\n", str1); + sprintf (str1, "%" PRINTF_INT32_MODIFIER "d %x\n", i32, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i32 : %s\n", str1); + sprintf (str1, "%" PRINTF_INT32_MODIFIER "u %x\n", u32, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with u32 : %s\n", str1); +#ifdef INT64_MAX + sprintf (str1, "%" PRINTF_INT64_MODIFIER "d %x\n", i64, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i64 : %s\n", str1); +#endif + sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "d %x\n", imax, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with imax : %s\n", str1); + sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "u %x\n", umax, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with umax : %s\n", str1); + + 
TESTUMAX(8); + TESTUMAX(16); + TESTUMAX(32); +#ifdef INT64_MAX + TESTUMAX(64); +#endif + + return EXIT_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/include/siphash24.c b/include/siphash24.c index 981bd31..ba8b02f 100644 --- a/include/siphash24.c +++ b/include/siphash24.c @@ -12,9 +12,11 @@ You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see . */ -#include + #include #include +#include "pstdint.h" + typedef uint64_t u64; typedef uint32_t u32; typedef uint8_t u8; From f967cea98fdd3f241a7f0989fd97ccd16836ad01 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 9 Apr 2013 11:01:51 +0100 Subject: [PATCH 23/37] Some C89 compatibility --- include/perfecthash.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/perfecthash.h b/include/perfecthash.h index b770cc8..c1b469e 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -133,7 +133,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, } int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { - int result; + int result, r, retcode; uint16_t bin, j; uint8_t binsize; uint16_t i, n = table->n, b = table->b; @@ -181,7 +181,6 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { /* Find perfect table -- try again for each choice of r */ table->m_f = m_f; table->m_g = m_g; - int r, retcode; for (r = 64; r != -1; --r) { table->r = r; retcode = _PyCustomSlots_FindDisplacements(table, hashes, binsizes, bins, p, From c41a36816ac2fc8861b6f35954c5c3768bc580a1 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 9 Apr 2013 11:32:45 +0100 Subject: [PATCH 24/37] Make sure we can draw hashes when sizeof(long) == 4 --- extensibletype/extensibletype.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index 653a9cb..d1eefc3 100644 --- 
a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -13,10 +13,16 @@ def bucket_argsort(cnp.ndarray[uint16_t, mode='c'] p, _PyCustomSlots_bucket_argsort(&p[0], &binsizes[0], &number_of_bins_by_size[0]) +def get_random_hashes(rng, nitems): + return rng.randint(-2**31, 2**31-1, size=nitems).astype(np.uint64) + def draw_hashes(rng, nitems): - hashes = rng.randint(2**32, size=nitems).astype(np.uint64) + assert sizeof(long) >= 4 + + hashes = get_random_hashes(rng, nitems) hashes <<= 32 - hashes |= rng.randint(2**32, size=nitems).astype(np.uint64) + hashes |= get_random_hashes(rng, nitems) + return hashes def perfect_hash(cnp.ndarray[uint64_t] hashes, int repeat=1): From d09f8b98a2a7ab2b4564399fa30847ee1064df74 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 9 Apr 2013 15:13:42 +0100 Subject: [PATCH 25/37] Add test for pstdint.h --- extensibletype/test/pstdint.pyx | 21 +++++++++++++++++++++ extensibletype/test/test_pstdint.py | 4 ++++ setupconfig.py | 5 +++++ 3 files changed, 30 insertions(+) create mode 100644 extensibletype/test/pstdint.pyx create mode 100644 extensibletype/test/test_pstdint.py diff --git a/extensibletype/test/pstdint.pyx b/extensibletype/test/pstdint.pyx new file mode 100644 index 0000000..ec59c80 --- /dev/null +++ b/extensibletype/test/pstdint.pyx @@ -0,0 +1,21 @@ +cdef extern from "pstdint.h": + ctypedef int int8_t + ctypedef int int16_t + ctypedef int int32_t + ctypedef int int64_t + + ctypedef int uint8_t + ctypedef int uint16_t + ctypedef int uint32_t + ctypedef int uint64_t + + ctypedef int intptr_t + ctypedef int uintptr_t + +def test_pstdint(): + assert sizeof(int8_t) == sizeof(uint8_t) == 1 + assert sizeof(int16_t) == sizeof(uint16_t) == 2 + assert sizeof(int32_t) == sizeof(uint32_t) == 4 + assert sizeof(int64_t) == sizeof(uint64_t) == 8 + + assert sizeof(intptr_t) == sizeof(uintptr_t) >= sizeof(void *) diff --git a/extensibletype/test/test_pstdint.py b/extensibletype/test/test_pstdint.py new file mode 
100644 index 0000000..63ba27e --- /dev/null +++ b/extensibletype/test/test_pstdint.py @@ -0,0 +1,4 @@ +from . import pstdint + +def test_pstdint(): + pstdint.test_pstdint() \ No newline at end of file diff --git a/setupconfig.py b/setupconfig.py index 11274af..9d1ee8c 100644 --- a/setupconfig.py +++ b/setupconfig.py @@ -54,6 +54,11 @@ def get_extensions(path_prefix, module_prefix=""): ["extensibletype/methodtable.pyx"], include_dirs=include_dirs, depends=perfecthash_deps), + + Extension("extensibletype.test.pstdint", + ["extensibletype/test/pstdint.pyx"], + include_dirs=include_dirs, + depends=["include/pstdint.h"]), ] return extensions From c14ea778574f1b90cee5f008c633e215c91fd535 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 9 Apr 2013 18:58:58 +0200 Subject: [PATCH 26/37] Fix type cast of entry id in hash table --- extensibletype/methodtable.pyx | 6 +++++- extensibletype/test/test_perfecthashing.py | 2 +- extensibletype/test/test_pstdint.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index f16e2cf..dc6963c 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -112,6 +112,10 @@ cdef class PerfectHashMethodTable(object): self.table.n - n) assert len(np.unique(hashes)) == len(hashes) + # print "-----------------------" + # print self + # print "-----------------------" + # Perfect hash our table if PyCustomSlots_PerfectHash(self.table, &hashes[0]) < 0: # TODO: sensible error messages @@ -135,7 +139,7 @@ cdef class PerfectHashMethodTable(object): assert 0 <= idx < self.size - if self.table.entries[idx].id != prehash: + if self.table.entries[idx].id != prehash: return None return ( self.table.entries[idx].ptr, diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index c8d65dc..63ccf22 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ 
-40,7 +40,7 @@ def make_ids(): def test_methodtable(): # ids = ["ff->f", "dd->d", "ii->i", "ll->l", "OO->O"] - ids = make_ids()[:31] + ids = make_ids()[:500] flags = range(1, len(ids) + 1) funcs = range(len(ids)) diff --git a/extensibletype/test/test_pstdint.py b/extensibletype/test/test_pstdint.py index 63ba27e..45edb3d 100644 --- a/extensibletype/test/test_pstdint.py +++ b/extensibletype/test/test_pstdint.py @@ -1,4 +1,4 @@ from . import pstdint def test_pstdint(): - pstdint.test_pstdint() \ No newline at end of file + pstdint.test_pstdint() From e0f55af4732b58f83b5d69ba71232a4068a74fb1 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 12 Apr 2013 16:14:03 +0100 Subject: [PATCH 27/37] Print table in hashing error message --- extensibletype/methodtable.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index dc6963c..2c579cf 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -119,7 +119,8 @@ cdef class PerfectHashMethodTable(object): # Perfect hash our table if PyCustomSlots_PerfectHash(self.table, &hashes[0]) < 0: # TODO: sensible error messages - raise HashingError("Unable to create perfect hash table") + raise HashingError( + "Unable to create perfect hash table for table: %s" % self) for i, signature in enumerate(ids): assert self.find_method(signature), (i, signature) From 17d24023fb65607d52d53a68f452081136fae824 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 12 Apr 2013 17:10:28 +0100 Subject: [PATCH 28/37] Add some error messages (TODO: use errnos or error return codes) --- include/perfecthash.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/perfecthash.h b/include/perfecthash.h index c1b469e..60b2825 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -146,8 +146,10 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { uint8_t 
number_of_bins_by_size[BIN_LIMIT]; PyCustomSlots_Entry *entries_copy = malloc(sizeof(PyCustomSlots_Entry) * n); - if (!bins || !binsizes || !p || !taken || !entries_copy) + if (!bins || !binsizes || !p || !taken || !entries_copy) { + printf("Error: Unable to allocate memory\n"); goto error; + } for (i = 0; i != n; ++i) { entries_copy[i] = table->entries[i]; @@ -166,7 +168,7 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { bin = hashes[i] & m_g; binsize = ++binsizes[bin]; if (binsize == BIN_LIMIT) { - printf("ERROR 1\n"); + printf("Error: Bin limit reached\n"); goto error; } bins[BIN_LIMIT * bin + binsize - 1] = i; @@ -191,7 +193,7 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { } if (retcode != 0) { - printf("no suitable table found\n"); + printf("Error: No suitable table found\n"); goto error; } From 316307c17ec210983c2a3045438a44ec6f1380a6 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 12 Apr 2013 19:34:29 +0100 Subject: [PATCH 29/37] Allow for larger hash tables --- extensibletype/extensibletype.pxd | 2 +- extensibletype/methodtable.pyx | 8 ++++++- include/perfecthash.h | 35 +++++++++++++++++++++++++------ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/extensibletype/extensibletype.pxd b/extensibletype/extensibletype.pxd index 6599e76..0a75e2e 100644 --- a/extensibletype/extensibletype.pxd +++ b/extensibletype/extensibletype.pxd @@ -14,7 +14,7 @@ cdef extern from "perfecthash.h": uint64_t flags uint64_t m_f, m_g PyCustomSlots_Entry *entries - uint16_t n, b + uint32_t n, b uint8_t r ctypedef struct PyCustomSlots_Table_64_64: diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index dc6963c..fbd9c2f 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -29,7 +29,7 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: cdef int nbins size = roundup(size) - nbins = size + nbins = size * 2 table = 
stdlib.calloc( 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * nbins + @@ -42,6 +42,8 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: table.b = nbins table.flags = 0 + assert table.b >= table.n, (table.b, table.n, nbins) + table.entries = (( &table[1]) + table.b * sizeof(uint16_t)) @@ -108,14 +110,18 @@ cdef class PerfectHashMethodTable(object): hashes[i] = id self.id_to_signature[id] = signature + hashes[n:self.table.n] = extensibletype.draw_hashes(np.random, self.table.n - n) + # print "n", n, "table.n", self.table.n, "table.b", self.table.b assert len(np.unique(hashes)) == len(hashes) # print "-----------------------" # print self # print "-----------------------" + assert self.table.b >= self.table.n, (self.table.b, self.table.n) + # Perfect hash our table if PyCustomSlots_PerfectHash(self.table, &hashes[0]) < 0: # TODO: sensible error messages diff --git a/include/perfecthash.h b/include/perfecthash.h index c1b469e..2eba727 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -10,7 +10,7 @@ typedef struct { uint64_t flags; uint64_t m_f, m_g; PyCustomSlots_Entry *entries; - uint16_t n, b; + uint32_t n, b; uint8_t r; uint8_t reserved; @@ -40,9 +40,9 @@ uint64_t PyCustomSlots_roundup_2pow(uint64_t x) { #define BIN_LIMIT 8 - + void _PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes, - uint8_t *number_of_bins_by_size) { + uint32_t *number_of_bins_by_size) { uint16_t *sort_bins[BIN_LIMIT]; int binsize, ibin, nbins; nbins = 0; @@ -136,18 +136,26 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { int result, r, retcode; uint16_t bin, j; uint8_t binsize; - uint16_t i, n = table->n, b = table->b; + uint32_t i, n = table->n, b = table->b; uint64_t m_f = PyCustomSlots_roundup_2pow(table->n) - 1; uint64_t m_g = (b - 1) & 0xffff; uint16_t *bins = malloc(sizeof(uint16_t) * b * BIN_LIMIT); uint8_t *binsizes = malloc(sizeof(uint8_t) * b); uint16_t *p = malloc(sizeof(uint16_t) * b); uint8_t 
*taken = malloc(sizeof(uint8_t) * n); - uint8_t number_of_bins_by_size[BIN_LIMIT]; + uint32_t number_of_bins_by_size[BIN_LIMIT]; PyCustomSlots_Entry *entries_copy = malloc(sizeof(PyCustomSlots_Entry) * n); - if (!bins || !binsizes || !p || !taken || !entries_copy) + + if (b <= 0) { + printf("Error: Invalid number of bins, %d %d\n", b, table->b); + abort(); + } + + if (!bins || !binsizes || !p || !taken || !entries_copy) { + printf("Error: Unable to allocate memory\n"); goto error; + } for (i = 0; i != n; ++i) { entries_copy[i] = table->entries[i]; @@ -164,6 +172,8 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { } for (i = 0; i != n; ++i) { bin = hashes[i] & m_g; + if (bin > b) + abort(); binsize = ++binsizes[bin]; if (binsize == BIN_LIMIT) { printf("ERROR 1\n"); @@ -177,6 +187,19 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { /* argsort the bins (p stores permutation) from largest to smallest, using binsort */ _PyCustomSlots_bucket_argsort(p, binsizes, &number_of_bins_by_size[0]); + /* + for (i = 0; i < BIN_LIMIT; i++) { + printf("bin_by_size[%d] = %d\n", i, number_of_bins_by_size[i]); + } + */ + + /* Sanity check */ + for (i = 0; i < b; ++i) { + if (!(p[i] >= 0 && p[i] < b)) { + printf("ERROR: p[%d]=%d\n", i, p[i]); + abort(); + } + } /* Find perfect table -- try again for each choice of r */ table->m_f = m_f; From 4aa139d88a7fab242050420f4e52a2a55d2fc701 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 12 Apr 2013 22:23:23 +0100 Subject: [PATCH 30/37] Update bucketsort test --- extensibletype/extensibletype.pyx | 4 ++-- extensibletype/test/test_perfecthashing.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index d1eefc3..3ec76a4 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -5,11 +5,11 @@ import hashlib cdef extern from "perfecthash.h": void 
_PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes, - uint8_t *number_of_bins_by_size) + uint32_t *number_of_bins_by_size) def bucket_argsort(cnp.ndarray[uint16_t, mode='c'] p, cnp.ndarray[uint8_t, mode='c'] binsizes, - cnp.ndarray[uint8_t, mode='c'] number_of_bins_by_size): + cnp.ndarray[uint32_t, mode='c'] number_of_bins_by_size): _PyCustomSlots_bucket_argsort(&p[0], &binsizes[0], &number_of_bins_by_size[0]) diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index 63ccf22..28e3906 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -10,7 +10,7 @@ def test_binsort(): p = np.zeros(nbins, dtype=np.uint16) binsizes = np.random.randint(0, 7, size=nbins).astype(np.uint8) num_by_size = np.zeros(8, dtype=np.uint8) - x = np.bincount(binsizes).astype(np.uint8) + x = np.bincount(binsizes).astype(np.uint32) num_by_size[:x.shape[0]] = x extensibletype.bucket_argsort(p, binsizes, num_by_size) assert np.all(sorted(binsizes) == binsizes[p][::-1]) From 9c04c4cdc80e0fd25412b3c37c4552c612102d93 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 12 Apr 2013 22:23:47 +0100 Subject: [PATCH 31/37] Add more thorough intern test --- extensibletype/test/test_interning.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py index 9395778..3e8815b 100644 --- a/extensibletype/test/test_interning.py +++ b/extensibletype/test/test_interning.py @@ -23,3 +23,11 @@ def test_interning(): id3 = intern.global_intern("hallo") assert id1 == id2 assert id1 != id3 + +def test_intern_many(): + table = intern.InternTable() + + for i in range(1000000): + table.global_intern("my randrom string %d" % i) + table.global_intern("my randrom string %d" % (i // 2)) + table.global_intern("my randrom string %d" % (i // 4)) From cd2c8de6a27ad87559d30dfaf39390f8e9cdfa71 Mon Sep 17 00:00:00 2001 From: Mark Florisson 
Date: Sun, 14 Apr 2013 23:04:43 +0100 Subject: [PATCH 32/37] Add better test for method table --- extensibletype/extensibletype.pxd | 2 +- extensibletype/extensibletype.pyx | 4 +-- extensibletype/methodtable.pyx | 10 ++++--- extensibletype/test/test_interning.py | 6 ++-- extensibletype/test/test_perfecthashing.py | 35 ++++++++++++++++------ include/interning.h | 1 + include/perfecthash.h | 26 +++++++--------- 7 files changed, 50 insertions(+), 34 deletions(-) diff --git a/extensibletype/extensibletype.pxd b/extensibletype/extensibletype.pxd index 0a75e2e..6599e76 100644 --- a/extensibletype/extensibletype.pxd +++ b/extensibletype/extensibletype.pxd @@ -14,7 +14,7 @@ cdef extern from "perfecthash.h": uint64_t flags uint64_t m_f, m_g PyCustomSlots_Entry *entries - uint32_t n, b + uint16_t n, b uint8_t r ctypedef struct PyCustomSlots_Table_64_64: diff --git a/extensibletype/extensibletype.pyx b/extensibletype/extensibletype.pyx index 3ec76a4..268fdc9 100644 --- a/extensibletype/extensibletype.pyx +++ b/extensibletype/extensibletype.pyx @@ -5,11 +5,11 @@ import hashlib cdef extern from "perfecthash.h": void _PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes, - uint32_t *number_of_bins_by_size) + uint16_t *number_of_bins_by_size) def bucket_argsort(cnp.ndarray[uint16_t, mode='c'] p, cnp.ndarray[uint8_t, mode='c'] binsizes, - cnp.ndarray[uint32_t, mode='c'] number_of_bins_by_size): + cnp.ndarray[uint16_t, mode='c'] number_of_bins_by_size): _PyCustomSlots_bucket_argsort(&p[0], &binsizes[0], &number_of_bins_by_size[0]) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index fbd9c2f..eed3459 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -26,10 +26,11 @@ class HashingError(Exception): cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: cdef PyCustomSlots_Table *table - cdef int nbins + cdef uint16_t nbins size = roundup(size) - nbins = size * 2 + assert size * 4 <= 0xFFFF, 
hex(size) + nbins = size * 4 table = stdlib.calloc( 1, sizeof(PyCustomSlots_Table) + sizeof(uint16_t) * nbins + @@ -44,8 +45,9 @@ cdef PyCustomSlots_Table *allocate_hash_table(uint16_t size) except NULL: assert table.b >= table.n, (table.b, table.n, nbins) - table.entries = (( &table[1]) + - table.b * sizeof(uint16_t)) + table.entries = ( + ( table) + sizeof(PyCustomSlots_Table) + + nbins * sizeof(uint16_t)) return table diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py index 3e8815b..109bf70 100644 --- a/extensibletype/test/test_interning.py +++ b/extensibletype/test/test_interning.py @@ -28,6 +28,6 @@ def test_intern_many(): table = intern.InternTable() for i in range(1000000): - table.global_intern("my randrom string %d" % i) - table.global_intern("my randrom string %d" % (i // 2)) - table.global_intern("my randrom string %d" % (i // 4)) + table.intern("my randrom string %d" % i) + table.intern("my randrom string %d" % (i // 2)) + table.intern("my randrom string %d" % (i // 4)) diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index 28e3906..690fb8c 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -1,3 +1,4 @@ +import time import itertools from nose.tools import eq_, ok_ @@ -7,10 +8,13 @@ def test_binsort(): nbins = 64 + p = np.zeros(nbins, dtype=np.uint16) binsizes = np.random.randint(0, 7, size=nbins).astype(np.uint8) - num_by_size = np.zeros(8, dtype=np.uint8) - x = np.bincount(binsizes).astype(np.uint32) + + num_by_size = np.zeros(8, dtype=np.uint16) + x = np.bincount(binsizes).astype(np.uint16) + num_by_size[:x.shape[0]] = x extensibletype.bucket_argsort(p, binsizes, num_by_size) assert np.all(sorted(binsizes) == binsizes[p][::-1]) @@ -35,15 +39,10 @@ def make_signature(type_permutation): def make_ids(): types = ['f', 'd', 'i', 'l', 'O'] - power = 6 + power = 5 return map(make_signature, 
itertools.product(*(types,) * power)) -def test_methodtable(): - # ids = ["ff->f", "dd->d", "ii->i", "ll->l", "OO->O"] - ids = make_ids()[:500] - flags = range(1, len(ids) + 1) - funcs = range(len(ids)) - +def build_and_verify_methodtable(ids, flags, funcs): table = methodtable.PerfectHashMethodTable(methodtable.Hasher()) table.generate_table(len(ids), ids, flags, funcs) @@ -54,3 +53,21 @@ def test_methodtable(): got_func, got_flag = result assert func == got_func, (func, got_func) # assert flag == got_flag, (flag, got_flag) + +def test_methodtable(): + # ids = ["ff->f", "dd->d", "ii->i", "ll->l", "OO->O"] + + ids = make_ids() + flags = range(1, len(ids) + 1) + funcs = range(len(ids)) + + step = 100 + + i = len(ids) + for i in range(1, len(ids), step): + t = time.time() + build_and_verify_methodtable(ids[:i], flags[:i], funcs[:i]) + t = time.time() - t + print i, "table building took", t, "seconds." + +test_methodtable() \ No newline at end of file diff --git a/include/interning.h b/include/interning.h index 3557ed4..9896b5a 100644 --- a/include/interning.h +++ b/include/interning.h @@ -204,6 +204,7 @@ _intern_key(intern_table_t *table, PyObject *key_obj, const char *key) if (value == NULL) { /* Key not in dict */ + Py_INCREF(key_obj); if (make_prehash(table, key_obj, key, &prehash) < 0) goto bad; } else { diff --git a/include/perfecthash.h b/include/perfecthash.h index 2eba727..9b51a8d 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -10,7 +10,7 @@ typedef struct { uint64_t flags; uint64_t m_f, m_g; PyCustomSlots_Entry *entries; - uint32_t n, b; + uint16_t n, b; uint8_t r; uint8_t reserved; @@ -42,9 +42,10 @@ uint64_t PyCustomSlots_roundup_2pow(uint64_t x) { void _PyCustomSlots_bucket_argsort(uint16_t *p, uint8_t *binsizes, - uint32_t *number_of_bins_by_size) { + uint16_t *number_of_bins_by_size) { uint16_t *sort_bins[BIN_LIMIT]; - int binsize, ibin, nbins; + int binsize; + uint32_t ibin, nbins; nbins = 0; /* We know how many bins there are of 
each size, so place pointers for each size along on the output array p */ @@ -72,7 +73,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, uint16_t nbins = table->b; uint64_t m_f = table->m_f; uint8_t r = table->r; - int i, j, bin; + uint16_t i, j, bin; /* Step 1: Validate that f is 1:1 in each bin */ for (j = 0; j != nbins; ++j) { @@ -93,6 +94,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, for (i = 0; i != nbins; ++i) { taken[i] = 0; } + for (j = 0; j != nbins; ++j) { uint16_t dval; bin = p[j]; @@ -134,24 +136,18 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { int result, r, retcode; - uint16_t bin, j; + uint32_t bin, j; uint8_t binsize; - uint32_t i, n = table->n, b = table->b; + uint16_t i, n = table->n, b = table->b; uint64_t m_f = PyCustomSlots_roundup_2pow(table->n) - 1; uint64_t m_g = (b - 1) & 0xffff; uint16_t *bins = malloc(sizeof(uint16_t) * b * BIN_LIMIT); uint8_t *binsizes = malloc(sizeof(uint8_t) * b); uint16_t *p = malloc(sizeof(uint16_t) * b); - uint8_t *taken = malloc(sizeof(uint8_t) * n); - uint32_t number_of_bins_by_size[BIN_LIMIT]; + uint8_t *taken = malloc(sizeof(uint8_t) * b); + uint16_t number_of_bins_by_size[BIN_LIMIT]; PyCustomSlots_Entry *entries_copy = malloc(sizeof(PyCustomSlots_Entry) * n); - - if (b <= 0) { - printf("Error: Invalid number of bins, %d %d\n", b, table->b); - abort(); - } - if (!bins || !binsizes || !p || !taken || !entries_copy) { printf("Error: Unable to allocate memory\n"); goto error; @@ -195,7 +191,7 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { /* Sanity check */ for (i = 0; i < b; ++i) { - if (!(p[i] >= 0 && p[i] < b)) { + if (!(p[i] < b)) { printf("ERROR: p[%d]=%d\n", i, p[i]); abort(); } From c1ec5db323f2a0009bd5cda7f1326b74e4e0c699 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 15 Apr 2013 08:10:38 -0500 Subject: [PATCH 33/37] 
Disable global intern exception test --- extensibletype/test/test_interning.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py index 109bf70..f76ffa3 100644 --- a/extensibletype/test/test_interning.py +++ b/extensibletype/test/test_interning.py @@ -1,12 +1,13 @@ from .. import intern def test_global_interning(): - try: - intern.global_intern("hello") - except AssertionError as e: - pass - else: - raise Exception("Expects complaint about uninitialized table") + # Can't really test for this with nose... + # try: + # intern.global_intern("hello") + # except AssertionError as e: + # pass + # else: + # raise Exception("Expects complaint about uninitialized table") intern.global_intern_initialize() id1 = intern.global_intern("hello") From 63a9b0b3552dda8644a9cd821e030bec4f9a9cba Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 15 Apr 2013 08:10:54 -0500 Subject: [PATCH 34/37] Verify ids in interning test --- extensibletype/test/test_interning.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py index f76ffa3..24affce 100644 --- a/extensibletype/test/test_interning.py +++ b/extensibletype/test/test_interning.py @@ -28,7 +28,15 @@ def test_interning(): def test_intern_many(): table = intern.InternTable() + itoid = {} for i in range(1000000): - table.intern("my randrom string %d" % i) - table.intern("my randrom string %d" % (i // 2)) - table.intern("my randrom string %d" % (i // 4)) + id = table.intern("my randrom string %d" % i) + itoid[i] = id + + id1 = table.intern("my randrom string %d" % (i // 2)) + id2 = table.intern("my randrom string %d" % (i // 4)) + + assert id1 == itoid[i//2] + assert id2 == itoid[i//4] + +test_intern_many() \ No newline at end of file From 18b192b1066a0a6591bdfa2f5d20ddfb3bba398f Mon Sep 17 00:00:00 2001 From: Mark 
Florisson Date: Mon, 15 Apr 2013 14:33:31 +0100 Subject: [PATCH 35/37] Guard some module-level test calls --- extensibletype/test/test_interning.py | 3 ++- extensibletype/test/test_perfecthashing.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/extensibletype/test/test_interning.py b/extensibletype/test/test_interning.py index 24affce..8968193 100644 --- a/extensibletype/test/test_interning.py +++ b/extensibletype/test/test_interning.py @@ -39,4 +39,5 @@ def test_intern_many(): assert id1 == itoid[i//2] assert id2 == itoid[i//4] -test_intern_many() \ No newline at end of file +if __name__ == '__main__': + test_intern_many() \ No newline at end of file diff --git a/extensibletype/test/test_perfecthashing.py b/extensibletype/test/test_perfecthashing.py index 690fb8c..2dc4499 100644 --- a/extensibletype/test/test_perfecthashing.py +++ b/extensibletype/test/test_perfecthashing.py @@ -70,4 +70,5 @@ def test_methodtable(): t = time.time() - t print i, "table building took", t, "seconds." 
-test_methodtable() \ No newline at end of file +if __name__ == '__main__': + test_methodtable() \ No newline at end of file From 0c17bbc2132b3e7c82dd034742d26ec324db490e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 15 Apr 2013 17:06:24 +0100 Subject: [PATCH 36/37] Temporarily switch to deterministic interning keys --- extensibletype/methodtable.pyx | 1 + include/interning.h | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index eed3459..0d3f9d7 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -142,6 +142,7 @@ cdef class PerfectHashMethodTable(object): """ cdef uint64_t prehash = intern.global_intern(make_bytes(signature)) + assert 0 <= self.displacements[prehash & self.table.m_g] < self.table.b cdef uint64_t idx = (((prehash >> self.table.r) & self.table.m_f) ^ self.displacements[prehash & self.table.m_g]) diff --git a/include/interning.h b/include/interning.h index 9896b5a..e7dd656 100644 --- a/include/interning.h +++ b/include/interning.h @@ -75,6 +75,11 @@ intern_create_table(intern_table_t *table) table->secrets[i + 0] = ((char *) &randval)[0]; table->secrets[i + 1] = ((char *) &randval)[1]; } + /* Amend this!
*/ + memset(&table->secrets[16*0], 0, 16); + memset(&table->secrets[16*1], 1, 16); + memset(&table->secrets[16*2], 2, 16); + memset(&table->secrets[16*3], 3, 16); /* _print_secrets(table); */ From cd3a4f542fbf8b452d619d03b431718e9a1059ee Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 15 Apr 2013 19:00:22 +0100 Subject: [PATCH 37/37] Make sure displacements xor inbounds --- extensibletype/methodtable.pyx | 4 ++-- include/perfecthash.h | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/extensibletype/methodtable.pyx b/extensibletype/methodtable.pyx index 0d3f9d7..921f145 100644 --- a/extensibletype/methodtable.pyx +++ b/extensibletype/methodtable.pyx @@ -130,7 +130,7 @@ cdef class PerfectHashMethodTable(object): raise HashingError("Unable to create perfect hash table") for i, signature in enumerate(ids): - assert self.find_method(signature), (i, signature) + assert self.find_method(signature) is not None, (i, signature) # For debugging self.signatures = ids @@ -146,7 +146,7 @@ cdef class PerfectHashMethodTable(object): cdef uint64_t idx = (((prehash >> self.table.r) & self.table.m_f) ^ self.displacements[prehash & self.table.m_g]) - assert 0 <= idx < self.size + assert 0 <= idx < self.size, (idx, self.size) if self.table.entries[idx].id != prehash: return None diff --git a/include/perfecthash.h b/include/perfecthash.h index 9b51a8d..b180da9 100644 --- a/include/perfecthash.h +++ b/include/perfecthash.h @@ -71,6 +71,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, PyCustomSlots_Entry *entries_copy) { uint16_t *d = (void*)((char*)table + sizeof(PyCustomSlots_Table)); uint16_t nbins = table->b; + uint16_t n = table->n; uint64_t m_f = table->m_f; uint8_t r = table->r; uint16_t i, j, bin; @@ -91,7 +92,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, /* Step 2: Attempt to assign displacements d[bin], starting with the largest bin */ - for (i = 0; i != nbins; ++i) { + for (i = 0; i != n; ++i) { 
taken[i] = 0; } @@ -101,7 +102,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, if (binsizes[bin] == 0) { d[bin] = 0; } else { - for (dval = 0; dval != nbins; ++dval) { + for (dval = 0; dval != n; ++dval) { int k; int collides = 0; for (k = 0; k != binsizes[bin]; ++k) { @@ -114,7 +115,7 @@ int _PyCustomSlots_FindDisplacements(PyCustomSlots_Table *table, } if (!collides) break; } - if (dval == nbins) { + if (dval == n) { /* no appropriate dval found */ return -1; } else { @@ -144,7 +145,7 @@ int PyCustomSlots_PerfectHash(PyCustomSlots_Table *table, uint64_t *hashes) { uint16_t *bins = malloc(sizeof(uint16_t) * b * BIN_LIMIT); uint8_t *binsizes = malloc(sizeof(uint8_t) * b); uint16_t *p = malloc(sizeof(uint16_t) * b); - uint8_t *taken = malloc(sizeof(uint8_t) * b); + uint8_t *taken = malloc(sizeof(uint8_t) * n); uint16_t number_of_bins_by_size[BIN_LIMIT]; PyCustomSlots_Entry *entries_copy = malloc(sizeof(PyCustomSlots_Entry) * n);