From b068efa5821ba43e3ef7d723b116c8cff4b59045 Mon Sep 17 00:00:00 2001 From: Michael Poole Date: Wed, 6 Oct 2004 02:28:07 +0000 Subject: [PATCH] Replace old hash function with one based on CRC-32. git-svn-id: file:///home/klmitch/undernet-ircu/undernet-ircu-svn/ircu2/trunk@1236 c9e4aea6-c8fd-4c43-8297-357d70d61c8c --- ChangeLog | 6 ++ ircd/hash.c | 246 ++++++++-------------------------------------------- 2 files changed, 44 insertions(+), 208 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3d65126..24bf3e7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2004-10-05 Michael Poole + + * ircd/hash.c: Replace the old hash function with one based on + randomized CRC-32. The new one avoids a big table from the old + function. + 2004-10-05 Michael Poole * ircd/random.c: Convert to use ircd_md5 interface and hopefully diff --git a/ircd/hash.c b/ircd/hash.c index 94ff51d..7d33502 100644 --- a/ircd/hash.c +++ b/ircd/hash.c @@ -28,6 +28,7 @@ #include "ircd_string.h" #include "ircd.h" #include "msg.h" +#include "random.h" #include "send.h" #include "struct.h" #include "sys.h" @@ -38,241 +39,70 @@ #include -/************************* Nemesi's hash alghoritm ***********************/ /** @file * @brief Hash table management. * @version $Id$ * - * This hash function returns *exactly* N%HASHSIZE, where 'N' - * is the string itself (included the trailing '\\0') seen as - * a baseHASHSHIFT number whose "digits" are the bytes of the - * number mapped through a "weight" transformation that gives - * the same "weight" to caseless-equal chars, example: - * - * Hashing the string "Nick\0" the result will be: - * N i c k \\0 - * | | | | `---> ( (hash_weight('\\0') * (HASHSHIFT**0) + - * | | | `------> (hash_weight('k') * (HASHSHIFT**1) + - * | | `---------> (hash_weight('c') * (HASHSHIFT**2) + - * | `------------> (hash_weight('i') * (HASHSHIFT**3) + - * `---------------> (hash_weight('N') * (HASHSHIFT**4) ) % HASHSIZE - * - * It's actually a lot similar to a base transformation of the - * text representation of an integer. - * Looking at it this way seems slow and requiring unlimited integer - * precision, but we actually do it with a *very* fast loop, using only - * short integer arithmetic and by means of two memory accesses and - * 3 additions per each byte processed.. and nothing else, as a side - * note the distribution of real nicks over the hash table of this - * function is about 3 times better than the previous one, and the - * hash function itself is about 25% faster with a "normal" HASHSIZE - * (it gets slower with larger ones and faster for smallest ones - * because the hash table size affect the size of some maps and thus - * the effectiveness of RAM caches while accesing them). - * These two pages of macros are here to make the following code - * _more_ understandeable... I hope ;) - * - * If you ask me, this whole mess is ungodly complicated for very - * little benefit. -Entrope + * This file used to use some very complicated hash function. Now it + * uses CRC-32, but effectively remaps each input byte according to a + * table initialized at startup. */ -/** Internal stuff, think well before changing this, it's how - much the weights of two lexicograhically contiguous chars - differ, i.e.\ (hash_weight('b')-hash_weight('a')) == HASHSTEP. - One seems to be fine but the algorithm doesn't depend on it. */ -#define HASHSTEP 1 - -/** The smallest _prime_ int beeing HASHSTEP times bigger than a byte. - That is, the first prime bigger than the maximum hash_weight - (since the maximum hash weight is gonna be the "biggest-byte * HASHSTEP") - */ -#define HASHSHIFT 257 - -/* Are we sure that HASHSHIFT is big enough ? */ -#if !(HASHSHIFT > (HASHSTEP*(CHAR_MAX-CHAR_MIN))) -#error "No no, I cannot, please make HASHSHIFT a bigger prime !" -#endif - -/* Now HASHSIZE doesn't need to be a prime, but we really don't want it - to be an exact multiple of HASHSHIFT, that would make the distribution - a LOT worse, once is not multiple of HASHSHIFT it can be anything */ -#if ((HASHSIZE%HASHSHIFT)==0) -#error "Please set HASHSIZE to something not multiple of HASHSHIFT" -#endif - -/* What type of integer do we need in our computations ? the largest - value we need to work on is (HASHSIZE+HASHSHIFT+1), for memory - operations we want to keep the tables compact (the cache will work - better and we will run faster) while for work variables we prefer - to roundup to 'int' if it is the case: on platforms where int!=short - int arithmetic is often faster than short arithmetic, we prefer signed - types if they are big enough since on some architectures they are faster - than unsigned, but we always keep signedness of mem and regs the same, - to avoid sign conversions that sometimes require time, the following - precompile stuff will set HASHMEMS to an appropriate integer type for - the tables stored in memory and HASHREGS to an appropriate int type - for the work registers/variables/return types. Everything of type - HASH???S will remain internal to this source file so I placed this stuff - here and not in the header file. */ - -#undef HASHMEMS -#undef HASHREGS - -#if ((!defined(HASHMEMS)) && (HASHSIZE < (SHRT_MAX-HASHSHIFT))) -#define HASHMEMS short -#define HASHREGS int -#endif - -#if ((!defined(HASHMEMS)) && (HASHSIZE < (USHRT_MAX-HASHSHIFT))) -#define HASHMEMS unsigned short -#define HASHREGS unsigned int -#endif - -#if ((!defined(HASHMEMS)) && (HASHSIZE < (INT_MAX-HASHSHIFT))) -#define HASHMEMS int -#define HASHREGS int -#endif - -#if ((!defined(HASHMEMS)) && (HASHSIZE < (UINT_MAX-HASHSHIFT))) -#define HASHMEMS unsigned int -#define HASHREGS unsigned int -#endif - -#if ((!defined(HASHMEMS)) && (HASHSIZE < (LONG_MAX-HASHSHIFT))) -#define HASHMEMS long -#define HASHREGS long -#endif - -#if ((!defined(HASHMEMS)) && (HASHSIZE < (ULONG_MAX-HASHSHIFT))) -#define HASHMEMS unsigned long -#define HASHREGS unsigned long -#endif - -#if (!defined(HASHMEMS)) -#error "Uh oh... I have a problem, do you want a 16GB hash table ? !" -#endif - -/* Now we are sure that HASHMEMS and HASHREGS can contain the following */ -/** Size of #hash_map array. */ -#define HASHMAPSIZE (HASHSIZE+HASHSHIFT+1) - -/* Static memory structures */ - -/** We need a first function that, given an integer h between 1 and - HASHSIZE+HASHSHIFT, returns ( (h * HASHSHIFT) % HASHSIZE ) ). - We'll map this function in this table. */ -static HASHMEMS hash_map[HASHMAPSIZE]; - -/** Then we need a second function that "maps" a char to its weight. - Changed to a table this one too, with this macro we can use a char - as index and not care if it is signed or not, no.. this will not - cause an addition to take place at each access, trust me, the - optimizer takes it out of the actual code and passes "label+shift" - to the linker, and the linker does the addition :) */ -static HASHMEMS hash_weight_table[CHAR_MAX - CHAR_MIN + 1]; -/** Helper macro to look characters up in #hash_weight_table. */ -#define hash_weight(ch) hash_weight_table[ch-CHAR_MIN] - -/* The actual hash tables, both MUST be of the same HASHSIZE, variable - size tables could be supported but the rehash routine should also - rebuild the transformation maps, I kept the tables of equal size - so that I can use one hash function and one transformation map */ /** Hash table for clients. */ static struct Client *clientTable[HASHSIZE]; /** Hash table for channels. */ static struct Channel *channelTable[HASHSIZE]; +/** CRC-32 update table. */ +static uint32_t crc32hash[256]; -/* This is what the hash function will consider "equal" chars, this function - MUST be transitive, if HASHEQ(y,x)&&HASHEQ(y,z) then HASHEQ(y,z), and MUST - be symmetric, if HASHEQ(a,b) then HASHEQ(b,a), obvious ok but... :) */ -/** Helper macro for character comparison. */ -#define HASHEQ(x,y) ((ToLower(x)) == (ToLower(y))) - -/** Initialize the maps used by hash functions and clear the tables. */ +/** Initialize the map used by the hash function. */ void init_hash(void) { - int i; - int j; - unsigned long l; - unsigned long m; + unsigned int ii, jj, rand, poly; - /* Clear the hash tables first */ - for (l = 0; l < HASHSIZE; l++) + /* First calculate a normal CRC-32 table. */ + for (ii = 0, poly = 0xedb88320; ii < 256; ii++) { - channelTable[l] = 0; - clientTable[l] = 0; - } - - /* Here is to what we "map" a char before working on it */ - for (i = CHAR_MIN; i <= CHAR_MAX; i++) - hash_weight(i) = (HASHMEMS) (HASHSTEP * ((unsigned char)i)); - - /* Make them equal for case-independently equal chars, it's - lame to do it this way but I wanted the code flexible, it's - possible to change the HASHEQ macro and not touch here. - I don't actually care about the 32768 loops since it happens - only once at startup */ - for (i = CHAR_MIN; i <= CHAR_MAX; i++) { - for (j = CHAR_MIN; j < i; j++) { - if (HASHEQ(i, j)) - hash_weight(i) = hash_weight(j); - } + rand = ii; + for (jj = 0; jj < 8; jj++) + rand = (rand & 1) ? poly ^ (rand >> 1) : rand >> 1; + crc32hash[ii] = rand; } - /* And this is our hash-loop "transformation" function, - basically it will be hash_map[x] == ((x*HASHSHIFT)%HASHSIZE) - defined for 0<=x<=(HASHSIZE+HASHSHIFT) */ - for (m = 0; m < (unsigned long)HASHMAPSIZE; m++) + /* Now reorder the hash table. To make it case-insensitive, skip + * upper-case letters, and have lower-case letters write to the + * corresponding upper-case character. + */ + for (ii = 0, rand = 0; ii < 256; ii++) { - l = m; - l *= (unsigned long)HASHSHIFT; - l %= (unsigned long)HASHSIZE; - hash_map[m] = (HASHMEMS) l; + char ch = ii + CHAR_MIN; + if (ch != ToLower(ch)) + continue; + if (!rand) + rand = ircrandom(); + poly = ii + rand % (256 - ii); + jj = crc32hash[ii]; + crc32hash[ToUpper(ch) - CHAR_MIN] = crc32hash[ii] = crc32hash[poly]; + crc32hash[poly] = jj; + rand >>= 8; } } -/* These are the actual hash functions, since they are static - and very short any decent compiler at a good optimization level - WILL inline these in the following functions */ +/** Output type of hash function. */ +typedef unsigned int HASHREGS; -/* This is the string hash function, - WARNING: n must be a valid pointer to a _non-null_ string - this means that not only strhash(NULL) but also - strhash("") _will_ coredump, it's responsibility - the caller to eventually check BadPtr(nick). */ - -/** Calculate hash value for a string. */ +/** Calculate hash value for a string. + * @param[in] n String to hash. + * @return Hash value for string. + */ static HASHREGS strhash(const char *n) { - HASHREGS hash = hash_weight(*n++); + HASHREGS hash = crc32hash[*n++ & 255]; while (*n) - hash = hash_map[hash] + hash_weight(*n++); - return hash_map[hash]; + hash = (hash >> 8) ^ crc32hash[(hash ^ *n++) & 255]; + return hash % HASHSIZE; } -/* And this is the string hash function for limited lenght strings - WARNING: n must be a valid pointer to a non-null string - and i must be > 0 ! */ - -/* REMOVED - - The time taken to decrement i makes the function - slower than strhash for the average of channel names (tested - on 16000 real channel names, 1000 loops. I left the code here - as a bookmark if a strnhash is evetually needed in the future. - - static HASHREGS strnhash(const char *n, int i) { - HASHREGS hash = hash_weight(*n++); - i--; - while(*n && i--) - hash = hash_map[hash] + hash_weight(*n++); - return hash_map[hash]; - } - - #define CHANHASHLEN 30 - - !REMOVED */ - /************************** Externally visible functions ********************/ /* Optimization note: in these functions I supposed that the CSE optimization -- 2.20.1