From b068efa5821ba43e3ef7d723b116c8cff4b59045 Mon Sep 17 00:00:00 2001
From: Michael Poole <mdpoole@troilus.org>
Date: Wed, 6 Oct 2004 02:28:07 +0000
Subject: [PATCH] Replace old hash function with one based on CRC-32.

git-svn-id: file:///home/klmitch/undernet-ircu/undernet-ircu-svn/ircu2/trunk@1236 c9e4aea6-c8fd-4c43-8297-357d70d61c8c
---
 ChangeLog   |   6 ++
 ircd/hash.c | 246 ++++++++--------------------------------------------
 2 files changed, 44 insertions(+), 208 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 3d65126..24bf3e7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2004-10-05  Michael Poole <mdpoole@troilus.org>
+
+	* ircd/hash.c: Replace the old hash function with one based on
+	randomized CRC-32.  The new one avoids a big table from the old
+	function.
+
 2004-10-05  Michael Poole <mdpoole@troilus.org>
 
 	* ircd/random.c: Convert to use ircd_md5 interface and hopefully
diff --git a/ircd/hash.c b/ircd/hash.c
index 94ff51d..7d33502 100644
--- a/ircd/hash.c
+++ b/ircd/hash.c
@@ -28,6 +28,7 @@
 #include "ircd_string.h"
 #include "ircd.h"
 #include "msg.h"
+#include "random.h"
 #include "send.h"
 #include "struct.h"
 #include "sys.h"
@@ -38,241 +39,70 @@
 #include <string.h>
 
 
-/************************* Nemesi's hash alghoritm ***********************/
 /** @file
  * @brief Hash table management.
  * @version $Id$
  *
- * This hash function returns *exactly* N%HASHSIZE, where 'N'
- * is the string itself (included the trailing '\\0') seen as
- * a baseHASHSHIFT number whose "digits" are the bytes of the
- * number mapped through a "weight" transformation that gives
- * the same "weight" to caseless-equal chars, example:
- *
- * Hashing the string "Nick\0" the result will be:
- * N  i  c  k \\0
- * |  |  |  |  `--->  ( (hash_weight('\\0') * (HASHSHIFT**0) +
- * |  |  |  `------>    (hash_weight('k')  * (HASHSHIFT**1) +
- * |  |  `--------->    (hash_weight('c')  * (HASHSHIFT**2) +
- * |  `------------>    (hash_weight('i')  * (HASHSHIFT**3) +
- * `--------------->    (hash_weight('N')  * (HASHSHIFT**4)   ) % HASHSIZE
- *
- * It's actually a lot similar to a base transformation of the
- * text representation of an integer.
- * Looking at it this way seems slow and requiring unlimited integer
- * precision, but we actually do it with a *very* fast loop, using only 
- * short integer arithmetic and by means of two memory accesses and 
- * 3 additions per each byte processed.. and nothing else, as a side
- * note the distribution of real nicks over the hash table of this
- * function is about 3 times better than the previous one, and the
- * hash function itself is about 25% faster with a "normal" HASHSIZE
- * (it gets slower with larger ones and faster for smallest ones
- * because the hash table size affect the size of some maps and thus
- * the effectiveness of RAM caches while accesing them).
- * These two pages of macros are here to make the following code
- * _more_ understandeable... I hope ;)
- *
- * If you ask me, this whole mess is ungodly complicated for very
- * little benefit. -Entrope
+ * This file used to use some very complicated hash function.  Now it
+ * uses CRC-32, but effectively remaps each input byte according to a
+ * table initialized at startup.
  */
 
-/** Internal stuff, think well before changing this, it's how
-   much the weights of two lexicograhically contiguous chars
-   differ, i.e.\ (hash_weight('b')-hash_weight('a')) == HASHSTEP.
-   One seems to be fine but the algorithm doesn't depend on it. */
-#define HASHSTEP 1
-
-/** The smallest _prime_ int beeing HASHSTEP times bigger than a byte.
-   That is, the first prime bigger than the maximum hash_weight
-   (since the maximum hash weight is gonna be the "biggest-byte * HASHSTEP")
- */
-#define HASHSHIFT 257
-
-/* Are we sure that HASHSHIFT is big enough ? */
-#if !(HASHSHIFT > (HASHSTEP*(CHAR_MAX-CHAR_MIN)))
-#error "No no, I cannot, please make HASHSHIFT a bigger prime !"
-#endif
-
-/* Now HASHSIZE doesn't need to be a prime, but we really don't want it
-   to be an exact multiple of HASHSHIFT, that would make the distribution
-   a LOT worse, once is not multiple of HASHSHIFT it can be anything */
-#if ((HASHSIZE%HASHSHIFT)==0)
-#error "Please set HASHSIZE to something not multiple of HASHSHIFT"
-#endif
-
-/* What type of integer do we need in our computations ? the largest
-   value we need to work on is (HASHSIZE+HASHSHIFT+1), for memory
-   operations we want to keep the tables compact (the cache will work
-   better and we will run faster) while for work variables we prefer
-   to roundup to 'int' if it is the case: on platforms where int!=short
-   int arithmetic is often faster than short arithmetic, we prefer signed
-   types if they are big enough since on some architectures they are faster
-   than unsigned, but we always keep signedness of mem and regs the same,
-   to avoid sign conversions that sometimes require time, the following 
-   precompile stuff will set HASHMEMS to an appropriate integer type for 
-   the tables stored in memory and HASHREGS to an appropriate int type
-   for the work registers/variables/return types. Everything of type
-   HASH???S will remain internal to this source file so I placed this stuff
-   here and not in the header file. */
-
-#undef HASHMEMS
-#undef HASHREGS
-
-#if ((!defined(HASHMEMS)) && (HASHSIZE < (SHRT_MAX-HASHSHIFT)))
-#define HASHMEMS short
-#define HASHREGS int
-#endif
-
-#if ((!defined(HASHMEMS)) && (HASHSIZE < (USHRT_MAX-HASHSHIFT)))
-#define HASHMEMS unsigned short
-#define HASHREGS unsigned int
-#endif
-
-#if ((!defined(HASHMEMS)) && (HASHSIZE < (INT_MAX-HASHSHIFT)))
-#define HASHMEMS int
-#define HASHREGS int
-#endif
-
-#if ((!defined(HASHMEMS)) && (HASHSIZE < (UINT_MAX-HASHSHIFT)))
-#define HASHMEMS unsigned int
-#define HASHREGS unsigned int
-#endif
-
-#if ((!defined(HASHMEMS)) && (HASHSIZE < (LONG_MAX-HASHSHIFT)))
-#define HASHMEMS long
-#define HASHREGS long
-#endif
-
-#if ((!defined(HASHMEMS)) && (HASHSIZE < (ULONG_MAX-HASHSHIFT)))
-#define HASHMEMS unsigned long
-#define HASHREGS unsigned long
-#endif
-
-#if (!defined(HASHMEMS))
-#error "Uh oh... I have a problem, do you want a 16GB hash table ? !"
-#endif
-
-/* Now we are sure that HASHMEMS and HASHREGS can contain the following */
-/** Size of #hash_map array. */
-#define HASHMAPSIZE (HASHSIZE+HASHSHIFT+1)
-
-/* Static memory structures */
-
-/** We need a first function that, given an integer h between 1 and
-   HASHSIZE+HASHSHIFT, returns ( (h * HASHSHIFT) % HASHSIZE ) ).
-   We'll map this function in this table. */
-static HASHMEMS hash_map[HASHMAPSIZE];
-
-/** Then we need a second function that "maps" a char to its weight.
-   Changed to a table this one too, with this macro we can use a char
-   as index and not care if it is signed or not, no.. this will not
-   cause an addition to take place at each access, trust me, the
-   optimizer takes it out of the actual code and passes "label+shift"
-   to the linker, and the linker does the addition :) */
-static HASHMEMS hash_weight_table[CHAR_MAX - CHAR_MIN + 1];
-/** Helper macro to look characters up in #hash_weight_table. */
-#define hash_weight(ch) hash_weight_table[ch-CHAR_MIN]
-
-/* The actual hash tables, both MUST be of the same HASHSIZE, variable
-   size tables could be supported but the rehash routine should also
-   rebuild the transformation maps, I kept the tables of equal size 
-   so that I can use one hash function and one transformation map */
 /** Hash table for clients. */
 static struct Client *clientTable[HASHSIZE];
 /** Hash table for channels. */
 static struct Channel *channelTable[HASHSIZE];
+/** CRC-32 update table. */
+static uint32_t crc32hash[256];
 
-/* This is what the hash function will consider "equal" chars, this function 
-   MUST be transitive, if HASHEQ(y,x)&&HASHEQ(y,z) then HASHEQ(y,z), and MUST
-   be symmetric, if HASHEQ(a,b) then HASHEQ(b,a), obvious ok but... :) */
-/** Helper macro for character comparison. */
-#define HASHEQ(x,y) ((ToLower(x)) == (ToLower(y)))
-
-/** Initialize the maps used by hash functions and clear the tables. */
+/** Initialize the map used by the hash function. */
 void init_hash(void)
 {
-  int           i;
-  int           j;
-  unsigned long l;
-  unsigned long m;
+  unsigned int ii, jj, rand, poly;
 
-  /* Clear the hash tables first */
-  for (l = 0; l < HASHSIZE; l++)
+  /* First calculate a normal CRC-32 table. */
+  for (ii = 0, poly = 0xedb88320; ii < 256; ii++)
   {
-    channelTable[l] = 0;
-    clientTable[l]  = 0;
-  }
-
-  /* Here is to what we "map" a char before working on it */
-  for (i = CHAR_MIN; i <= CHAR_MAX; i++)
-    hash_weight(i) = (HASHMEMS) (HASHSTEP * ((unsigned char)i));
-
-  /* Make them equal for case-independently equal chars, it's
-     lame to do it this way but I wanted the code flexible, it's
-     possible to change the HASHEQ macro and not touch here. 
-     I don't actually care about the 32768 loops since it happens 
-     only once at startup */
-  for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
-    for (j = CHAR_MIN; j < i; j++) {
-      if (HASHEQ(i, j))
-        hash_weight(i) = hash_weight(j);
-    }
+    rand = ii;
+    for (jj = 0; jj < 8; jj++)
+      rand = (rand & 1) ? poly ^ (rand >> 1) : rand >> 1;
+    crc32hash[ii] = rand;
   }
 
-  /* And this is our hash-loop "transformation" function, 
-     basically it will be hash_map[x] == ((x*HASHSHIFT)%HASHSIZE)
-     defined for 0<=x<=(HASHSIZE+HASHSHIFT) */
-  for (m = 0; m < (unsigned long)HASHMAPSIZE; m++)
+  /* Now reorder the hash table.  To make it case-insensitive, skip
+   * upper-case letters, and have lower-case letters write to the
+   * corresponding upper-case character.
+   */
+  for (ii = 0, rand = 0; ii < 256; ii++)
   {
-    l = m;
-    l *= (unsigned long)HASHSHIFT;
-    l %= (unsigned long)HASHSIZE;
-    hash_map[m] = (HASHMEMS) l;
+    char ch = ii + CHAR_MIN;
+    if (ch != ToLower(ch))
+      continue;
+    if (!rand)
+      rand = ircrandom();
+    poly = ii + rand % (256 - ii);
+    jj = crc32hash[ii];
+    crc32hash[ToUpper(ch) - CHAR_MIN] = crc32hash[ii] = crc32hash[poly];
+    crc32hash[poly] = jj;
+    rand >>= 8;
   }
 }
 
-/* These are the actual hash functions, since they are static
-   and very short any decent compiler at a good optimization level
-   WILL inline these in the following functions */
+/** Output type of hash function. */
+typedef unsigned int HASHREGS;
 
-/* This is the string hash function,
-   WARNING: n must be a valid pointer to a _non-null_ string
-   this means that not only strhash(NULL) but also
-   strhash("") _will_ coredump, it's responsibility
-   the caller to eventually check BadPtr(nick). */
-
-/** Calculate hash value for a string. */
+/** Calculate hash value for a string.
+ * @param[in] n String to hash.
+ * @return Hash value for string.
+ */
 static HASHREGS strhash(const char *n)
 {
-  HASHREGS hash = hash_weight(*n++);
+  HASHREGS hash = crc32hash[*n++ & 255];
   while (*n)
-    hash = hash_map[hash] + hash_weight(*n++);
-  return hash_map[hash];
+    hash = (hash >> 8) ^ crc32hash[(hash ^ *n++) & 255];
+  return hash % HASHSIZE;
 }
 
-/* And this is the string hash function for limited lenght strings
-   WARNING: n must be a valid pointer to a non-null string
-   and i must be > 0 ! */
-
-/* REMOVED
-
-   The time taken to decrement i makes the function
-   slower than strhash for the average of channel names (tested
-   on 16000 real channel names, 1000 loops. I left the code here
-   as a bookmark if a strnhash is evetually needed in the future.
-
-   static HASHREGS strnhash(const char *n, int i) {
-   HASHREGS hash = hash_weight(*n++);
-   i--;
-   while(*n && i--)
-   hash = hash_map[hash] + hash_weight(*n++);
-   return hash_map[hash];
-   }
-
-   #define CHANHASHLEN 30
-
-   !REMOVED */
-
 /************************** Externally visible functions ********************/
 
 /* Optimization note: in these functions I supposed that the CSE optimization
-- 
2.20.1