mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-02-28 10:04:18 +02:00
Revamp to allow fine-tuning to control when and by how
much the table grows and shrinks. (next_prime): Don't assert. (hash_reset_tuning): New function. (check_tuning): New function. (hash_initialize): Accept and use new tuning parameter. (hash_rehash): Rewrite, updating for tuning. (hash_insert): Honor tuning semantics. (hash_delete): Likewise. From Franois Pinard.
This commit is contained in:
327
lib/hash.c
327
lib/hash.c
@@ -76,10 +76,36 @@ char *malloc ();
|
||||
become inordinate, as unused slots in the hash table take some space. The
|
||||
best bet is to make sure you are using a good `hasher' function (beware
|
||||
that those are not that easy to write! :-), and to use a table size at
|
||||
least bigger than the actual number of entries.
|
||||
least bigger than the actual number of entries. */
|
||||
|
||||
Currently, whenever the addition of an entry gets 80% of buckets to be
|
||||
non-empty, this package automatically doubles the number of buckets. */
|
||||
/* If, while adding an to a bucket which was empty, the ratio of used buckets
|
||||
over table size goes over the growth threshold (a number between 0.0 and
|
||||
1.0), then reorganise the table size bigger by the growth factor (a number
|
||||
greater than 1.0). The growth threshold defaults to 0.8, and the growth
|
||||
factor defaults to 1.414, meaning that the table will have doubled its size
|
||||
every second time 80% of the buckets get used. */
|
||||
#define DEFAULT_GROWTH_THRESHOLD 0.8
|
||||
#define DEFAULT_GROWTH_FACTOR 1.414
|
||||
|
||||
/* If, while emptying a bucket, the ratio of used buckets over table size
|
||||
drops below the shrink threshold (a number between 0.0 and 1.0), then
|
||||
reorganise the table size smaller through the usage of a shrink factor (a
|
||||
number greater than the shrink threshold but smaller than 1.0). The shrink
|
||||
threshold and factor default to 0.0 and 1.0, meaning that the table never
|
||||
shrinks. */
|
||||
#define DEFAULT_SHRINK_THRESHOLD 0.0
|
||||
#define DEFAULT_SHRINK_FACTOR 1.0
|
||||
|
||||
/* Use this to initialize or reset a TUNING structure to
|
||||
some sensible values. */
|
||||
static const Hash_tuning default_tuning =
|
||||
{
|
||||
DEFAULT_SHRINK_THRESHOLD,
|
||||
DEFAULT_SHRINK_FACTOR,
|
||||
DEFAULT_GROWTH_THRESHOLD,
|
||||
DEFAULT_GROWTH_FACTOR,
|
||||
false
|
||||
};
|
||||
|
||||
/* Information and lookup. */
|
||||
|
||||
@@ -91,7 +117,7 @@ char *malloc ();
|
||||
number of buckets (used plus unused), or the maximum number of slots, are
|
||||
the same quantity. */
|
||||
|
||||
unsigned int
|
||||
unsigned
|
||||
hash_get_n_buckets (const Hash_table *table)
|
||||
{
|
||||
return table->n_buckets;
|
||||
@@ -99,7 +125,7 @@ hash_get_n_buckets (const Hash_table *table)
|
||||
|
||||
/* Return the number of slots in use (non-empty buckets). */
|
||||
|
||||
unsigned int
|
||||
unsigned
|
||||
hash_get_n_buckets_used (const Hash_table *table)
|
||||
{
|
||||
return table->n_buckets_used;
|
||||
@@ -107,7 +133,7 @@ hash_get_n_buckets_used (const Hash_table *table)
|
||||
|
||||
/* Return the number of active entries. */
|
||||
|
||||
unsigned int
|
||||
unsigned
|
||||
hash_get_n_entries (const Hash_table *table)
|
||||
{
|
||||
return table->n_entries;
|
||||
@@ -115,18 +141,18 @@ hash_get_n_entries (const Hash_table *table)
|
||||
|
||||
/* Return the length of the most lenghty chain (bucket). */
|
||||
|
||||
unsigned int
|
||||
unsigned
|
||||
hash_get_max_bucket_length (const Hash_table *table)
|
||||
{
|
||||
struct hash_entry *bucket;
|
||||
unsigned int max_bucket_length = 0;
|
||||
unsigned max_bucket_length = 0;
|
||||
|
||||
for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
|
||||
{
|
||||
if (bucket->data)
|
||||
{
|
||||
struct hash_entry *cursor = bucket;
|
||||
unsigned int bucket_length = 1;
|
||||
unsigned bucket_length = 1;
|
||||
|
||||
while (cursor = cursor->next, cursor)
|
||||
bucket_length++;
|
||||
@@ -146,8 +172,8 @@ bool
|
||||
hash_table_ok (const Hash_table *table)
|
||||
{
|
||||
struct hash_entry *bucket;
|
||||
unsigned int n_buckets_used = 0;
|
||||
unsigned int n_entries = 0;
|
||||
unsigned n_buckets_used = 0;
|
||||
unsigned n_entries = 0;
|
||||
|
||||
for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
|
||||
{
|
||||
@@ -174,10 +200,10 @@ hash_table_ok (const Hash_table *table)
|
||||
void
|
||||
hash_print_statistics (const Hash_table *table, FILE *stream)
|
||||
{
|
||||
unsigned int n_entries = hash_get_n_entries (table);
|
||||
unsigned int n_buckets = hash_get_n_buckets (table);
|
||||
unsigned int n_buckets_used = hash_get_n_buckets_used (table);
|
||||
unsigned int max_bucket_length = hash_get_max_bucket_length (table);
|
||||
unsigned n_entries = hash_get_n_entries (table);
|
||||
unsigned n_buckets = hash_get_n_buckets (table);
|
||||
unsigned n_buckets_used = hash_get_n_buckets_used (table);
|
||||
unsigned max_bucket_length = hash_get_max_bucket_length (table);
|
||||
|
||||
fprintf (stream, "# entries: %u\n", n_entries);
|
||||
fprintf (stream, "# buckets: %u\n", n_buckets);
|
||||
@@ -186,8 +212,8 @@ hash_print_statistics (const Hash_table *table, FILE *stream)
|
||||
fprintf (stream, "max bucket length: %u\n", max_bucket_length);
|
||||
}
|
||||
|
||||
/* If an entry from table, TABLE, matches ENTRY, return the one from
|
||||
the table. Otherwise, return NULL. */
|
||||
/* Return the user entry from the hash table, if some entry in the hash table
|
||||
compares equally with ENTRY, or NULL otherwise. */
|
||||
|
||||
void *
|
||||
hash_lookup (const Hash_table *table, const void *entry)
|
||||
@@ -263,11 +289,11 @@ hash_get_next (const Hash_table *table, const void *entry)
|
||||
return the number of pointers copied. Do not copy more than BUFFER_SIZE
|
||||
pointers. */
|
||||
|
||||
unsigned int
|
||||
unsigned
|
||||
hash_get_entries (const Hash_table *table, void **buffer,
|
||||
unsigned int buffer_size)
|
||||
unsigned buffer_size)
|
||||
{
|
||||
unsigned int counter = 0;
|
||||
unsigned counter = 0;
|
||||
struct hash_entry *bucket;
|
||||
struct hash_entry *cursor;
|
||||
|
||||
@@ -295,11 +321,11 @@ hash_get_entries (const Hash_table *table, void **buffer,
|
||||
as received. The walking continue for as long as the PROCESSOR function
|
||||
returns nonzero. When it returns zero, the walking is interrupted. */
|
||||
|
||||
unsigned int
|
||||
unsigned
|
||||
hash_do_for_each (const Hash_table *table, Hash_processor processor,
|
||||
void *processor_data)
|
||||
{
|
||||
unsigned int counter = 0;
|
||||
unsigned counter = 0;
|
||||
struct hash_entry *bucket;
|
||||
struct hash_entry *cursor;
|
||||
|
||||
@@ -332,8 +358,8 @@ hash_do_for_each (const Hash_table *table, Hash_processor processor,
|
||||
algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
|
||||
may not be good for your application." */
|
||||
|
||||
unsigned int
|
||||
hash_string (const char *string, unsigned int n_buckets)
|
||||
unsigned
|
||||
hash_string (const char *string, unsigned n_buckets)
|
||||
{
|
||||
# ifndef CHAR_BIT
|
||||
# define CHAR_BIT 8
|
||||
@@ -360,8 +386,8 @@ hash_string (const char *string, unsigned int n_buckets)
|
||||
very old Cyber `snoop', itself written in typical Greg Mansfield style.
|
||||
(By the way, what happened to this excellent man? Is he still alive?) */
|
||||
|
||||
unsigned int
|
||||
hash_string (const char *string, unsigned int n_buckets)
|
||||
unsigned
|
||||
hash_string (const char *string, unsigned n_buckets)
|
||||
{
|
||||
unsigned value = 0;
|
||||
|
||||
@@ -393,12 +419,14 @@ is_prime (unsigned long candidate)
|
||||
}
|
||||
|
||||
/* Round a given CANDIDATE number up to the nearest prime, and return that
|
||||
prime. CANDIDATE should be at least equal to 10. */
|
||||
prime. Primes lower than 10 are merely skipped. */
|
||||
|
||||
static unsigned long
|
||||
next_prime (unsigned long candidate)
|
||||
{
|
||||
assert (candidate >= 10);
|
||||
/* Skip small primes. */
|
||||
if (candidate < 10)
|
||||
candidate = 10;
|
||||
|
||||
/* Make it definitely odd. */
|
||||
candidate |= 1;
|
||||
@@ -409,33 +437,72 @@ next_prime (unsigned long candidate)
|
||||
return candidate;
|
||||
}
|
||||
|
||||
void
|
||||
hash_reset_tuning (Hash_tuning *tuning)
|
||||
{
|
||||
*tuning = default_tuning;
|
||||
}
|
||||
|
||||
/* For the given hash TABLE, check the user supplied tuning structure for
|
||||
reasonable values, and return true if there is no gross error with it.
|
||||
Otherwise, definitvely reset the TUNING field to some acceptable default in
|
||||
the hash table (that is, the user looses the right of further modifying
|
||||
tuning arguments), and return false. */
|
||||
|
||||
static bool
|
||||
check_tuning (Hash_table *table)
|
||||
{
|
||||
const Hash_tuning *tuning = table->tuning;
|
||||
|
||||
if (tuning->growth_threshold > 0.0
|
||||
&& tuning->growth_threshold < 1.0
|
||||
&& tuning->growth_factor > 1.0
|
||||
&& tuning->shrink_threshold >= 0.0
|
||||
&& tuning->shrink_threshold < 1.0
|
||||
&& tuning->shrink_factor > tuning->shrink_threshold
|
||||
&& tuning->shrink_factor <= 1.0
|
||||
&& tuning->shrink_threshold < tuning->growth_threshold)
|
||||
return true;
|
||||
|
||||
table->tuning = &default_tuning;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Allocate and return a new hash table, or NULL if an error is met. The
|
||||
initial number of buckets would be at least CANDIDATE (which need not be
|
||||
prime).
|
||||
initial number of buckets is automatically selected so to _guarantee_ that
|
||||
you may insert at least CANDIDATE different user entries before any growth
|
||||
of the hash table size occurs. So, if you happen to know beforehand the
|
||||
number of entries you intend to insert in the hash table, you may save some
|
||||
table memory and insertion time, by specifying it here. If the
|
||||
IS_N_BUCKETS field of the TUNING structure is true, the CANDIDATE argument
|
||||
has its meaning changed to the wanted number of buckets.
|
||||
|
||||
If DATA_FREER is not NULL, this function may be later called with the data
|
||||
as an argument, just before they entry containing the data gets freed. The
|
||||
HASHER function should be supplied, and FIXME. The COMPARATOR function
|
||||
should also be supplied, and FIXME. */
|
||||
TUNING points to a structure of user-supplied values, in case some fine
|
||||
tuning is wanted over the default behaviour of the hasher. If TUNING is
|
||||
NULL, proper defaults are used instead.
|
||||
|
||||
/* User-supplied function for freeing datas. It is specified in
|
||||
hash_initialize. If non-null, it is used by hash_free and hash_clear.
|
||||
You should specify `free' here only if you want these functions to free
|
||||
all of your `data' data. This is typically the case when your data is
|
||||
simply an auxilliary struct that you have malloc'd to aggregate several
|
||||
values. */
|
||||
The user-supplied HASHER function should be provided. It accepts two
|
||||
arguments ENTRY and TABLE_SIZE. It computes, by hasing ENTRY contents, a
|
||||
slot number for that entry which should be in the range 0..TABLE_SIZE-1.
|
||||
This slot number is then returned.
|
||||
|
||||
/* User-supplied hash function that hashes entry ENTRY to an integer in
|
||||
the range 0..TABLE_SIZE-1. */
|
||||
The user-supplied COMPARATOR function should be provided. It accepts two
|
||||
arguments pointing to user data, it then returns true for a pair of entries
|
||||
that compare equal, or false otherwise. This function is internally called
|
||||
on entries which are already known to hash to the same bucket index.
|
||||
|
||||
/* User-supplied function that determines whether a new entry is unique by
|
||||
comparing the new entry to entries that hashed to the same bucket
|
||||
index. It should return zero for a pair of entries that compare equal,
|
||||
non-zero otherwise. */
|
||||
The user-supplied DATA_FREER function, when not NULL, may be later called
|
||||
with the user data as an argument, just before the entry containing the
|
||||
data gets freed. This happens from within `hash_free' or `hash_clear'.
|
||||
You should specify this function only if you want these functions to free
|
||||
all of your `data' data. This is typically the case when your data is
|
||||
simply an auxiliary struct that you have malloc'd to aggregate several
|
||||
values. */
|
||||
|
||||
Hash_table *
|
||||
hash_initialize (unsigned int candidate, Hash_hasher hasher,
|
||||
Hash_comparator comparator, Hash_data_freer data_freer)
|
||||
hash_initialize (unsigned candidate, const Hash_tuning *tuning,
|
||||
Hash_hasher hasher, Hash_comparator comparator,
|
||||
Hash_data_freer data_freer)
|
||||
{
|
||||
Hash_table *table;
|
||||
struct hash_entry *bucket;
|
||||
@@ -447,7 +514,23 @@ hash_initialize (unsigned int candidate, Hash_hasher hasher,
|
||||
if (table == NULL)
|
||||
return NULL;
|
||||
|
||||
table->n_buckets = next_prime (candidate < 10 ? 10 : candidate);
|
||||
if (!tuning)
|
||||
tuning = &default_tuning;
|
||||
table->tuning = tuning;
|
||||
if (!check_tuning (table))
|
||||
{
|
||||
/* Abort initialisation if tuning arguments are improper. This is the
|
||||
only occasion when the user gets some feedback about it. Later on,
|
||||
if the user modifies the tuning wrongly, it gets restored to some
|
||||
proper default, and the user looses the right of tuning further. */
|
||||
free (table);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
table->n_buckets
|
||||
= next_prime (tuning->is_n_buckets ? candidate
|
||||
: (unsigned) (candidate / tuning->growth_threshold));
|
||||
|
||||
table->bucket = (struct hash_entry *)
|
||||
malloc (table->n_buckets * sizeof (struct hash_entry));
|
||||
if (table->bucket == NULL)
|
||||
@@ -682,21 +765,24 @@ hash_find_entry (Hash_table *table, const void *entry,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* For an already existing hash table, change the number of buckets and make
|
||||
it NEW_TABLE_SIZE. The contents of the hash table are preserved. */
|
||||
/* For an already existing hash table, change the number of buckets through
|
||||
specifying CANDIDATE. The contents of the hash table are preserved. The
|
||||
new number of buckets is automatically selected so to _guarantee_ that the
|
||||
table may receive at least CANDIDATE different user entries, including
|
||||
those already in the table, before any other growth of the hash table size
|
||||
occurs. If the IS_N_BUCKETS field of the TUNING structure is true, the
|
||||
CANDIDATE argument has its meaning changed to the wanted number of buckets.
|
||||
*/
|
||||
|
||||
bool
|
||||
hash_rehash (Hash_table *table, unsigned int new_n_buckets)
|
||||
hash_rehash (Hash_table *table, unsigned candidate)
|
||||
{
|
||||
Hash_table *new_table;
|
||||
struct hash_entry *bucket;
|
||||
struct hash_entry *cursor;
|
||||
struct hash_entry *next;
|
||||
|
||||
if (table->n_buckets <= 0 || new_n_buckets == 0)
|
||||
return false;
|
||||
|
||||
new_table = hash_initialize (new_n_buckets, table->hasher,
|
||||
new_table = hash_initialize (candidate, table->tuning, table->hasher,
|
||||
table->comparator, table->data_freer);
|
||||
if (new_table == NULL)
|
||||
return false;
|
||||
@@ -709,43 +795,50 @@ hash_rehash (Hash_table *table, unsigned int new_n_buckets)
|
||||
new_table->free_entry_list = table->free_entry_list;
|
||||
|
||||
for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
|
||||
{
|
||||
if (bucket->data)
|
||||
if (bucket->data)
|
||||
for (cursor = bucket; cursor; cursor = next)
|
||||
{
|
||||
for (cursor = bucket; cursor; cursor = next)
|
||||
void *data = cursor->data;
|
||||
struct hash_entry *new_bucket
|
||||
= (new_table->bucket
|
||||
+ new_table->hasher (data, new_table->n_buckets));
|
||||
|
||||
assert (new_bucket < new_table->bucket_limit);
|
||||
next = cursor->next;
|
||||
|
||||
if (new_bucket->data)
|
||||
if (cursor == bucket)
|
||||
{
|
||||
/* Allocate or recycle an entry, when moving from a bucket
|
||||
header into a bucket overflow. */
|
||||
struct hash_entry *new_entry = allocate_entry (new_table);
|
||||
|
||||
if (new_entry == NULL)
|
||||
return false;
|
||||
|
||||
new_entry->data = data;
|
||||
new_entry->next = new_bucket->next;
|
||||
new_bucket->next = new_entry;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Merely relink an existing entry, when moving from a
|
||||
bucket overflow into a bucket overflow. */
|
||||
cursor->next = new_bucket->next;
|
||||
new_bucket->next = cursor;
|
||||
}
|
||||
else
|
||||
{
|
||||
void *data = cursor->data;
|
||||
struct hash_entry *new_bucket
|
||||
= new_table->bucket + new_table->hasher (data, new_n_buckets);
|
||||
|
||||
assert (new_bucket < new_table->bucket_limit);
|
||||
|
||||
/* Free overflow entries as soon as possible, moving them from the
|
||||
old hash table into the new one, as they may be needed now. */
|
||||
next = cursor->next;
|
||||
/* Free an existing entry, when moving from a bucket
|
||||
overflow into a bucket header. Also take care of the
|
||||
simple case of moving from a bucket header into a bucket
|
||||
header. */
|
||||
new_bucket->data = data;
|
||||
new_table->n_buckets_used++;
|
||||
if (cursor != bucket)
|
||||
free_entry (new_table, cursor);
|
||||
|
||||
/* Insert the entry into the new hash table. */
|
||||
if (new_bucket->data)
|
||||
{
|
||||
struct hash_entry *new_entry = allocate_entry (new_table);
|
||||
|
||||
if (new_entry == NULL)
|
||||
return false;
|
||||
|
||||
new_entry->data = data;
|
||||
new_entry->next = new_bucket->next;
|
||||
new_bucket->next = new_entry;
|
||||
}
|
||||
else
|
||||
{
|
||||
new_bucket->data = data;
|
||||
new_table->n_buckets_used++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free (table->bucket);
|
||||
table->bucket = new_table->bucket;
|
||||
@@ -801,18 +894,31 @@ hash_insert (Hash_table *table, const void *entry)
|
||||
table->n_entries++;
|
||||
table->n_buckets_used++;
|
||||
|
||||
/* If more than 80% of the buckets are in use, rehash the table so it's two
|
||||
times bigger. There's no point in checking the number of entries,
|
||||
because if the hashing function is ill-conditioned, rehashing is not
|
||||
likely to improve it. */
|
||||
/* If the growth threshold of the buckets in use has been reached, rehash
|
||||
the table bigger. It's no real use checking the number of entries, as
|
||||
if the hashing function is ill-conditioned, rehashing is not likely to
|
||||
improve it. */
|
||||
|
||||
if (5 * table->n_buckets_used > 4 * table->n_buckets)
|
||||
if (table->n_buckets_used
|
||||
> table->tuning->growth_threshold * table->n_buckets)
|
||||
{
|
||||
unsigned int new_n_buckets = next_prime (2 * table->n_buckets + 1);
|
||||
/* Check more fully, before starting real work. If tuning arguments got
|
||||
improper, the second check will rely on proper defaults. */
|
||||
check_tuning (table);
|
||||
if (table->n_buckets_used
|
||||
> table->tuning->growth_threshold * table->n_buckets)
|
||||
{
|
||||
const Hash_tuning *tuning = table->tuning;
|
||||
unsigned candidate
|
||||
= (unsigned) (tuning->is_n_buckets
|
||||
? (table->n_buckets * tuning->growth_factor)
|
||||
: (table->n_buckets * tuning->growth_factor
|
||||
* tuning->growth_threshold));
|
||||
|
||||
/* If the rehash fails, arrange to return NULL. */
|
||||
if (!hash_rehash (table, new_n_buckets))
|
||||
entry = NULL;
|
||||
/* If the rehash fails, arrange to return NULL. */
|
||||
if (!hash_rehash (table, candidate))
|
||||
entry = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return (void *) entry;
|
||||
@@ -831,9 +937,34 @@ hash_delete (Hash_table *table, const void *entry)
|
||||
if (data = hash_find_entry (table, entry, &bucket, true), !data)
|
||||
return NULL;
|
||||
|
||||
if (!bucket->data)
|
||||
table->n_buckets_used--;
|
||||
table->n_entries--;
|
||||
if (!bucket->data)
|
||||
{
|
||||
table->n_buckets_used--;
|
||||
|
||||
/* If the shrink threshold of the buckets in use has been reached,
|
||||
rehash the table smaller. */
|
||||
|
||||
if (table->n_buckets_used
|
||||
< table->tuning->shrink_threshold * table->n_buckets)
|
||||
{
|
||||
/* Check more fully, before starting real work. If tuning arguments
|
||||
got improper, the second check will rely on proper defaults. */
|
||||
check_tuning (table);
|
||||
if (table->n_buckets_used
|
||||
< table->tuning->shrink_threshold * table->n_buckets)
|
||||
{
|
||||
const Hash_tuning *tuning = table->tuning;
|
||||
unsigned candidate
|
||||
= (unsigned) (tuning->is_n_buckets
|
||||
? table->n_buckets * tuning->shrink_factor
|
||||
: (table->n_buckets * tuning->shrink_factor
|
||||
* tuning->growth_threshold));
|
||||
|
||||
hash_rehash (table, candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user