adding the hash test code

This commit is contained in:
Jidong Xiao
2025-03-31 20:02:51 -04:00
committed by JamesFlare
parent b33e6b0376
commit 0c9dd2e191
2 changed files with 96 additions and 1 deletions

View File

@@ -178,13 +178,27 @@ The problem is its high collision rate:
unsigned int hash(const std::string& k, unsigned int N) {
unsigned int value = 0;
for (unsigned int i=0; i<k.size(); ++i) {
value = value*8 + k[i]; // conversion to int is automatic
value = value*31 + k[i]; // conversion to int is automatic
}
return value % N;
}
```
- The 2nd method is better, but can be improved further. The theory of good hash functions is quite involved and beyond the scope of this course.
- You can run [this program](hash_test.cpp) which will show that the second hash function produces a lower collision rate:
```console
$ g++ -Wall -Wextra hash_test.cpp
$ ./a.out
Testing badHash (Summing ASCII values):
Total Collisions: 4914
Execution Time: 0.000142 seconds
Testing betterHash (Multiplication by 31, a prime):
Total Collisions: 4000
Execution Time: 0.000148 seconds
```
## 22.10 How do we Resolve Collisions? METHOD 1: Separate Chaining
- Each table location stores a linked list of keys (and values) hashed to that location (as shown above in the phonebook hashtable). Thus, the hashing function really just selects which list to search or modify.

View File

@@ -0,0 +1,81 @@
#include <iostream>
#include <vector>
#include <unordered_map>
#include <set>
#include <string>
#include <ctime>
unsigned int badHash(const std::string& k, unsigned int N) {
unsigned int value = 0;
for (unsigned int i = 0; i < k.size(); ++i) {
value += k[i]; // simple sum of ASCII values
}
return value % N;
}
unsigned int betterHash(const std::string& k, unsigned int N) {
unsigned int value = 0;
unsigned int prime = 31;
for (unsigned int i = 0; i < k.size(); ++i) {
value = value * prime + k[i]; // use multiplications which involve the position and value of the key; also uses prime for better distribution
}
return value % N;
}
// a good hash function should distribute values evenly across N buckets.
void testCollisions(unsigned int (*hashFunc)(const std::string&, unsigned int),
const std::vector<std::string>& testStrings, unsigned int N) {
std::unordered_map<unsigned int, int> bucketCounts;
for (const std::string& str : testStrings) {
unsigned int hashValue = hashFunc(str, N);
bucketCounts[hashValue]++;
}
// count how many buckets have collisions
int collisions = 0;
for (const std::pair<unsigned int, int> entry : bucketCounts) {
if (entry.second > 1) {
collisions += (entry.second - 1);
}
}
std::cout << "Total Collisions: " << collisions << std::endl;
}
// generate many test strings and see how well they spread over N buckets.
std::vector<std::string> generateTestStrings(int count) {
std::vector<std::string> testStrings;
for (int i = 0; i < count; i++) {
std::string str = "str" + std::to_string(i); // example: "str0", "str1"...
testStrings.push_back(str);
}
return testStrings;
}
void benchmark(unsigned int (*hashFunc)(const std::string&, unsigned int),
const std::vector<std::string>& testStrings, unsigned int N) {
clock_t start = clock();
for (const std::string& str : testStrings) {
hashFunc(str, N);
}
clock_t end = clock();
double timeTaken = double(end - start) / CLOCKS_PER_SEC;
std::cout << "Execution Time: " << timeTaken << " seconds" << std::endl;
}
int main() {
unsigned int N = 1000; // hash table size
std::vector<std::string> testStrings = generateTestStrings(5000);
std::cout << "Testing badHash (Summing ASCII values):\n";
testCollisions(badHash, testStrings, N);
benchmark(badHash, testStrings, N);
std::cout << "\nTesting betterHash (Multiplication by 31, a prime):\n";
testCollisions(betterHash, testStrings, N);
benchmark(betterHash, testStrings, N);
return 0;
}