From 0c9dd2e1918fd3c2f7be2163f9e5dc093e8c243f Mon Sep 17 00:00:00 2001
From: Jidong Xiao
Date: Mon, 31 Mar 2025 20:02:51 -0400
Subject: [PATCH] adding the hash test code

---
 lectures/22_hash_tables_I/README.md     | 16 ++++-
 lectures/22_hash_tables_I/hash_test.cpp | 81 +++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 lectures/22_hash_tables_I/hash_test.cpp

diff --git a/lectures/22_hash_tables_I/README.md b/lectures/22_hash_tables_I/README.md
index ba6610f..46e677b 100644
--- a/lectures/22_hash_tables_I/README.md
+++ b/lectures/22_hash_tables_I/README.md
@@ -178,13 +178,27 @@ The problem is its high collision rate:
 unsigned int hash(const std::string& k, unsigned int N) {
   unsigned int value = 0;
   for (unsigned int i=0; i<k.size(); i++)
     value += k[i];
   return value % N;
 }

diff --git a/lectures/22_hash_tables_I/hash_test.cpp b/lectures/22_hash_tables_I/hash_test.cpp
new file mode 100644
--- /dev/null
+++ b/lectures/22_hash_tables_I/hash_test.cpp
@@ -0,0 +1,81 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <ctime>
+#include <cstdlib>
+
+// A deliberately weak hash: sums the raw character codes of the key.
+// The sum is order-insensitive, so anagrams ("abc", "cba", "bca")
+// always land in the same bucket -- hence the high collision rate.
+unsigned int badHash(const std::string& k, unsigned int N) {
+    unsigned int value = 0;
+    for (unsigned int i = 0; i < k.size(); ++i) {
+        value += k[i]; // simple sum of ASCII values
+    }
+    return value % N;
+}
+
+// Polynomial rolling hash: each step multiplies the accumulator by a
+// prime (31) before adding the next character, so both the value AND the
+// position of every character affect the result.
+unsigned int betterHash(const std::string& k, unsigned int N) {
+    unsigned int value = 0;
+    unsigned int prime = 31;
+    for (unsigned int i = 0; i < k.size(); ++i) {
+        value = value * prime + k[i]; // use multiplications which involve the position and value of the key; also uses prime for better distribution
+    }
+    return value % N;
+}
+
+// a good hash function should distribute values evenly across N buckets.
+// Counts collisions produced by hashFunc over testStrings: every bucket
+// that receives more than one string contributes (count - 1) collisions.
+void testCollisions(unsigned int (*hashFunc)(const std::string&, unsigned int),
+                    const std::vector<std::string>& testStrings, unsigned int N) {
+    std::unordered_map<unsigned int, int> bucketCounts;
+
+    for (const std::string& str : testStrings) {
+        unsigned int hashValue = hashFunc(str, N);
+        bucketCounts[hashValue]++;
+    }
+
+    // count how many buckets have collisions
+    int collisions = 0;
+    for (const std::pair<const unsigned int, int>& entry : bucketCounts) {
+        if (entry.second > 1) {
+            collisions += (entry.second - 1);
+        }
+    }
+
+    std::cout << "Total Collisions: " << collisions << std::endl;
+}
+
+// generate many test strings and see how well they spread over N buckets.
+std::vector<std::string> generateTestStrings(int count) {
+    std::vector<std::string> testStrings;
+    testStrings.reserve(count); // one allocation instead of repeated growth
+    for (int i = 0; i < count; i++) {
+        std::string str = "str" + std::to_string(i); // example: "str0", "str1"...
+        testStrings.push_back(str);
+    }
+    return testStrings;
+}
+
+// Times hashFunc over all testStrings using std::clock.
+void benchmark(unsigned int (*hashFunc)(const std::string&, unsigned int),
+               const std::vector<std::string>& testStrings, unsigned int N) {
+    clock_t start = clock();
+    // volatile accumulator: without it an optimizing compiler may delete
+    // the hash calls entirely (their results were previously discarded).
+    volatile unsigned int sink = 0;
+    for (const std::string& str : testStrings) {
+        sink = sink + hashFunc(str, N);
+    }
+    clock_t end = clock();
+    double timeTaken = double(end - start) / CLOCKS_PER_SEC;
+    std::cout << "Execution Time: " << timeTaken << " seconds" << std::endl;
+}
+
+int main() {
+    unsigned int N = 1000; // hash table size
+    std::vector<std::string> testStrings = generateTestStrings(5000);
+
+    std::cout << "Testing badHash (Summing ASCII values):\n";
+    testCollisions(badHash, testStrings, N);
+    benchmark(badHash, testStrings, N);
+
+    std::cout << "\nTesting betterHash (Multiplication by 31, a prime):\n";
+    testCollisions(betterHash, testStrings, N);
+    benchmark(betterHash, testStrings, N);
+
+    return 0;
+}