From 0c9dd2e1918fd3c2f7be2163f9e5dc093e8c243f Mon Sep 17 00:00:00 2001
From: Jidong Xiao
Date: Mon, 31 Mar 2025 20:02:51 -0400
Subject: [PATCH] adding the hash test code

---
 lectures/22_hash_tables_I/README.md     | 16 ++++-
 lectures/22_hash_tables_I/hash_test.cpp | 81 +++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 lectures/22_hash_tables_I/hash_test.cpp

diff --git a/lectures/22_hash_tables_I/README.md b/lectures/22_hash_tables_I/README.md
index ba6610f..46e677b 100644
--- a/lectures/22_hash_tables_I/README.md
+++ b/lectures/22_hash_tables_I/README.md
@@ -178,13 +178,27 @@ The problem is its high collision rate:
 unsigned int hash(const std::string& k, unsigned int N) {
   unsigned int value = 0;
   for (unsigned int i=0; i<k.size(); i++)
     value += k[i];
   return value % N;
 }

diff --git a/lectures/22_hash_tables_I/hash_test.cpp b/lectures/22_hash_tables_I/hash_test.cpp
new file mode 100644
--- /dev/null
+++ b/lectures/22_hash_tables_I/hash_test.cpp
@@ -0,0 +1,81 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <ctime>
+#include <cstdlib>
+
+// A deliberately weak hash: sums the raw character codes of the key.
+// The sum is order-insensitive, so anagrams ("abc", "cba", "bca")
+// always land in the same bucket -- hence the high collision rate.
+unsigned int badHash(const std::string& k, unsigned int N) {
+    unsigned int value = 0;
+    for (unsigned int i = 0; i < k.size(); ++i) {
+        value += k[i]; // simple sum of ASCII values
+    }
+    return value % N;
+}
+
+// Polynomial rolling hash: each step multiplies the accumulator by a
+// prime (31) before adding the next character, so both the value AND the
+// position of every character affect the result.
+unsigned int betterHash(const std::string& k, unsigned int N) {
+    unsigned int value = 0;
+    unsigned int prime = 31;
+    for (unsigned int i = 0; i < k.size(); ++i) {
+        value = value * prime + k[i]; // use multiplications which involve the position and value of the key; also uses prime for better distribution
+    }
+    return value % N;
+}
+
+// a good hash function should distribute values evenly across N buckets.
+// Counts collisions produced by hashFunc over testStrings: every bucket
+// that receives more than one string contributes (count - 1) collisions.
+void testCollisions(unsigned int (*hashFunc)(const std::string&, unsigned int),
+                    const std::vector<std::string>& testStrings, unsigned int N) {
+    std::unordered_map<unsigned int, int> bucketCounts;
+
+    for (const std::string& str : testStrings) {
+        unsigned int hashValue = hashFunc(str, N);
+        bucketCounts[hashValue]++;
+    }
+
+    // count how many buckets have collisions
+    int collisions = 0;
+    for (const std::pair<const unsigned int, int>& entry : bucketCounts) {
+        if (entry.second > 1) {
+            collisions += (entry.second - 1);
+        }
+    }
+
+    std::cout << "Total Collisions: " << collisions << std::endl;
+}
+
+// generate many test strings and see how well they spread over N buckets.
+std::vector<std::string> generateTestStrings(int count) {
+    std::vector<std::string> testStrings;
+    testStrings.reserve(count); // one allocation instead of repeated growth
+    for (int i = 0; i < count; i++) {
+        std::string str = "str" + std::to_string(i); // example: "str0", "str1"...
+        testStrings.push_back(str);
+    }
+    return testStrings;
+}
+
+// Times hashFunc over all testStrings using std::clock.
+void benchmark(unsigned int (*hashFunc)(const std::string&, unsigned int),
+               const std::vector<std::string>& testStrings, unsigned int N) {
+    clock_t start = clock();
+    // volatile accumulator: without it an optimizing compiler may delete
+    // the hash calls entirely (their results were previously discarded).
+    volatile unsigned int sink = 0;
+    for (const std::string& str : testStrings) {
+        sink = sink + hashFunc(str, N);
+    }
+    clock_t end = clock();
+    double timeTaken = double(end - start) / CLOCKS_PER_SEC;
+    std::cout << "Execution Time: " << timeTaken << " seconds" << std::endl;
+}
+
+int main() {
+    unsigned int N = 1000; // hash table size
+    std::vector<std::string> testStrings = generateTestStrings(5000);
+
+    std::cout << "Testing badHash (Summing ASCII values):\n";
+    testCollisions(badHash, testStrings, N);
+    benchmark(badHash, testStrings, N);
+
+    std::cout << "\nTesting betterHash (Multiplication by 31, a prime):\n";
+    testCollisions(betterHash, testStrings, N);
+    benchmark(betterHash, testStrings, N);
+
+    return 0;
+}