adding the hash test code
This commit is contained in:
@@ -178,13 +178,27 @@ The problem is its high collision rate:
|
|||||||
unsigned int hash(const std::string& k, unsigned int N) {
|
unsigned int hash(const std::string& k, unsigned int N) {
|
||||||
unsigned int value = 0;
|
unsigned int value = 0;
|
||||||
for (unsigned int i=0; i<k.size(); ++i) {
|
for (unsigned int i=0; i<k.size(); ++i) {
|
||||||
value = value*8 + k[i]; // conversion to int is automatic
|
value = value*31 + k[i]; // conversion to int is automatic
|
||||||
}
|
}
|
||||||
return value % N;
|
return value % N;
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
- The 2nd method is better, but can be improved further. The theory of good hash functions is quite involved and beyond the scope of this course.
|
- The 2nd method is better, but can be improved further. The theory of good hash functions is quite involved and beyond the scope of this course.
|
||||||
|
|
||||||
|
- You can run [this program](hash_test.cpp) which will show that the second hash function produces a lower collision rate:
|
||||||
|
|
||||||
|
```console
|
||||||
|
$ g++ -Wall -Wextra hash_test.cpp
|
||||||
|
$ ./a.out
|
||||||
|
Testing badHash (Summing ASCII values):
|
||||||
|
Total Collisions: 4914
|
||||||
|
Execution Time: 0.000142 seconds
|
||||||
|
|
||||||
|
Testing betterHash (Multiplication by 31, a prime):
|
||||||
|
Total Collisions: 4000
|
||||||
|
Execution Time: 0.000148 seconds
|
||||||
|
```
|
||||||
|
|
||||||
## 22.10 How do we Resolve Collisions? METHOD 1: Separate Chaining
|
## 22.10 How do we Resolve Collisions? METHOD 1: Separate Chaining
|
||||||
|
|
||||||
- Each table location stores a linked list of keys (and values) hashed to that location (as shown above in the phonebook hashtable). Thus, the hashing function really just selects which list to search or modify.
|
- Each table location stores a linked list of keys (and values) hashed to that location (as shown above in the phonebook hashtable). Thus, the hashing function really just selects which list to search or modify.
|
||||||
|
|||||||
81
lectures/22_hash_tables_I/hash_test.cpp
Normal file
81
lectures/22_hash_tables_I/hash_test.cpp
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <ctime>
|
||||||
|
|
||||||
|
/**
 * Deliberately weak hash: adds up the byte values of the key and reduces
 * the sum modulo the table size.
 *
 * Because addition ignores character order, any two keys made of the same
 * characters (e.g. "abc" and "cba") land in the same bucket.
 *
 * @param k key to hash
 * @param N number of buckets; must be non-zero (the result is `sum % N`)
 * @return bucket index in the range [0, N)
 */
unsigned int badHash(const std::string& k, unsigned int N) {
    unsigned int value = 0;
    for (const char c : k) {
        // Go through unsigned char: plain char may be signed, in which case
        // bytes >= 0x80 would be sign-extended and the result would differ
        // from platform to platform.
        value += static_cast<unsigned char>(c);
    }
    return value % N;
}
|
||||||
|
|
||||||
|
/**
 * Polynomial hash: treats the key as the digits of a base-31 number, so
 * both the value and the position of every character affect the result.
 *
 * The accumulator is unsigned, so overflow on long keys simply wraps; the
 * final value is reduced modulo the table size.
 *
 * @param k key to hash
 * @param N number of buckets; must be non-zero (the result is `value % N`)
 * @return bucket index in the range [0, N)
 */
unsigned int betterHash(const std::string& k, unsigned int N) {
    const unsigned int prime = 31;  // the multiplier named in the program's output banner
    unsigned int value = 0;
    for (const char c : k) {
        // Go through unsigned char: plain char may be signed, in which case
        // bytes >= 0x80 would be sign-extended and the result would differ
        // from platform to platform.
        value = value * prime + static_cast<unsigned char>(c);
    }
    return value % N;
}
|
||||||
|
|
||||||
|
// a good hash function should distribute values evenly across N buckets.
|
||||||
|
/**
 * Hashes every test string into one of N buckets and prints the total
 * number of collisions to std::cout.
 *
 * A bucket that receives c > 1 strings contributes c - 1 collisions, so the
 * total is the number of strings that landed in an already-occupied bucket.
 *
 * @param hashFunc    hash function under test; called as hashFunc(str, N)
 * @param testStrings keys to hash
 * @param N           table size, forwarded unchanged to hashFunc
 */
void testCollisions(unsigned int (*hashFunc)(const std::string&, unsigned int),
                    const std::vector<std::string>& testStrings, unsigned int N) {
    // bucket index -> number of strings hashed to that bucket
    std::unordered_map<unsigned int, int> bucketCounts;
    for (const std::string& str : testStrings) {
        ++bucketCounts[hashFunc(str, N)];
    }

    // Sum the surplus occupants of every bucket. Binding by const reference
    // avoids copying (and converting) each map entry on every iteration.
    int collisions = 0;
    for (const auto& entry : bucketCounts) {
        if (entry.second > 1) {
            collisions += entry.second - 1;
        }
    }

    std::cout << "Total Collisions: " << collisions << std::endl;
}
|
||||||
|
|
||||||
|
// generate `count` distinct test strings ("str0", "str1", ...) to feed to the hash functions.
|
||||||
|
/**
 * Builds the keys "str0", "str1", ..., "str<count-1>".
 *
 * @param count number of strings to generate; zero or a negative value
 *              yields an empty vector
 * @return the generated strings, in increasing index order
 */
std::vector<std::string> generateTestStrings(int count) {
    std::vector<std::string> testStrings;
    if (count > 0) {
        // The final size is known, so allocate once instead of growing
        // the vector repeatedly inside the loop.
        testStrings.reserve(static_cast<std::vector<std::string>::size_type>(count));
    }
    for (int i = 0; i < count; ++i) {
        testStrings.push_back("str" + std::to_string(i));  // example: "str0", "str1"...
    }
    return testStrings;
}
|
||||||
|
|
||||||
|
/**
 * Times one pass of hashFunc over every test string and prints the elapsed
 * time, as measured by std::clock(), in seconds to std::cout.
 *
 * @param hashFunc    hash function to time; called as hashFunc(str, N)
 * @param testStrings keys to hash
 * @param N           table size, forwarded unchanged to hashFunc
 */
void benchmark(unsigned int (*hashFunc)(const std::string&, unsigned int),
               const std::vector<std::string>& testStrings, unsigned int N) {
    unsigned int checksum = 0;

    const std::clock_t start = std::clock();
    for (const std::string& str : testStrings) {
        // Consume every result: if the return values were discarded, an
        // optimizing build could inline the hash and delete the whole loop,
        // leaving nothing to measure.
        checksum += hashFunc(str, N);
    }
    // The volatile store makes the checksum observable, and it happens
    // before the clock is stopped so the work stays inside the timed region.
    volatile unsigned int sink = checksum;
    (void)sink;
    const std::clock_t end = std::clock();

    const double timeTaken = double(end - start) / CLOCKS_PER_SEC;
    std::cout << "Execution Time: " << timeTaken << " seconds" << std::endl;
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
unsigned int N = 1000; // hash table size
|
||||||
|
std::vector<std::string> testStrings = generateTestStrings(5000);
|
||||||
|
|
||||||
|
std::cout << "Testing badHash (Summing ASCII values):\n";
|
||||||
|
testCollisions(badHash, testStrings, N);
|
||||||
|
benchmark(badHash, testStrings, N);
|
||||||
|
|
||||||
|
std::cout << "\nTesting betterHash (Multiplication by 31, a prime):\n";
|
||||||
|
testCollisions(betterHash, testStrings, N);
|
||||||
|
benchmark(betterHash, testStrings, N);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
Reference in New Issue
Block a user