solve hw 7

This commit is contained in:
JamesFlare1212
2025-03-27 23:45:56 -04:00
parent 48d8b8fcfd
commit ff290ace15
5 changed files with 478 additions and 6 deletions

View File

@@ -1,7 +1,7 @@
HOMEWORK 7: Search Engine
NAME: < insert name >
NAME: Jinshan Zhou
COLLABORATORS AND OTHER RESOURCES:
@@ -10,13 +10,13 @@ List the names of everyone you talked to about this assignment
LMS, etc.), and all of the resources (books, online reference
material, etc.) you consulted in completing this assignment.
< insert collaborators / resources >
Some examples about recursively listing files under a path.
Remember: Your implementation for this assignment must be done on your
own, as described in "Academic Integrity for Homework" handout.
ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: < insert # hours >
ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: 12hr
MISC. COMMENTS TO GRADER:
@@ -33,5 +33,7 @@ What parts of the assignment did you find challenging? Is there anything that
finally "clicked" for you in the process of working on this assignment? How well
did the development and testing process go for you?
< insert reflection >
The program made a lot of mistakes at the beginning and I didn't know why. After
some deep research, I found it was caused by a missing "/" when joining paths.
I fixed it and everything works fine. The snippet also tricked me a bit, but it
isn't that hard (since it is easy to debug).

View File

@@ -0,0 +1,49 @@
html_files
├── file1.html
├── file2.html
├── index.html
└── subdir1
├── file3.html
└── subdir2
├── file4.html
├── file5.html
├── file6.html
├── file7.html
└── subdir3
├── file10.html
├── file8.html
├── file9.html
├── subdir4
│   ├── file11.html
│   ├── file12.html
│   └── file13.html
└── subdir5
├── file14.html
├── file15.html
├── file16.html
└── subdir6
└── subdir7
├── file17.html
├── file18.html
├── file19.html
└── subdir8
├── file20.html
├── file21.html
├── file22.html
├── file23.html
└── subdir9
├── file24.html
├── file25.html
└── subdir10
├── file26.html
├── file27.html
├── file28.html
└── subdir11
├── file29.html
├── file30.html
└── subdir12
├── file31.html
└── subdir13
└── file32.html
14 directories, 33 files

405
hws/search_engine/main.cpp Normal file
View File

@@ -0,0 +1,405 @@
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <vector>
#include <list>
#include <regex>
#include <algorithm>
#include <cctype>
#include <sstream>
// Global data structures shared between crawl() and the query loop in main().
// word -> (document URL -> number of occurrences of that word in the document)
std::map<std::string, std::map<std::string, int> > invertedIndex;
// document URL -> total character count of the file (used for keyword density)
std::map<std::string, int> docLengths;
// document URL -> number of <a href> links it contains (used for backlink score)
std::map<std::string, int> outgoingLinksCount;
// document URL -> set of documents that link TO it
std::map<std::string, std::set<std::string> > backlinks;
// document URL -> full raw file contents (for phrase search and snippets)
std::map<std::string, std::string> docContents;
// Provided function to extract links from HTML
// Collect the href target of every <a ... href="..."> tag in the document,
// in order of appearance.
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
    static const std::regex anchorPattern("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
    std::list<std::string> result;
    std::sregex_iterator it(fileContent.begin(), fileContent.end(), anchorPattern);
    std::sregex_iterator end;
    for (; it != end; ++it) {
        // Capture group 1 is the quoted href value.
        result.push_back((*it)[1].str());
    }
    return result;
}
// Resolve relative path based on current directory
// Resolve a (possibly relative) link against the current directory.
// Handles leading "../" (pop one directory component) and "./" (no-op)
// recursively; the base case joins the two parts with exactly one '/'.
std::string resolvePath(const std::string& currentDir, const std::string& link) {
    if (link.rfind("../", 0) == 0) {
        // Strip one directory component from currentDir, then recurse on the rest.
        std::string parent = currentDir;
        if (!parent.empty() && parent.back() == '/') {
            parent.pop_back();
        }
        size_t slash = parent.find_last_of('/');
        if (slash != std::string::npos) {
            parent = parent.substr(0, slash);
        }
        return resolvePath(parent, link.substr(3));
    }
    if (link.rfind("./", 0) == 0) {
        // "./" refers to the current directory; just drop it.
        return resolvePath(currentDir, link.substr(2));
    }
    if (link.empty()) {
        return currentDir;
    }
    // Join with exactly one slash between the two pieces.
    std::string base = currentDir;
    std::string rest = link;
    if (!base.empty() && base.back() == '/') {
        base.pop_back();
    }
    if (!rest.empty() && rest.front() == '/') {
        rest.erase(0, 1);
    }
    return base + "/" + rest;
}
// Get directory from URL
// Return the directory portion of a URL, including the trailing '/'.
// A URL with no '/' at all yields the empty string.
std::string getDirectory(const std::string& url) {
    size_t lastSlash = url.find_last_of('/');
    return (lastSlash == std::string::npos) ? "" : url.substr(0, lastSlash + 1);
}
// Extract word frequencies with word boundaries
// Count each maximal run of alphanumeric characters in `content` as one word.
// Returns a map from word (case-sensitive, exact bytes) to occurrence count.
//
// Fix: std::isalnum has undefined behavior for negative char values (any
// non-ASCII byte on platforms where char is signed), so every character is
// cast to unsigned char first. The original's inner boundary re-checks were
// redundant — after skipping a run, `i` always lands on a word start — so the
// scan is simplified to: skip non-alnum bytes, otherwise consume a whole run.
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
    std::map<std::string, int> freq;
    size_t i = 0;
    while (i < content.size()) {
        if (std::isalnum(static_cast<unsigned char>(content[i]))) {
            // Consume the whole alphanumeric run [i, j).
            size_t j = i;
            while (j < content.size() && std::isalnum(static_cast<unsigned char>(content[j]))) {
                j++;
            }
            freq[content.substr(i, j - i)]++;
            i = j;
        } else {
            i++;
        }
    }
    return freq;
}
// Check if phrase exists with word boundaries
// Return true iff `phrase` occurs in `content` delimited by non-alphanumeric
// characters (or the string edges) on both sides — i.e. as whole words, not
// as a substring of a longer word.
//
// Fix: cast to unsigned char before std::isalnum — passing a negative char
// (any non-ASCII byte with signed char) is undefined behavior.
bool phraseExists(const std::string& content, const std::string& phrase) {
    size_t pos = 0;
    while ((pos = content.find(phrase, pos)) != std::string::npos) {
        bool before = (pos == 0 ||
                       !std::isalnum(static_cast<unsigned char>(content[pos - 1])));
        bool after = (pos + phrase.size() == content.size() ||
                      !std::isalnum(static_cast<unsigned char>(content[pos + phrase.size()])));
        if (before && after) {
            return true;
        }
        pos += 1;  // keep scanning past this (partial-word) hit
    }
    return false;
}
// Helper function to find a whole word with word boundaries
// Return the position of the first whole-word occurrence of `word` in `str`
// (bounded by non-alphanumeric characters or string edges), or npos.
//
// Fix: cast to unsigned char before std::isalnum — passing a negative char
// (any non-ASCII byte with signed char) is undefined behavior.
size_t findWholeWord(const std::string& str, const std::string& word) {
    size_t pos = 0;
    while ((pos = str.find(word, pos)) != std::string::npos) {
        bool before = (pos == 0 ||
                       !std::isalnum(static_cast<unsigned char>(str[pos - 1])));
        bool after = (pos + word.size() == str.size() ||
                      !std::isalnum(static_cast<unsigned char>(str[pos + word.size()])));
        if (before && after) {
            return pos;
        }
        pos += 1;  // keep scanning past this (partial-word) hit
    }
    return std::string::npos;
}
// Extract title from <head>
// Return the text between <title> and </title>, or "" if either tag is absent.
std::string getTitle(const std::string& content) {
    const std::string openTag = "<title>";
    size_t begin = content.find(openTag);
    if (begin == std::string::npos) return "";
    begin += openTag.size();
    size_t close = content.find("</title>", begin);
    if (close == std::string::npos) return "";
    return content.substr(begin, close - begin);
}
// Extract description from <meta>
// Return the content attribute of <meta name="description" content="...">,
// or "" if the tag (in exactly this attribute order/quoting) is absent.
//
// Fix: the skip distance is derived from the marker string itself instead of
// the magic constant 34, so the two can never drift out of sync.
std::string getDescription(const std::string& content) {
    static const std::string marker = "<meta name=\"description\" content=\"";
    size_t pos = content.find(marker);
    if (pos == std::string::npos) return "";
    pos += marker.size();
    size_t end = content.find('"', pos);
    if (end == std::string::npos) return "";
    return content.substr(pos, end - pos);
}
// Extract <body> content
// Return the text between <body> and </body>, or "" if either tag is absent.
std::string getBody(const std::string& content) {
    const std::string openTag = "<body>";
    size_t begin = content.find(openTag);
    if (begin == std::string::npos) return "";
    begin += openTag.size();
    size_t close = content.find("</body>", begin);
    if (close == std::string::npos) return "";
    return content.substr(begin, close - begin);
}
// Find sentence start position
// Return the start of the sentence containing position `pos`: the first
// non-whitespace character after the nearest '.' at or before `pos`,
// or 0 when there is no earlier period.
//
// Fixes: cast to unsigned char before std::isspace (UB for negative char
// values), and search for the single character '.' rather than a one-char
// C string.
size_t findSentenceStart(const std::string& body, size_t pos) {
    size_t periodPos = body.rfind('.', pos);
    if (periodPos == std::string::npos) {
        return 0;
    }
    size_t start = periodPos + 1;
    while (start < body.size() && std::isspace(static_cast<unsigned char>(body[start]))) {
        start++;
    }
    return start;
}
// Generate 120-character snippet
// Build a snippet of at most 120 characters for the result listing.
// Preference order: sentence containing the whole query as a whole word,
// then sentence containing any individual query word, then the start of body.
std::string generateSnippet(const std::string& body, const std::string& query) {
    // substr's count is clamped to the remaining length, so this yields at
    // most 120 characters starting at the enclosing sentence.
    auto clip = [&body](size_t at) {
        return body.substr(findSentenceStart(body, at), 120);
    };
    size_t hit = findWholeWord(body, query);
    if (hit != std::string::npos) {
        return clip(hit);
    }
    // Fall back to the first query word that appears anywhere in the body.
    std::istringstream tokens(query);
    std::string token;
    while (tokens >> token) {
        hit = findWholeWord(body, token);
        if (hit != std::string::npos) {
            return clip(hit);
        }
    }
    // Nothing matched: show the opening of the body (clamped to its length).
    return body.substr(0, 120);
}
// Split string by whitespace
// Tokenize `s` on runs of whitespace, preserving token order.
std::vector<std::string> split(const std::string& s) {
    std::vector<std::string> tokens;
    std::istringstream stream(s);
    for (std::string tok; stream >> tok; ) {
        tokens.push_back(tok);
    }
    return tokens;
}
// Recursive crawl function
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
if (visited.count(currentURL)) return;
visited.insert(currentURL);
std::ifstream fileStream(currentURL.c_str());
if (!fileStream.is_open()) {
std::cerr << "Failed to open " << currentURL << std::endl;
return;
}
std::string content((std::istreambuf_iterator<char>(fileStream)), std::istreambuf_iterator<char>());
fileStream.close();
docContents[currentURL] = content;
docLengths[currentURL] = content.length();
std::map<std::string, int> freq = extractWordFrequencies(content);
for (std::map<std::string, int>::const_iterator it = freq.begin(); it != freq.end(); ++it) {
invertedIndex[it->first][currentURL] = it->second;
}
std::list<std::string> links = extractLinksFromHTML(content);
outgoingLinksCount[currentURL] = links.size();
std::string currentDir = getDirectory(currentURL);
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
std::string targetURL = resolvePath(currentDir, *it);
backlinks[targetURL].insert(currentURL);
}
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
std::string targetURL = resolvePath(currentDir, *it);
if (!visited.count(targetURL)) {
crawl(targetURL, visited);
}
}
}
// Entry point. Usage: <seedURL> <inputFile>.
// Crawls the link graph starting at seedURL, reads one query per line from
// inputFile, and writes the ranked results of query i to out<i+1>.txt.
int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
        return 1;
    }
    std::string seedURL = argv[1];
    std::string inputFile = argv[2];
    // Build all global index structures by crawling from the seed.
    std::set<std::string> visited;
    crawl(seedURL, visited);
    // Total character count across all crawled documents (denominator of the
    // corpus-wide keyword density below).
    double totalLength = 0.0;
    for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
        totalLength += it->second;
    }
    // Read the query list, one query per line.
    std::ifstream inputStream(inputFile.c_str());
    if (!inputStream.is_open()) {
        std::cerr << "Failed to open " << inputFile << std::endl;
        return 1;
    }
    std::vector<std::string> queries;
    std::string line;
    while (std::getline(inputStream, line)) {
        queries.push_back(line);
    }
    inputStream.close();
    for (size_t i = 0; i < queries.size(); ++i) {
        std::string query = queries[i];
        // Results of query i go to out<i+1>.txt.
        std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
        std::ofstream outStream(outputFile.c_str());
        if (!outStream.is_open()) {
            std::cerr << "Failed to open " << outputFile << std::endl;
            continue;
        }
        // A query wrapped in double quotes is an exact-phrase search.
        bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
        std::string phrase;
        if (isPhraseSearch) {
            // Strip the surrounding quotes; the words are still ANDed below.
            phrase = query.substr(1, query.size() - 2);
            query = phrase;
        }
        std::vector<std::string> words = split(query);
        // Candidate documents: those containing ALL query words
        // (intersection of the inverted-index entries, seeded by word 0).
        std::set<std::string> candidates;
        if (!words.empty()) {
            std::string firstWord = words[0];
            if (invertedIndex.count(firstWord)) {
                std::map<std::string, int> docs = invertedIndex[firstWord];
                for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                    candidates.insert(it->first);
                }
            }
            for (size_t j = 1; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex.count(word)) {
                    // Keep only candidates that also contain this word.
                    std::set<std::string> temp;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                        if (candidates.count(it->first)) {
                            temp.insert(it->first);
                        }
                    }
                    candidates = temp;
                } else {
                    // A word absent from every document empties the intersection.
                    candidates.clear();
                    break;
                }
            }
        }
        // Phrase search additionally requires the exact phrase (with word
        // boundaries) somewhere in the raw document text.
        if (isPhraseSearch) {
            std::set<std::string> filtered;
            for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
                if (phraseExists(docContents[*it], phrase)) {
                    filtered.insert(*it);
                }
            }
            candidates = filtered;
        }
        if (candidates.empty()) {
            outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
            outStream.close();
            continue;
        }
        // Score each candidate: 0.5 * keyword-density score + 0.5 * backlink score.
        std::vector<std::pair<std::string, double> > scores;
        for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
            std::string doc = *it;
            double densityScore = 0.0;
            for (size_t j = 0; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex[word].count(doc)) {
                    int freq = invertedIndex[word][doc];
                    // Occurrences of this word across the whole corpus.
                    double totalOccurrences = 0.0;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
                        totalOccurrences += docIt->second;
                    }
                    double keywordDensityAcrossAll = totalOccurrences / totalLength;
                    // In-document frequency normalized by document length and
                    // by the corpus-wide density of the same word.
                    densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
                }
            }
            // Each page linking here contributes 1/(1 + its outgoing link count).
            double backlinksScore = 0.0;
            if (backlinks.count(doc)) {
                std::set<std::string> linkers = backlinks[doc];
                for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
                    backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
                }
            }
            double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
            scores.push_back(std::pair<std::string, double>(doc, pageScore));
        }
        // Rank highest score first.
        std::sort(scores.begin(), scores.end(),
            [](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
                return a.second > b.second;
            });
        // Emit title / URL / description / snippet for every match, best first.
        outStream << "Matching documents: " << std::endl;
        for (size_t j = 0; j < scores.size(); ++j) {
            std::string doc = scores[j].first;
            std::string content = docContents[doc];
            std::string title = getTitle(content);
            std::string description = getDescription(content);
            std::string body = getBody(content);
            std::string snippet = generateSnippet(body, query);
            outStream << std::endl << "Title: " << title << std::endl;
            outStream << "URL: " << doc << std::endl;
            outStream << "Description: " << description << std::endl;
            outStream << "Snippet: " << snippet << std::endl;
        }
        outStream.close();
    }
    return 0;
}