solve hw 7
.vscode/launch.json (vendored): 12 changes
@@ -106,18 +106,30 @@
             "preLaunchTask": "C/C++: g++ build active file"
         },
         {
+<<<<<<< HEAD
             "name": "lab07",
+=======
+            "name": "nysearch",
+>>>>>>> 6b2a5ae (solve hw 7)
             "type": "cppdbg",
             "request": "launch",
             "program": "${fileDirname}/${fileBasenameNoExtension}",
             "args": [
+<<<<<<< HEAD
+=======
+                "html_files/index.html",
+>>>>>>> 6b2a5ae (solve hw 7)
                 "input.txt"
             ],
             "cwd": "${fileDirname}",
             "environment": [],
             "MIMode": "gdb",
             "miDebuggerPath": "/usr/bin/gdb",
+<<<<<<< HEAD
             "preLaunchTask": "C/C++: g++ build single active file"
+=======
+            "preLaunchTask": "C/C++: g++ build active file"
+>>>>>>> 6b2a5ae (solve hw 7)
         }
     ]
 }
.vscode/settings.json (vendored): 6 changes
@@ -73,6 +73,10 @@
     "queue": "cpp",
     "stack": "cpp",
     "set": "cpp",
-    "climits": "cpp"
+    "climits": "cpp",
+    "map": "cpp",
+    "unordered_set": "cpp",
+    "regex": "cpp",
+    "cinttypes": "cpp"
   }
 }
@@ -1,7 +1,7 @@
 HOMEWORK 7: Search Engine
 
 
-NAME: < insert name >
+NAME: Jinshan Zhou
 
 
 COLLABORATORS AND OTHER RESOURCES:
@@ -10,13 +10,13 @@ List the names of everyone you talked to about this assignment
 LMS, etc.), and all of the resources (books, online reference
 material, etc.) you consulted in completing this assignment.
 
-< insert collaborators / resources >
+some examples about recursively listing files under a path
 
 Remember: Your implementation for this assignment must be done on your
 own, as described in "Academic Integrity for Homework" handout.
 
 
-ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: < insert # hours >
+ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: 12hr
 
 
 MISC. COMMENTS TO GRADER:
@@ -33,5 +33,7 @@ What parts of the assignment did you find challenging? Is there anything that
 finally "clicked" for you in the process of working on this assignment? How well
 did the development and testing process go for you?
 
-< insert reflection >
+The program made a lot of mistakes at the beginning and I didn't know why. After
+some digging, I found it was caused by a missing "/" when joining paths. Once
+I fixed that, everything worked fine. The snippet logic also tricked me a bit,
+but it wasn't that hard (since it was easy to debug).
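
Editorial aside on the missing-"/" bug described in the reflection: the fix is to guarantee exactly one separator when a directory is concatenated with a relative link. A minimal sketch of that idea (joinPath is an illustrative name, not from the source; the actual handling lives in resolvePath in main.cpp below):

#include <string>

// Join a directory and a relative path with exactly one '/' between them.
// (Hypothetical helper; main.cpp folds this logic into resolvePath.)
std::string joinPath(std::string dir, std::string rel) {
    while (!dir.empty() && dir.back() == '/') dir.pop_back();   // drop trailing slashes
    while (!rel.empty() && rel.front() == '/') rel.erase(0, 1); // drop leading slashes
    return dir.empty() ? rel : dir + "/" + rel;
}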
hws/search_engine/dir_tree.txt (new file): 49 lines
@@ -0,0 +1,49 @@
html_files
├── file1.html
├── file2.html
├── index.html
└── subdir1
    ├── file3.html
    └── subdir2
        ├── file4.html
        ├── file5.html
        ├── file6.html
        ├── file7.html
        └── subdir3
            ├── file10.html
            ├── file8.html
            ├── file9.html
            ├── subdir4
            │   ├── file11.html
            │   ├── file12.html
            │   └── file13.html
            └── subdir5
                ├── file14.html
                ├── file15.html
                ├── file16.html
                └── subdir6
                    └── subdir7
                        ├── file17.html
                        ├── file18.html
                        ├── file19.html
                        └── subdir8
                            ├── file20.html
                            ├── file21.html
                            ├── file22.html
                            ├── file23.html
                            └── subdir9
                                ├── file24.html
                                ├── file25.html
                                └── subdir10
                                    ├── file26.html
                                    ├── file27.html
                                    ├── file28.html
                                    └── subdir11
                                        ├── file29.html
                                        ├── file30.html
                                        └── subdir12
                                            ├── file31.html
                                            └── subdir13
                                                └── file32.html

14 directories, 33 files
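
For the "recursively listing files under a path" idea cited as a resource in the README, a minimal C++17 sketch (std::filesystem is an assumption here and is not used by main.cpp, which discovers pages by following hyperlinks instead; compile with -std=c++17):

#include <filesystem>
#include <iostream>

// Print every regular file under the given root, recursively.
void listFiles(const std::filesystem::path& root) {
    for (const auto& entry : std::filesystem::recursive_directory_iterator(root)) {
        if (entry.is_regular_file()) {
            std::cout << entry.path().string() << "\n";
        }
    }
}

int main() {
    listFiles("html_files"); // e.g. the directory shown in dir_tree.txt above
}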
hws/search_engine/main.cpp (new file): 405 lines
@@ -0,0 +1,405 @@
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <vector>
#include <list>
#include <regex>
#include <algorithm>
#include <cctype>
#include <sstream>

// Global data structures
std::map<std::string, std::map<std::string, int> > invertedIndex;
std::map<std::string, int> docLengths;
std::map<std::string, int> outgoingLinksCount;
std::map<std::string, std::set<std::string> > backlinks;
std::map<std::string, std::string> docContents;

// Provided function to extract links from HTML
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
    std::list<std::string> links;
    std::regex linkRegex("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
    std::smatch match;
    std::string::const_iterator start = fileContent.begin();
    while (std::regex_search(start, fileContent.end(), match, linkRegex)) {
        if (match.size() > 1) {
            links.push_back(match[1].str());
        }
        start = match.suffix().first;
    }
    return links;
}

// Resolve relative path based on current directory
std::string resolvePath(const std::string& currentDir, const std::string& link) {
    if (link.find("../") == 0) {
        std::string dir = currentDir;
        if (!dir.empty() && dir.back() == '/') {
            dir.pop_back(); // Remove trailing slash if present
        }
        size_t pos = dir.find_last_of('/');
        if (pos != std::string::npos) {
            dir = dir.substr(0, pos);
        }
        std::string remaining = link.substr(3);
        return resolvePath(dir, remaining);
    } else if (link.find("./") == 0) {
        std::string remaining = link.substr(2);
        return resolvePath(currentDir, remaining);
    } else if (link.empty()) {
        return currentDir; // Handle empty links if applicable
    } else {
        // Ensure exactly one slash between currentDir and link
        std::string dir = currentDir;
        std::string lnk = link;
        if (!dir.empty() && dir.back() == '/') {
            dir.pop_back(); // Remove trailing slash from currentDir
        }
        if (!lnk.empty() && lnk.front() == '/') {
            lnk.erase(0, 1); // Remove leading slash from link
        }
        return dir + "/" + lnk;
    }
}

// Get directory from URL
std::string getDirectory(const std::string& url) {
    size_t pos = url.find_last_of('/');
    if (pos != std::string::npos) {
        return url.substr(0, pos + 1);
    }
    return "";
}

// Extract word frequencies with word boundaries
// (casts to unsigned char keep std::isalnum well-defined for non-ASCII bytes)
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
    std::map<std::string, int> freq;
    size_t i = 0;
    while (i < content.size()) {
        if (std::isalnum(static_cast<unsigned char>(content[i]))) {
            if (i == 0 || !std::isalnum(static_cast<unsigned char>(content[i - 1]))) {
                size_t j = i;
                while (j < content.size() && std::isalnum(static_cast<unsigned char>(content[j]))) {
                    j++;
                }
                if (j == content.size() || !std::isalnum(static_cast<unsigned char>(content[j]))) {
                    std::string word = content.substr(i, j - i);
                    freq[word]++;
                    i = j;
                } else {
                    i = j;
                }
            } else {
                i++;
            }
        } else {
            i++;
        }
    }
    return freq;
}
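
// Example (editorial sketch): extractWordFrequencies("cat, cat dog!") yields
// {"cat": 2, "dog": 1} -- a token is a maximal run of alphanumeric characters,
// and the boundary checks keep substrings like "at" inside "cat" from counting.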

// Check if phrase exists with word boundaries
bool phraseExists(const std::string& content, const std::string& phrase) {
    size_t pos = 0;
    while ((pos = content.find(phrase, pos)) != std::string::npos) {
        bool before = (pos == 0 || !std::isalnum(static_cast<unsigned char>(content[pos - 1])));
        bool after = (pos + phrase.size() == content.size() ||
                      !std::isalnum(static_cast<unsigned char>(content[pos + phrase.size()])));
        if (before && after) {
            return true;
        }
        pos += 1;
    }
    return false;
}

// Helper function to find a whole word with word boundaries
size_t findWholeWord(const std::string& str, const std::string& word) {
    size_t pos = 0;
    while ((pos = str.find(word, pos)) != std::string::npos) {
        bool before = (pos == 0 || !std::isalnum(static_cast<unsigned char>(str[pos - 1])));
        bool after = (pos + word.size() == str.size() ||
                      !std::isalnum(static_cast<unsigned char>(str[pos + word.size()])));
        if (before && after) {
            return pos;
        }
        pos += 1;
    }
    return std::string::npos;
}

// Extract title from <head>
std::string getTitle(const std::string& content) {
    size_t start = content.find("<title>");
    if (start == std::string::npos) return "";
    start += 7; // length of "<title>"
    size_t end = content.find("</title>", start);
    if (end == std::string::npos) return "";
    return content.substr(start, end - start);
}

// Extract description from <meta>
std::string getDescription(const std::string& content) {
    size_t pos = content.find("<meta name=\"description\" content=\"");
    if (pos == std::string::npos) return "";
    pos += 34; // length of the marker string above
    size_t end = content.find("\"", pos);
    if (end == std::string::npos) return "";
    return content.substr(pos, end - pos);
}

// Extract <body> content
std::string getBody(const std::string& content) {
    size_t start = content.find("<body>");
    if (start == std::string::npos) return "";
    start += 6; // length of "<body>"
    size_t end = content.find("</body>", start);
    if (end == std::string::npos) return "";
    return content.substr(start, end - start);
}

// Find sentence start position
size_t findSentenceStart(const std::string& body, size_t pos) {
    size_t periodPos = body.rfind(".", pos);
    if (periodPos == std::string::npos) {
        return 0;
    }
    size_t start = periodPos + 1;
    while (start < body.size() && std::isspace(static_cast<unsigned char>(body[start]))) {
        start++;
    }
    return start;
}

// Generate 120-character snippet
std::string generateSnippet(const std::string& body, const std::string& query) {
    // Try to find the exact query with word boundaries first
    size_t pos = findWholeWord(body, query);
    if (pos != std::string::npos) {
        size_t start = findSentenceStart(body, pos);
        if (start + 120 <= body.size()) {
            return body.substr(start, 120);
        }
        return body.substr(start);
    } else {
        // If not found, try individual words
        std::vector<std::string> words;
        std::istringstream iss(query);
        std::string word;
        while (iss >> word) {
            words.push_back(word);
        }
        if (!words.empty()) {
            for (const std::string& w : words) {
                pos = findWholeWord(body, w);
                if (pos != std::string::npos) {
                    size_t start = findSentenceStart(body, pos);
                    if (start + 120 <= body.size()) {
                        return body.substr(start, 120);
                    }
                    return body.substr(start);
                }
            }
        }

        // If nothing found, return beginning of body
        if (body.size() <= 120) {
            return body;
        }
        return body.substr(0, 120);
    }
}
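
// Editorial note: generateSnippet starts at the beginning of the sentence
// containing the first whole-word match (whole query first, then each query
// word in turn) and emits at most 120 characters, falling back to the start
// of the body when nothing matches.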

// Split string by whitespace
std::vector<std::string> split(const std::string& s) {
    std::vector<std::string> words;
    std::istringstream iss(s);
    std::string word;
    while (iss >> word) {
        words.push_back(word);
    }
    return words;
}

// Recursive crawl function
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
    if (visited.count(currentURL)) return;
    visited.insert(currentURL);

    std::ifstream fileStream(currentURL.c_str());
    if (!fileStream.is_open()) {
        std::cerr << "Failed to open " << currentURL << std::endl;
        return;
    }
    std::string content((std::istreambuf_iterator<char>(fileStream)), std::istreambuf_iterator<char>());
    fileStream.close();

    docContents[currentURL] = content;
    docLengths[currentURL] = content.length();

    std::map<std::string, int> freq = extractWordFrequencies(content);
    for (std::map<std::string, int>::const_iterator it = freq.begin(); it != freq.end(); ++it) {
        invertedIndex[it->first][currentURL] = it->second;
    }

    std::list<std::string> links = extractLinksFromHTML(content);
    outgoingLinksCount[currentURL] = links.size();
    std::string currentDir = getDirectory(currentURL);

    // Register backlinks for every outgoing link before recursing, so a
    // target's in-links are counted even when it was already visited.
    for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
        std::string targetURL = resolvePath(currentDir, *it);
        backlinks[targetURL].insert(currentURL);
    }

    for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
        std::string targetURL = resolvePath(currentDir, *it);
        if (!visited.count(targetURL)) {
            crawl(targetURL, visited);
        }
    }
}
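
// Ranking notes (editorial summary of the scoring in main below): for each
// candidate document d and query word w,
//   density(d) += tf(w, d) / (len(d) * (total_tf(w) / total_len)),
// each page p that links to d adds 1 / (1 + outdegree(p)) to the backlinks
// score, and the final score is 0.5 * density + 0.5 * backlinks.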

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
        return 1;
    }

    std::string seedURL = argv[1];
    std::string inputFile = argv[2];
    std::set<std::string> visited;
    crawl(seedURL, visited);

    double totalLength = 0.0;
    for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
        totalLength += it->second;
    }

    std::ifstream inputStream(inputFile.c_str());
    if (!inputStream.is_open()) {
        std::cerr << "Failed to open " << inputFile << std::endl;
        return 1;
    }

    std::vector<std::string> queries;
    std::string line;
    while (std::getline(inputStream, line)) {
        queries.push_back(line);
    }
    inputStream.close();

    for (size_t i = 0; i < queries.size(); ++i) {
        std::string query = queries[i];
        std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
        std::ofstream outStream(outputFile.c_str());
        if (!outStream.is_open()) {
            std::cerr << "Failed to open " << outputFile << std::endl;
            continue;
        }

        bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
        std::string phrase;
        if (isPhraseSearch) {
            phrase = query.substr(1, query.size() - 2);
            query = phrase;
        }

        // Candidate set: documents containing every query word (AND semantics)
        std::vector<std::string> words = split(query);
        std::set<std::string> candidates;
        if (!words.empty()) {
            std::string firstWord = words[0];
            if (invertedIndex.count(firstWord)) {
                std::map<std::string, int> docs = invertedIndex[firstWord];
                for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                    candidates.insert(it->first);
                }
            }
            for (size_t j = 1; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex.count(word)) {
                    std::set<std::string> temp;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                        if (candidates.count(it->first)) {
                            temp.insert(it->first);
                        }
                    }
                    candidates = temp;
                } else {
                    candidates.clear();
                    break;
                }
            }
        }

        if (isPhraseSearch) {
            std::set<std::string> filtered;
            for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
                if (phraseExists(docContents[*it], phrase)) {
                    filtered.insert(*it);
                }
            }
            candidates = filtered;
        }

        if (candidates.empty()) {
            outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
            outStream.close();
            continue;
        }

        std::vector<std::pair<std::string, double> > scores;
        for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
            std::string doc = *it;
            double densityScore = 0.0;
            for (size_t j = 0; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex[word].count(doc)) {
                    int freq = invertedIndex[word][doc];
                    double totalOccurrences = 0.0;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
                        totalOccurrences += docIt->second;
                    }
                    double keywordDensityAcrossAll = totalOccurrences / totalLength;
                    densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
                }
            }

            double backlinksScore = 0.0;
            if (backlinks.count(doc)) {
                std::set<std::string> linkers = backlinks[doc];
                for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
                    backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
                }
            }

            double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
            scores.push_back(std::pair<std::string, double>(doc, pageScore));
        }

        std::sort(scores.begin(), scores.end(),
            [](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
                return a.second > b.second;
            });

        outStream << "Matching documents: " << std::endl;
        for (size_t j = 0; j < scores.size(); ++j) {
            std::string doc = scores[j].first;
            std::string content = docContents[doc];
            std::string title = getTitle(content);
            std::string description = getDescription(content);
            std::string body = getBody(content);
            std::string snippet = generateSnippet(body, query);

            outStream << std::endl << "Title: " << title << std::endl;
            outStream << "URL: " << doc << std::endl;
            outStream << "Description: " << description << std::endl;
            outStream << "Snippet: " << snippet << std::endl;
        }
        outStream.close();
    }

    return 0;
}
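
For reference, a plausible way to build and run this outside VS Code, matching the launch configuration above (the exact flags are assumptions; C++11 or later is required for std::to_string and the sort lambda):

g++ -std=c++11 main.cpp -o nysearch
./nysearch html_files/index.html input.txt

Each query line in input.txt then produces a ranked results file out<N>.txt (out1.txt, out2.txt, ...).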