diff --git a/.vscode/launch.json b/.vscode/launch.json index 71daa57..1ab0735 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -106,18 +106,19 @@ "preLaunchTask": "C/C++: g++ build active file" }, { - "name": "lab07", + "name": "nysearch", "type": "cppdbg", "request": "launch", "program": "${fileDirname}/${fileBasenameNoExtension}", "args": [ + "html_files/index.html", "input.txt" ], "cwd": "${fileDirname}", "environment": [], "MIMode": "gdb", "miDebuggerPath": "/usr/bin/gdb", - "preLaunchTask": "C/C++: g++ build single active file" + "preLaunchTask": "C/C++: g++ build active file" } ] } diff --git a/.vscode/settings.json b/.vscode/settings.json index e53c2b2..5115d15 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -73,6 +73,10 @@ "queue": "cpp", "stack": "cpp", "set": "cpp", - "climits": "cpp" + "climits": "cpp", + "map": "cpp", + "unordered_set": "cpp", + "regex": "cpp", + "cinttypes": "cpp" } } \ No newline at end of file diff --git a/hws/search_engine/README.txt b/hws/search_engine/README.txt index 6381d47..ed948b2 100644 --- a/hws/search_engine/README.txt +++ b/hws/search_engine/README.txt @@ -1,7 +1,7 @@ HOMEWORK 7: Search Engine -NAME: < insert name > +NAME: Jinshan Zhou COLLABORATORS AND OTHER RESOURCES: @@ -10,13 +10,13 @@ List the names of everyone you talked to about this assignment LMS, etc.), and all of the resources (books, online reference material, etc.) you consulted in completing this assignment. -< insert collaborators / resources > +some online examples about recursively listing the files under a path Remember: Your implementation for this assignment must be done on your own, as described in "Academic Integrity for Homework" handout. -ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: < insert # hours > +ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: 12hr MISC. 
COMMENTS TO GRADER: @@ -33,5 +33,7 @@ What parts of the assignment did you find challenging? Is there anything that finally "clicked" for you in the process of working on this assignment? How well did the development and testing process go for you? -< insert reflection > - +At first the program made many mistakes and I did not know why. After +digging deeper, I found it was caused by a missing "/" when joining paths. +I fixed that and everything works fine. The snippet generation also tricked +me for a bit, but it was not that hard (since it was easy to debug). diff --git a/hws/search_engine/dir_tree.txt b/hws/search_engine/dir_tree.txt new file mode 100644 index 0000000..2f4d437 --- /dev/null +++ b/hws/search_engine/dir_tree.txt @@ -0,0 +1,49 @@ +html_files +├── file1.html +├── file2.html +├── index.html +└── subdir1 + ├── file3.html + └── subdir2 + ├── file4.html + ├── file5.html + ├── file6.html + ├── file7.html + └── subdir3 + ├── file10.html + ├── file8.html + ├── file9.html + ├── subdir4 + │   ├── file11.html + │   ├── file12.html + │   └── file13.html + └── subdir5 + ├── file14.html + ├── file15.html + ├── file16.html + └── subdir6 + └── subdir7 + ├── file17.html + ├── file18.html + ├── file19.html + └── subdir8 + ├── file20.html + ├── file21.html + ├── file22.html + ├── file23.html + └── subdir9 + ├── file24.html + ├── file25.html + └── subdir10 + ├── file26.html + ├── file27.html + ├── file28.html + └── subdir11 + ├── file29.html + ├── file30.html + └── subdir12 + ├── file31.html + └── subdir13 + └── file32.html + +14 directories, 33 files diff --git a/hws/search_engine/main.cpp b/hws/search_engine/main.cpp new file mode 100644 index 0000000..e6b9004 --- /dev/null +++ b/hws/search_engine/main.cpp @@ -0,0 +1,405 @@ +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <vector> +#include <list> +#include <map> +#include <set> +#include <regex> +#include <algorithm> +#include <cctype> + +// Global data structures +std::map<std::string, std::map<std::string, int> > invertedIndex; +std::map<std::string, int> docLengths; +std::map<std::string, int> outgoingLinksCount; +std::map<std::string, std::set<std::string> > backlinks; +std::map<std::string, std::string> docContents; 
+// Provided function to extract links from HTML +std::list extractLinksFromHTML(const std::string& fileContent) { + std::list links; + std::regex linkRegex("]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>"); + std::smatch match; + std::string::const_iterator start = fileContent.begin(); + while (std::regex_search(start, fileContent.end(), match, linkRegex)) { + if (match.size() > 1) { + links.push_back(match[1].str()); + } + start = match.suffix().first; + } + return links; +} + +// Resolve relative path based on current directory +std::string resolvePath(const std::string& currentDir, const std::string& link) { + if (link.find("../") == 0) { + std::string dir = currentDir; + if (!dir.empty() && dir.back() == '/') { + dir.pop_back(); // Remove trailing slash if present + } + size_t pos = dir.find_last_of('/'); + if (pos != std::string::npos) { + dir = dir.substr(0, pos); + } + std::string remaining = link.substr(3); + return resolvePath(dir, remaining); + } else if (link.find("./") == 0) { + std::string remaining = link.substr(2); + return resolvePath(currentDir, remaining); + } else if (link.empty()) { + return currentDir; // Handle empty links if applicable + } else { + // Ensure exactly one slash between currentDir and link + std::string dir = currentDir; + std::string lnk = link; + if (!dir.empty() && dir.back() == '/') { + dir.pop_back(); // Remove trailing slash from currentDir + } + if (!lnk.empty() && lnk.front() == '/') { + lnk.erase(0, 1); // Remove leading slash from link + } + return dir + "/" + lnk; + } +} + +// Get directory from URL +std::string getDirectory(const std::string& url) { + size_t pos = url.find_last_of('/'); + if (pos != std::string::npos) { + return url.substr(0, pos + 1); + } + return ""; +} + +// Extract word frequencies with word boundaries +std::map extractWordFrequencies(const std::string& content) { + std::map freq; + size_t i = 0; + while (i < content.size()) { + if (std::isalnum(content[i])) { + if (i == 0 || !std::isalnum(content[i - 
1])) { + size_t j = i; + while (j < content.size() && std::isalnum(content[j])) { + j++; + } + if (j == content.size() || !std::isalnum(content[j])) { + std::string word = content.substr(i, j - i); + freq[word]++; + i = j; + } else { + i = j; + } + } else { + i++; + } + } else { + i++; + } + } + return freq; +} + +// Check if phrase exists with word boundaries +bool phraseExists(const std::string& content, const std::string& phrase) { + size_t pos = 0; + while ((pos = content.find(phrase, pos)) != std::string::npos) { + bool before = (pos == 0 || !std::isalnum(content[pos - 1])); + bool after = (pos + phrase.size() == content.size() || !std::isalnum(content[pos + phrase.size()])); + if (before && after) { + return true; + } + pos += 1; + } + return false; +} + +// Helper function to find a whole word with word boundaries +size_t findWholeWord(const std::string& str, const std::string& word) { + size_t pos = 0; + while ((pos = str.find(word, pos)) != std::string::npos) { + bool before = (pos == 0 || !std::isalnum(str[pos - 1])); + bool after = (pos + word.size() == str.size() || !std::isalnum(str[pos + word.size()])); + if (before && after) { + return pos; + } + pos += 1; + } + return std::string::npos; +} + +// Extract title from +std::string getTitle(const std::string& content) { + size_t start = content.find(""); + if (start == std::string::npos) return ""; + start += 7; + size_t end = content.find("", start); + if (end == std::string::npos) return ""; + return content.substr(start, end - start); +} + +// Extract description from +std::string getDescription(const std::string& content) { + size_t pos = content.find(" content +std::string getBody(const std::string& content) { + size_t start = content.find(""); + if (start == std::string::npos) return ""; + start += 6; + size_t end = content.find("", start); + if (end == std::string::npos) return ""; + return content.substr(start, end - start); +} + +// Find sentence start position +size_t findSentenceStart(const 
std::string& body, size_t pos) { + size_t periodPos = body.rfind(".", pos); + if (periodPos == std::string::npos) { + return 0; + } + size_t start = periodPos + 1; + while (start < body.size() && std::isspace(body[start])) { + start++; + } + return start; +} + +// Generate 120-character snippet +std::string generateSnippet(const std::string& body, const std::string& query) { + // Try to find the exact query with word boundaries first + size_t pos = findWholeWord(body, query); + if (pos != std::string::npos) { + size_t start = findSentenceStart(body, pos); + if (start + 120 <= body.size()) { + return body.substr(start, 120); + } + return body.substr(start); + } else { + // If not found, try individual words + std::vector words; + std::istringstream iss(query); + std::string word; + while (iss >> word) { + words.push_back(word); + } + if (!words.empty()) { + for (const std::string& w : words) { + pos = findWholeWord(body, w); + if (pos != std::string::npos) { + size_t start = findSentenceStart(body, pos); + if (start + 120 <= body.size()) { + return body.substr(start, 120); + } + return body.substr(start); + } + } + } + + // If nothing found, return beginning of body + if (body.size() <= 120) { + return body; + } + return body.substr(0, 120); + } +} + +// Split string by whitespace +std::vector split(const std::string& s) { + std::vector words; + std::istringstream iss(s); + std::string word; + while (iss >> word) { + words.push_back(word); + } + return words; +} + +// Recursive crawl function +void crawl(const std::string& currentURL, std::set& visited) { + if (visited.count(currentURL)) return; + visited.insert(currentURL); + + std::ifstream fileStream(currentURL.c_str()); + if (!fileStream.is_open()) { + std::cerr << "Failed to open " << currentURL << std::endl; + return; + } + std::string content((std::istreambuf_iterator(fileStream)), std::istreambuf_iterator()); + fileStream.close(); + + docContents[currentURL] = content; + docLengths[currentURL] = 
content.length(); + + std::map freq = extractWordFrequencies(content); + for (std::map::const_iterator it = freq.begin(); it != freq.end(); ++it) { + invertedIndex[it->first][currentURL] = it->second; + } + + std::list links = extractLinksFromHTML(content); + outgoingLinksCount[currentURL] = links.size(); + std::string currentDir = getDirectory(currentURL); + + for (std::list::const_iterator it = links.begin(); it != links.end(); ++it) { + std::string targetURL = resolvePath(currentDir, *it); + backlinks[targetURL].insert(currentURL); + } + + for (std::list::const_iterator it = links.begin(); it != links.end(); ++it) { + std::string targetURL = resolvePath(currentDir, *it); + if (!visited.count(targetURL)) { + crawl(targetURL, visited); + } + } +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string seedURL = argv[1]; + std::string inputFile = argv[2]; + std::set visited; + crawl(seedURL, visited); + + double totalLength = 0.0; + for (std::map::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) { + totalLength += it->second; + } + + std::ifstream inputStream(inputFile.c_str()); + if (!inputStream.is_open()) { + std::cerr << "Failed to open " << inputFile << std::endl; + return 1; + } + + std::vector queries; + std::string line; + while (std::getline(inputStream, line)) { + queries.push_back(line); + } + inputStream.close(); + + for (size_t i = 0; i < queries.size(); ++i) { + std::string query = queries[i]; + std::string outputFile = "out" + std::to_string(i + 1) + ".txt"; + std::ofstream outStream(outputFile.c_str()); + if (!outStream.is_open()) { + std::cerr << "Failed to open " << outputFile << std::endl; + continue; + } + + bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"'); + std::string phrase; + if (isPhraseSearch) { + phrase = query.substr(1, query.size() - 2); + query = phrase; + } + + std::vector words 
= split(query); + std::set candidates; + if (!words.empty()) { + std::string firstWord = words[0]; + if (invertedIndex.count(firstWord)) { + std::map docs = invertedIndex[firstWord]; + for (std::map::const_iterator it = docs.begin(); it != docs.end(); ++it) { + candidates.insert(it->first); + } + } + for (size_t j = 1; j < words.size(); ++j) { + std::string word = words[j]; + if (invertedIndex.count(word)) { + std::set temp; + std::map docs = invertedIndex[word]; + for (std::map::const_iterator it = docs.begin(); it != docs.end(); ++it) { + if (candidates.count(it->first)) { + temp.insert(it->first); + } + } + candidates = temp; + } else { + candidates.clear(); + break; + } + } + } + + if (isPhraseSearch) { + std::set filtered; + for (std::set::const_iterator it = candidates.begin(); it != candidates.end(); ++it) { + if (phraseExists(docContents[*it], phrase)) { + filtered.insert(*it); + } + } + candidates = filtered; + } + + if (candidates.empty()) { + outStream << "Your search - " << queries[i] << " - did not match any documents." 
<< std::endl; + outStream.close(); + continue; + } + + std::vector > scores; + for (std::set::const_iterator it = candidates.begin(); it != candidates.end(); ++it) { + std::string doc = *it; + double densityScore = 0.0; + for (size_t j = 0; j < words.size(); ++j) { + std::string word = words[j]; + if (invertedIndex[word].count(doc)) { + int freq = invertedIndex[word][doc]; + double totalOccurrences = 0.0; + std::map docs = invertedIndex[word]; + for (std::map::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) { + totalOccurrences += docIt->second; + } + double keywordDensityAcrossAll = totalOccurrences / totalLength; + densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll); + } + } + + double backlinksScore = 0.0; + if (backlinks.count(doc)) { + std::set linkers = backlinks[doc]; + for (std::set::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) { + backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]); + } + } + + double pageScore = 0.5 * densityScore + 0.5 * backlinksScore; + scores.push_back(std::pair(doc, pageScore)); + } + + std::sort(scores.begin(), scores.end(), + [](const std::pair& a, const std::pair& b) { + return a.second > b.second; + }); + + outStream << "Matching documents: " << std::endl; + for (size_t j = 0; j < scores.size(); ++j) { + std::string doc = scores[j].first; + std::string content = docContents[doc]; + std::string title = getTitle(content); + std::string description = getDescription(content); + std::string body = getBody(content); + std::string snippet = generateSnippet(body, query); + + outStream << std::endl << "Title: " << title << std::endl; + outStream << "URL: " << doc << std::endl; + outStream << "Description: " << description << std::endl; + outStream << "Snippet: " << snippet << std::endl; + } + outStream.close(); + } + + return 0; +} \ No newline at end of file