#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Global data structures, filled by crawl() and read while answering queries.
// word -> (document path -> number of occurrences of the word in that document)
std::map<std::string, std::map<std::string, int> > invertedIndex;
// document path -> length of the raw file content, in characters
std::map<std::string, size_t> docLengths;
// document path -> number of links found in the document (duplicates included)
std::map<std::string, size_t> outgoingLinksCount;
// document path -> set of documents that link to it
std::map<std::string, std::set<std::string> > backlinks;
// document path -> raw file content
std::map<std::string, std::string> docContents;

// True when `c` is a letter or digit. The cast matters: passing a negative
// plain `char` (any byte >= 0x80 where char is signed) to std::isalnum is
// undefined behaviour.
static bool isWordChar(char c) {
    return std::isalnum(static_cast<unsigned char>(c)) != 0;
}

// Provided function to extract links from HTML.
// Returns the value of every href attribute found in an anchor tag, in
// document order.
// NOTE(review): the opening "<a\s+[^>" of the pattern was missing from the
// text under review and has been restored to the usual anchor-tag pattern -
// confirm against the provided original.
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
    // Compiled once: std::regex construction is expensive and this runs per document.
    static const std::regex linkRegex("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");

    std::list<std::string> links;
    std::smatch match;
    std::string::const_iterator start = fileContent.begin();
    while (std::regex_search(start, fileContent.end(), match, linkRegex)) {
        if (match.size() > 1) {
            links.push_back(match[1].str());
        }
        start = match.suffix().first;  // continue after the tag just matched
    }
    return links;
}

// Resolve relative path based on current directory.
//
// currentDir: directory of the referring document; either empty (the working
//             directory) or a path, with or without a trailing '/'.
// link:       href value; leading "./" and "../" segments are interpreted,
//             a leading '/' is dropped (the link is treated as relative).
// Returns the joined path. An empty link yields currentDir unchanged.
//
// An empty currentDir yields a relative result ("a.html", not "/a.html"),
// and "../" removes the last directory component even when it is the only
// one ("html_files/" + "../a.html" -> "a.html").
std::string resolvePath(const std::string& currentDir, const std::string& link) {
    if (link.empty()) {
        return currentDir;  // Handle empty links if applicable
    }
    if (link.compare(0, 2, "./") == 0) {
        return resolvePath(currentDir, link.substr(2));
    }
    if (link.compare(0, 3, "../") == 0) {
        std::string dir = currentDir;
        if (dir.size() > 1 && dir.back() == '/') {
            dir.pop_back();  // Remove trailing slash, but keep a lone "/" (root)
        }
        if (dir != "/") {  // the parent of the root is the root itself
            const size_t slash = dir.find_last_of('/');
            const std::string last =
                (slash == std::string::npos) ? dir : dir.substr(slash + 1);
            if (last.empty() || last == "." || last == "..") {
                // Nothing that can be stripped lexically: keep the step up explicit.
                dir += dir.empty() ? ".." : "/..";
            } else if (slash == std::string::npos) {
                dir.clear();  // single component: its parent is the working directory
            } else if (slash == 0) {
                dir = "/";
            } else {
                dir.erase(slash);
            }
        }
        return resolvePath(dir, link.substr(3));
    }

    // Ensure exactly one slash between currentDir and link
    std::string dir = currentDir;
    std::string lnk = link;
    if (!dir.empty() && dir.back() == '/') {
        dir.pop_back();
    }
    if (lnk.front() == '/') {
        lnk.erase(0, 1);
    }
    if (currentDir.empty()) {
        return lnk;  // no directory part: do not invent a leading '/'
    }
    return dir + "/" + lnk;
}

// Get directory from URL: everything up to and including the last '/',
// or an empty string when the URL contains no '/'.
std::string getDirectory(const std::string& url) {
    const size_t pos = url.find_last_of('/');
    if (pos != std::string::npos) {
        return url.substr(0, pos + 1);
    }
    return "";
}

// Extract word frequencies with word boundaries.
// A word is a maximal run of alphanumeric characters; matching is
// case-sensitive and runs over the raw content (markup included).
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
    std::map<std::string, int> freq;
    size_t i = 0;
    while (i < content.size()) {
        if (!isWordChar(content[i])) {
            ++i;
            continue;
        }
        size_t j = i;
        while (j < content.size() && isWordChar(content[j])) {
            ++j;
        }
        ++freq[content.substr(i, j - i)];
        i = j;  // j is at a non-word character or the end, so no word is split
    }
    return freq;
}

// Helper function to find a whole word with word boundaries.
// Returns the position of the first occurrence of `word` in `str` that is
// neither preceded nor followed by an alphanumeric character, or
// std::string::npos when there is none (or `word` is empty).
size_t findWholeWord(const std::string& str, const std::string& word) {
    if (word.empty()) {
        return std::string::npos;
    }
    size_t pos = 0;
    while ((pos = str.find(word, pos)) != std::string::npos) {
        const size_t after = pos + word.size();
        const bool boundaryBefore = (pos == 0 || !isWordChar(str[pos - 1]));
        const bool boundaryAfter = (after == str.size() || !isWordChar(str[after]));
        if (boundaryBefore && boundaryAfter) {
            return pos;
        }
        ++pos;  // overlapping occurrences are allowed
    }
    return std::string::npos;
}

// Check if phrase exists with word boundaries.
bool phraseExists(const std::string& content, const std::string& phrase) {
    return findWholeWord(content, phrase) != std::string::npos;
}

// Text between the first `open` and the next `close` after it, or an empty
// string when either marker is missing.
static std::string extractBetween(const std::string& content,
                                  const std::string& open,
                                  const std::string& close) {
    size_t start = content.find(open);
    if (start == std::string::npos) return "";
    start += open.size();
    const size_t end = content.find(close, start);
    if (end == std::string::npos) return "";
    return content.substr(start, end - start);
}

// Extract title from <title>
std::string getTitle(const std::string& content) {
    return extractBetween(content, "<title>", "</title>");
}

// Extract description from <meta>
// NOTE(review): the body of this function was lost in the text under review.
// This implementation assumes the tag has the form
//   <meta name="description" content="...">
// and returns the content attribute value - confirm against the page format.
std::string getDescription(const std::string& content) {
    const std::string metaOpen = "<meta name=\"description\"";
    const std::string attribute = "content=\"";

    const size_t tagStart = content.find(metaOpen);
    if (tagStart == std::string::npos) return "";

    size_t valueStart = content.find(attribute, tagStart + metaOpen.size());
    if (valueStart == std::string::npos) return "";
    // The attribute must belong to this tag, not to a later one.
    const size_t tagEnd = content.find('>', tagStart);
    if (tagEnd != std::string::npos && valueStart > tagEnd) return "";

    valueStart += attribute.size();
    const size_t valueEnd = content.find('"', valueStart);
    if (valueEnd == std::string::npos) return "";
    return content.substr(valueStart, valueEnd - valueStart);
}

// Extract <body> content
std::string getBody(const std::string& content) {
    return extractBetween(content, "<body>", "</body>");
}

// Find sentence start position: the first non-whitespace character after the
// last '.' located at or before `pos`, or 0 when there is no such period.
size_t findSentenceStart(const std::string& body, size_t pos) {
    const size_t periodPos = body.rfind('.', pos);
    if (periodPos == std::string::npos) {
        return 0;
    }
    size_t start = periodPos + 1;
    while (start < body.size() &&
           std::isspace(static_cast<unsigned char>(body[start]))) {
        ++start;
    }
    return start;
}

// Generate 120-character snippet.
// Looks for the whole query first, then for each whitespace-separated query
// word in order; the snippet starts at the beginning of the sentence holding
// the first hit. With no hit at all it starts at the beginning of the body.
std::string generateSnippet(const std::string& body, const std::string& query) {
    const size_t snippetLength = 120;

    size_t pos = findWholeWord(body, query);
    if (pos == std::string::npos) {
        std::istringstream iss(query);
        std::string word;
        while (pos == std::string::npos && iss >> word) {
            pos = findWholeWord(body, word);
        }
    }

    const size_t start = (pos == std::string::npos) ? 0 : findSentenceStart(body, pos);
    return body.substr(start, snippetLength);  // substr clamps at the end of body
}

// Split string by whitespace
std::vector<std::string> split(const std::string& s) {
    std::vector<std::string> words;
    std::istringstream iss(s);
    std::string word;
    while (iss >> word) {
        words.push_back(word);
    }
    return words;
}

// Recursive crawl function.
// Reads the file at currentURL, records its content, length, word counts and
// link count in the global tables, registers it as a backlink of every target
// it links to, then crawls each target. `visited` prevents a path from being
// processed twice; a file that cannot be opened is reported on stderr and
// skipped (it still counts as visited).
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
    if (!visited.insert(currentURL).second) return;  // already seen

    std::ifstream fileStream(currentURL.c_str());
    if (!fileStream.is_open()) {
        std::cerr << "Failed to open " << currentURL << std::endl;
        return;
    }
    const std::string content((std::istreambuf_iterator<char>(fileStream)),
                              std::istreambuf_iterator<char>());
    fileStream.close();  // release the handle before recursing

    docContents[currentURL] = content;
    docLengths[currentURL] = content.length();

    const std::map<std::string, int> freq = extractWordFrequencies(content);
    for (const auto& wordCount : freq) {
        invertedIndex[wordCount.first][currentURL] = wordCount.second;
    }

    const std::list<std::string> links = extractLinksFromHTML(content);
    outgoingLinksCount[currentURL] = links.size();

    // Resolve every link once; record all backlinks before descending.
    const std::string currentDir = getDirectory(currentURL);
    std::vector<std::string> targets;
    targets.reserve(links.size());
    for (const std::string& link : links) {
        targets.push_back(resolvePath(currentDir, link));
        backlinks[targets.back()].insert(currentURL);
    }
    for (const std::string& target : targets) {
        crawl(target, visited);
    }
}

// Documents that contain every word in `words` according to invertedIndex.
// Empty when `words` is empty or any word is not indexed.
static std::set<std::string> findCandidates(const std::vector<std::string>& words) {
    std::set<std::string> candidates;
    for (size_t j = 0; j < words.size(); ++j) {
        const auto entry = invertedIndex.find(words[j]);
        if (entry == invertedIndex.end()) {
            return std::set<std::string>();
        }
        const std::map<std::string, int>& postings = entry->second;
        if (j == 0) {
            for (const auto& docCount : postings) {
                candidates.insert(docCount.first);
            }
        } else {
            std::set<std::string> kept;
            for (const std::string& doc : candidates) {
                if (postings.count(doc)) {
                    kept.insert(doc);
                }
            }
            candidates.swap(kept);
        }
        if (candidates.empty()) break;  // intersection cannot grow again
    }
    return candidates;
}

// Score each candidate and return (document, score) pairs, best first.
// score = 0.5 * densityScore + 0.5 * backlinksScore, where
//   densityScore   = sum over query words of
//                    occurrences in doc / (doc length * corpus density of word)
//   corpus density = occurrences of the word in all docs / totalLength
//   backlinksScore = sum over linking docs of 1 / (1 + their outgoing links)
// Documents with equal scores keep the order of `candidates`.
static std::vector<std::pair<std::string, double> > rankDocuments(
        const std::set<std::string>& candidates,
        const std::vector<std::string>& words,
        double totalLength) {
    // Per-word data does not depend on the document, so compute it once.
    std::vector<const std::map<std::string, int>*> postings(words.size(), nullptr);
    std::vector<double> corpusDensity(words.size(), 0.0);
    for (size_t j = 0; j < words.size(); ++j) {
        const auto entry = invertedIndex.find(words[j]);
        if (entry == invertedIndex.end()) continue;
        postings[j] = &entry->second;
        double totalOccurrences = 0.0;
        for (const auto& docCount : entry->second) {
            totalOccurrences += docCount.second;
        }
        corpusDensity[j] = totalOccurrences / totalLength;
    }

    std::vector<std::pair<std::string, double> > scores;
    scores.reserve(candidates.size());
    for (const std::string& doc : candidates) {
        const auto lengthIt = docLengths.find(doc);
        const double docLength =
            (lengthIt == docLengths.end()) ? 0.0 : static_cast<double>(lengthIt->second);

        double densityScore = 0.0;
        for (size_t j = 0; j < words.size(); ++j) {
            if (postings[j] == nullptr) continue;
            const auto hit = postings[j]->find(doc);
            if (hit == postings[j]->end()) continue;
            densityScore += hit->second / (docLength * corpusDensity[j]);
        }

        double backlinksScore = 0.0;
        const auto linkers = backlinks.find(doc);
        if (linkers != backlinks.end()) {
            for (const std::string& linker : linkers->second) {
                const auto out = outgoingLinksCount.find(linker);
                const size_t outgoing = (out == outgoingLinksCount.end()) ? 0 : out->second;
                backlinksScore += 1.0 / (1 + outgoing);
            }
        }

        scores.push_back(std::make_pair(doc, 0.5 * densityScore + 0.5 * backlinksScore));
    }

    // stable_sort: ties stay in path order instead of an unspecified order.
    std::stable_sort(scores.begin(), scores.end(),
                     [](const std::pair<std::string, double>& a,
                        const std::pair<std::string, double>& b) {
                         return a.second > b.second;
                     });
    return scores;
}

// Write the result list for one query: a header line, then for every ranked
// document a blank line followed by its title, path, description and snippet.
static void writeMatches(std::ostream& out,
                         const std::vector<std::pair<std::string, double> >& ranked,
                         const std::string& query) {
    static const std::string noContent;

    out << "Matching documents: " << '\n';
    for (const auto& scored : ranked) {
        const std::string& doc = scored.first;
        const auto stored = docContents.find(doc);
        const std::string& content = (stored == docContents.end()) ? noContent : stored->second;

        out << '\n' << "Title: " << getTitle(content) << '\n';
        out << "URL: " << doc << '\n';
        out << "Description: " << getDescription(content) << '\n';
        out << "Snippet: " << generateSnippet(getBody(content), query) << '\n';
    }
}

// Entry point.
// argv[1]: path of the seed document to crawl from.
// argv[2]: text file with one query per line; a line wrapped in double quotes
//          is a phrase search, any other line requires every word to occur.
// Results for the N-th query are written to "outN.txt" in the working
// directory. Returns 1 on bad usage or when the query file cannot be opened.
int main(int argc, char* argv[]) {
    if (argc != 3) {
        // NOTE(review): placeholder names were missing from the message under
        // review; these are descriptive stand-ins.
        std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
        return 1;
    }
    const std::string seedURL = argv[1];
    const std::string inputFile = argv[2];

    std::set<std::string> visited;
    crawl(seedURL, visited);

    double totalLength = 0.0;
    for (const auto& docLength : docLengths) {
        totalLength += docLength.second;
    }

    std::ifstream inputStream(inputFile.c_str());
    if (!inputStream.is_open()) {
        std::cerr << "Failed to open " << inputFile << std::endl;
        return 1;
    }
    std::vector<std::string> queries;
    std::string line;
    while (std::getline(inputStream, line)) {
        // A CRLF file leaves '\r' at the end of the line, which would defeat
        // the closing-quote check below and never match an indexed word.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        queries.push_back(line);
    }
    inputStream.close();

    for (size_t i = 0; i < queries.size(); ++i) {
        const std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
        std::ofstream outStream(outputFile.c_str());
        if (!outStream.is_open()) {
            std::cerr << "Failed to open " << outputFile << std::endl;
            continue;
        }

        std::string query = queries[i];
        const bool isPhraseSearch =
            (query.size() >= 2 && query.front() == '"' && query.back() == '"');
        if (isPhraseSearch) {
            query = query.substr(1, query.size() - 2);  // drop the quotes
        }

        const std::vector<std::string> words = split(query);
        std::set<std::string> candidates = findCandidates(words);

        if (isPhraseSearch) {
            // Keep only documents that contain the words as one contiguous phrase.
            for (auto it = candidates.begin(); it != candidates.end();) {
                const auto stored = docContents.find(*it);
                if (stored != docContents.end() && phraseExists(stored->second, query)) {
                    ++it;
                } else {
                    it = candidates.erase(it);
                }
            }
        }

        if (candidates.empty()) {
            outStream << "Your search - " << queries[i]
                      << " - did not match any documents." << '\n';
            continue;
        }

        writeMatches(outStream, rankDocuments(candidates, words, totalLength), query);
    }

    return 0;
}