// Source: CSCI-1200/hws/search_engine/main.cpp
// Commit: JamesFlare1212 ff290ace15 "solve hw 7"
// Date:   2025-03-27 23:46:37 -04:00
// (405 lines, 14 KiB, C++)

#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <vector>
#include <list>
#include <regex>
#include <algorithm>
#include <cctype>
#include <sstream>
// Global data structures, filled by crawl() and read by the query loop in main().
// word -> (document URL -> number of occurrences of that word in the document)
std::map<std::string, std::map<std::string, int> > invertedIndex;
// document URL -> length of the raw file content, in characters
std::map<std::string, int> docLengths;
// document URL -> number of links extracted from that document
std::map<std::string, int> outgoingLinksCount;
// resolved link target -> set of document URLs that link to it
std::map<std::string, std::set<std::string> > backlinks;
// document URL -> full raw file content
std::map<std::string, std::string> docContents;
// Collect the href value of every <a ...> tag in fileContent, in document order.
// Only quoted values (single or double quotes) are recognised; duplicates are kept.
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
    const std::regex anchorPattern("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
    std::list<std::string> hrefs;
    const std::sregex_iterator finished;
    for (std::sregex_iterator hit(fileContent.begin(), fileContent.end(), anchorPattern);
         hit != finished; ++hit) {
        // Capture group 1 is the text between the quotes.
        hrefs.push_back((*hit)[1].str());
    }
    return hrefs;
}
// Resolve a link found in a document against the directory that document lives in.
//
// currentDir: directory of the referring document, with or without a trailing '/'.
//             "" means the working directory and "/" means the filesystem root.
// link:       href value. Leading "./" and "../" segments are interpreted; a leading
//             '/' is treated as relative to currentDir rather than as an absolute path.
// Returns the joined path with exactly one '/' between directory and link.
// An empty link yields the directory it was resolved against.
std::string resolvePath(const std::string& currentDir, const std::string& link) {
    if (link.compare(0, 3, "../") == 0) {
        if (currentDir.empty()) {
            // Already at the working directory: there is nothing to strip, so keep
            // the "../" prefix and let the filesystem interpret it.
            return link;
        }
        std::string dir = currentDir;
        if (dir.back() == '/') {
            dir.pop_back();  // Remove trailing slash if present
        }
        if (dir.empty()) {
            dir = "/";  // currentDir was the root; the parent of the root is the root
        } else {
            const size_t pos = dir.find_last_of('/');
            const std::string last = (pos == std::string::npos) ? dir : dir.substr(pos + 1);
            if (last == "..") {
                dir += "/..";  // cannot cancel an unresolved "..": stack another one
            } else if (pos == std::string::npos) {
                dir.clear();   // single relative component ("html"): parent is the working directory
            } else if (pos == 0) {
                dir = "/";     // "/a": parent is the root, which must stay distinguishable from ""
            } else {
                dir.erase(pos);
            }
        }
        return resolvePath(dir, link.substr(3));
    }
    if (link.compare(0, 2, "./") == 0) {
        return resolvePath(currentDir, link.substr(2));
    }
    if (link.empty()) {
        return currentDir;
    }
    if (currentDir.empty()) {
        // No directory to prepend; joining with "/" would wrongly anchor the
        // link at the filesystem root.
        return link;
    }
    // Ensure exactly one slash between currentDir and link.
    std::string dir = currentDir;
    if (dir.back() == '/') {
        dir.pop_back();
    }
    std::string lnk = link;
    if (lnk.front() == '/') {
        lnk.erase(0, 1);
    }
    return dir + "/" + lnk;
}
// Return the directory part of url, including its trailing '/'.
// A url that contains no '/' has no directory part and yields an empty string.
std::string getDirectory(const std::string& url) {
    const std::string::size_type lastSlash = url.rfind('/');
    return (lastSlash == std::string::npos) ? std::string()
                                            : url.substr(0, lastSlash + 1);
}
// Count how often each word occurs in content.
// A word is a maximal run of alphanumeric characters; matching is case-sensitive
// and every other character acts as a separator.
// Returns a map from word to its number of occurrences.
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
    // <cctype> classifiers require a value representable as unsigned char; passing
    // a negative plain char (any non-ASCII byte) is undefined behaviour.
    const auto isWordChar = [](char c) {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    };

    std::map<std::string, int> freq;
    const size_t n = content.size();
    size_t i = 0;
    while (i < n) {
        if (!isWordChar(content[i])) {
            ++i;
            continue;
        }
        // content[i] starts a word; extend j to one past its last character.
        size_t j = i;
        while (j < n && isWordChar(content[j])) {
            ++j;
        }
        ++freq[content.substr(i, j - i)];
        i = j;
    }
    return freq;
}
// Report whether phrase occurs in content as a whole-word match, i.e. the
// character before and the character after the occurrence (if any) are not
// alphanumeric. Matching is case-sensitive.
bool phraseExists(const std::string& content, const std::string& phrase) {
    // <cctype> classifiers require a value representable as unsigned char; passing
    // a negative plain char (any non-ASCII byte) is undefined behaviour.
    const auto isWordChar = [](char c) {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    };

    for (size_t pos = content.find(phrase); pos != std::string::npos;
         pos = content.find(phrase, pos + 1)) {
        const size_t end = pos + phrase.size();
        const bool before = (pos == 0) || !isWordChar(content[pos - 1]);
        const bool after = (end == content.size()) || !isWordChar(content[end]);
        if (before && after) {
            return true;
        }
        // Otherwise the hit is embedded in a longer word; keep searching.
    }
    return false;
}
// Find the first occurrence of word in str that is not embedded in a longer
// alphanumeric run (the characters directly before and after it, if any, are
// not alphanumeric). Matching is case-sensitive.
// Returns the index of that occurrence, or std::string::npos if there is none.
size_t findWholeWord(const std::string& str, const std::string& word) {
    // <cctype> classifiers require a value representable as unsigned char; passing
    // a negative plain char (any non-ASCII byte) is undefined behaviour.
    const auto isWordChar = [](char c) {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    };

    for (size_t pos = str.find(word); pos != std::string::npos;
         pos = str.find(word, pos + 1)) {
        const size_t end = pos + word.size();
        const bool before = (pos == 0) || !isWordChar(str[pos - 1]);
        const bool after = (end == str.size()) || !isWordChar(str[end]);
        if (before && after) {
            return pos;
        }
    }
    return std::string::npos;
}
// Return the text between the first "<title>" and the next "</title>".
// Yields an empty string when either tag is missing.
std::string getTitle(const std::string& content) {
    const std::string openTag("<title>");
    const std::string::size_type tagAt = content.find(openTag);
    if (tagAt != std::string::npos) {
        const std::string::size_type textBegin = tagAt + openTag.size();
        const std::string::size_type textEnd = content.find("</title>", textBegin);
        if (textEnd != std::string::npos) {
            return content.substr(textBegin, textEnd - textBegin);
        }
    }
    return std::string();
}
// Return the content attribute of the first
// <meta name="description" content="..."> tag, up to the closing double quote.
// Yields an empty string when the tag (in exactly that spelling) or the closing
// quote is missing.
std::string getDescription(const std::string& content) {
    const std::string marker("<meta name=\"description\" content=\"");
    const std::string::size_type markerAt = content.find(marker);
    if (markerAt != std::string::npos) {
        const std::string::size_type valueBegin = markerAt + marker.size();
        const std::string::size_type valueEnd = content.find("\"", valueBegin);
        if (valueEnd != std::string::npos) {
            return content.substr(valueBegin, valueEnd - valueBegin);
        }
    }
    return std::string();
}
// Return the text between the first "<body>" and the next "</body>".
// Yields an empty string when either tag is missing (a <body> tag that carries
// attributes is not recognised).
std::string getBody(const std::string& content) {
    const std::string openTag("<body>");
    const std::string::size_type tagAt = content.find(openTag);
    if (tagAt != std::string::npos) {
        const std::string::size_type textBegin = tagAt + openTag.size();
        const std::string::size_type textEnd = content.find("</body>", textBegin);
        if (textEnd != std::string::npos) {
            return content.substr(textBegin, textEnd - textBegin);
        }
    }
    return std::string();
}
// Locate the start of the sentence that contains position pos in body.
// The sentence is taken to begin after the last '.' at or before pos, skipping
// any whitespace that follows it; if there is no such '.', it begins at index 0.
// Returns an index in [0, body.size()].
size_t findSentenceStart(const std::string& body, size_t pos) {
    const size_t periodPos = body.rfind('.', pos);
    if (periodPos == std::string::npos) {
        return 0;
    }
    size_t start = periodPos + 1;
    // Cast before classifying: std::isspace on a negative plain char
    // (any non-ASCII byte) is undefined behaviour.
    while (start < body.size() &&
           std::isspace(static_cast<unsigned char>(body[start]))) {
        ++start;
    }
    return start;
}
// Build a snippet of at most 120 characters from body for the given query.
// The snippet starts at the beginning of the sentence containing the first
// whole-word occurrence of the full query; if the full query does not occur,
// the first query word (in query order) that does occur is used instead.
// When nothing matches, the first 120 characters of body are returned.
std::string generateSnippet(const std::string& body, const std::string& query) {
    const std::string::size_type kSnippetLength = 120;

    // Prefer the exact query text.
    size_t hit = findWholeWord(body, query);

    // Otherwise fall back to the individual whitespace-separated words.
    if (hit == std::string::npos) {
        std::istringstream tokens(query);
        for (std::string term; hit == std::string::npos && (tokens >> term); ) {
            hit = findWholeWord(body, term);
        }
    }

    // substr clamps the count to the characters available, so short tails
    // and short bodies need no separate length check.
    if (hit == std::string::npos) {
        return body.substr(0, kSnippetLength);
    }
    return body.substr(findSentenceStart(body, hit), kSnippetLength);
}
// Break s into its whitespace-separated tokens, in order.
// Runs of whitespace count as a single separator; no empty tokens are produced.
std::vector<std::string> split(const std::string& s) {
    std::vector<std::string> tokens;
    std::istringstream reader(s);
    for (std::string token; reader >> token; ) {
        tokens.push_back(token);
    }
    return tokens;
}
// Recursively crawl the document at currentURL and everything reachable from it.
// For each document that has not been visited yet this records its content and
// length, adds its words to the inverted index, counts its outgoing links and
// registers it as a backlink of every link target, then crawls those targets.
// visited holds every URL already attempted; a URL that cannot be opened is
// reported on stderr and stays marked as visited.
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
    // insert() reports through .second whether the URL was new.
    if (!visited.insert(currentURL).second) {
        return;
    }

    std::ifstream page(currentURL.c_str());
    if (!page.is_open()) {
        std::cerr << "Failed to open " << currentURL << std::endl;
        return;
    }
    const std::string content((std::istreambuf_iterator<char>(page)),
                              std::istreambuf_iterator<char>());
    page.close();

    docContents[currentURL] = content;
    docLengths[currentURL] = content.length();

    const std::map<std::string, int> counts = extractWordFrequencies(content);
    for (const std::pair<const std::string, int>& entry : counts) {
        invertedIndex[entry.first][currentURL] = entry.second;
    }

    const std::list<std::string> hrefs = extractLinksFromHTML(content);
    outgoingLinksCount[currentURL] = hrefs.size();

    // Resolve every link once and record all backlinks before descending.
    const std::string baseDir = getDirectory(currentURL);
    std::vector<std::string> targets;
    for (const std::string& href : hrefs) {
        targets.push_back(resolvePath(baseDir, href));
        backlinks[targets.back()].insert(currentURL);
    }

    for (const std::string& target : targets) {
        if (visited.count(target) == 0) {
            crawl(target, visited);
        }
    }
}
int main(int argc, char* argv[]) {
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
return 1;
}
std::string seedURL = argv[1];
std::string inputFile = argv[2];
std::set<std::string> visited;
crawl(seedURL, visited);
double totalLength = 0.0;
for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
totalLength += it->second;
}
std::ifstream inputStream(inputFile.c_str());
if (!inputStream.is_open()) {
std::cerr << "Failed to open " << inputFile << std::endl;
return 1;
}
std::vector<std::string> queries;
std::string line;
while (std::getline(inputStream, line)) {
queries.push_back(line);
}
inputStream.close();
for (size_t i = 0; i < queries.size(); ++i) {
std::string query = queries[i];
std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
std::ofstream outStream(outputFile.c_str());
if (!outStream.is_open()) {
std::cerr << "Failed to open " << outputFile << std::endl;
continue;
}
bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
std::string phrase;
if (isPhraseSearch) {
phrase = query.substr(1, query.size() - 2);
query = phrase;
}
std::vector<std::string> words = split(query);
std::set<std::string> candidates;
if (!words.empty()) {
std::string firstWord = words[0];
if (invertedIndex.count(firstWord)) {
std::map<std::string, int> docs = invertedIndex[firstWord];
for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
candidates.insert(it->first);
}
}
for (size_t j = 1; j < words.size(); ++j) {
std::string word = words[j];
if (invertedIndex.count(word)) {
std::set<std::string> temp;
std::map<std::string, int> docs = invertedIndex[word];
for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
if (candidates.count(it->first)) {
temp.insert(it->first);
}
}
candidates = temp;
} else {
candidates.clear();
break;
}
}
}
if (isPhraseSearch) {
std::set<std::string> filtered;
for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
if (phraseExists(docContents[*it], phrase)) {
filtered.insert(*it);
}
}
candidates = filtered;
}
if (candidates.empty()) {
outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
outStream.close();
continue;
}
std::vector<std::pair<std::string, double> > scores;
for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
std::string doc = *it;
double densityScore = 0.0;
for (size_t j = 0; j < words.size(); ++j) {
std::string word = words[j];
if (invertedIndex[word].count(doc)) {
int freq = invertedIndex[word][doc];
double totalOccurrences = 0.0;
std::map<std::string, int> docs = invertedIndex[word];
for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
totalOccurrences += docIt->second;
}
double keywordDensityAcrossAll = totalOccurrences / totalLength;
densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
}
}
double backlinksScore = 0.0;
if (backlinks.count(doc)) {
std::set<std::string> linkers = backlinks[doc];
for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
}
}
double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
scores.push_back(std::pair<std::string, double>(doc, pageScore));
}
std::sort(scores.begin(), scores.end(),
[](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
return a.second > b.second;
});
outStream << "Matching documents: " << std::endl;
for (size_t j = 0; j < scores.size(); ++j) {
std::string doc = scores[j].first;
std::string content = docContents[doc];
std::string title = getTitle(content);
std::string description = getDescription(content);
std::string body = getBody(content);
std::string snippet = generateSnippet(body, query);
outStream << std::endl << "Title: " << title << std::endl;
outStream << "URL: " << doc << std::endl;
outStream << "Description: " << description << std::endl;
outStream << "Snippet: " << snippet << std::endl;
}
outStream.close();
}
return 0;
}