// Simple search engine: crawls local HTML files starting from a seed URL,
// builds an inverted index with backlink information, and answers queries
// ranked by keyword density and backlink authority.
#include <iostream>
|
|
#include <fstream>
|
|
#include <string>
|
|
#include <map>
|
|
#include <set>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <regex>
|
|
#include <algorithm>
|
|
#include <cctype>
|
|
#include <sstream>
|
|
|
|
// Global data structures
|
|
std::map<std::string, std::map<std::string, int> > invertedIndex;
|
|
std::map<std::string, int> docLengths;
|
|
std::map<std::string, int> outgoingLinksCount;
|
|
std::map<std::string, std::set<std::string> > backlinks;
|
|
std::map<std::string, std::string> docContents;
|
|
|
|
// Provided function to extract links from HTML
|
|
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
|
|
std::list<std::string> links;
|
|
std::regex linkRegex("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
|
|
std::smatch match;
|
|
std::string::const_iterator start = fileContent.begin();
|
|
while (std::regex_search(start, fileContent.end(), match, linkRegex)) {
|
|
if (match.size() > 1) {
|
|
links.push_back(match[1].str());
|
|
}
|
|
start = match.suffix().first;
|
|
}
|
|
return links;
|
|
}
|
|
|
|
// Resolve relative path based on current directory
|
|
std::string resolvePath(const std::string& currentDir, const std::string& link) {
|
|
if (link.find("../") == 0) {
|
|
std::string dir = currentDir;
|
|
if (!dir.empty() && dir.back() == '/') {
|
|
dir.pop_back(); // Remove trailing slash if present
|
|
}
|
|
size_t pos = dir.find_last_of('/');
|
|
if (pos != std::string::npos) {
|
|
dir = dir.substr(0, pos);
|
|
}
|
|
std::string remaining = link.substr(3);
|
|
return resolvePath(dir, remaining);
|
|
} else if (link.find("./") == 0) {
|
|
std::string remaining = link.substr(2);
|
|
return resolvePath(currentDir, remaining);
|
|
} else if (link.empty()) {
|
|
return currentDir; // Handle empty links if applicable
|
|
} else {
|
|
// Ensure exactly one slash between currentDir and link
|
|
std::string dir = currentDir;
|
|
std::string lnk = link;
|
|
if (!dir.empty() && dir.back() == '/') {
|
|
dir.pop_back(); // Remove trailing slash from currentDir
|
|
}
|
|
if (!lnk.empty() && lnk.front() == '/') {
|
|
lnk.erase(0, 1); // Remove leading slash from link
|
|
}
|
|
return dir + "/" + lnk;
|
|
}
|
|
}
|
|
|
|
// Get directory from URL
|
|
std::string getDirectory(const std::string& url) {
|
|
size_t pos = url.find_last_of('/');
|
|
if (pos != std::string::npos) {
|
|
return url.substr(0, pos + 1);
|
|
}
|
|
return "";
|
|
}
|
|
|
|
// Extract word frequencies with word boundaries
|
|
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
|
|
std::map<std::string, int> freq;
|
|
size_t i = 0;
|
|
while (i < content.size()) {
|
|
if (std::isalnum(content[i])) {
|
|
if (i == 0 || !std::isalnum(content[i - 1])) {
|
|
size_t j = i;
|
|
while (j < content.size() && std::isalnum(content[j])) {
|
|
j++;
|
|
}
|
|
if (j == content.size() || !std::isalnum(content[j])) {
|
|
std::string word = content.substr(i, j - i);
|
|
freq[word]++;
|
|
i = j;
|
|
} else {
|
|
i = j;
|
|
}
|
|
} else {
|
|
i++;
|
|
}
|
|
} else {
|
|
i++;
|
|
}
|
|
}
|
|
return freq;
|
|
}
|
|
|
|
// Check if phrase exists with word boundaries
|
|
bool phraseExists(const std::string& content, const std::string& phrase) {
|
|
size_t pos = 0;
|
|
while ((pos = content.find(phrase, pos)) != std::string::npos) {
|
|
bool before = (pos == 0 || !std::isalnum(content[pos - 1]));
|
|
bool after = (pos + phrase.size() == content.size() || !std::isalnum(content[pos + phrase.size()]));
|
|
if (before && after) {
|
|
return true;
|
|
}
|
|
pos += 1;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Helper function to find a whole word with word boundaries
|
|
size_t findWholeWord(const std::string& str, const std::string& word) {
|
|
size_t pos = 0;
|
|
while ((pos = str.find(word, pos)) != std::string::npos) {
|
|
bool before = (pos == 0 || !std::isalnum(str[pos - 1]));
|
|
bool after = (pos + word.size() == str.size() || !std::isalnum(str[pos + word.size()]));
|
|
if (before && after) {
|
|
return pos;
|
|
}
|
|
pos += 1;
|
|
}
|
|
return std::string::npos;
|
|
}
|
|
|
|
// Extract title from <head>
|
|
std::string getTitle(const std::string& content) {
|
|
size_t start = content.find("<title>");
|
|
if (start == std::string::npos) return "";
|
|
start += 7;
|
|
size_t end = content.find("</title>", start);
|
|
if (end == std::string::npos) return "";
|
|
return content.substr(start, end - start);
|
|
}
|
|
|
|
// Extract description from <meta>
|
|
std::string getDescription(const std::string& content) {
|
|
size_t pos = content.find("<meta name=\"description\" content=\"");
|
|
if (pos == std::string::npos) return "";
|
|
pos += 34;
|
|
size_t end = content.find("\"", pos);
|
|
if (end == std::string::npos) return "";
|
|
return content.substr(pos, end - pos);
|
|
}
|
|
|
|
// Extract <body> content
|
|
std::string getBody(const std::string& content) {
|
|
size_t start = content.find("<body>");
|
|
if (start == std::string::npos) return "";
|
|
start += 6;
|
|
size_t end = content.find("</body>", start);
|
|
if (end == std::string::npos) return "";
|
|
return content.substr(start, end - start);
|
|
}
|
|
|
|
// Find sentence start position
|
|
size_t findSentenceStart(const std::string& body, size_t pos) {
|
|
size_t periodPos = body.rfind(".", pos);
|
|
if (periodPos == std::string::npos) {
|
|
return 0;
|
|
}
|
|
size_t start = periodPos + 1;
|
|
while (start < body.size() && std::isspace(body[start])) {
|
|
start++;
|
|
}
|
|
return start;
|
|
}
|
|
|
|
// Generate 120-character snippet
|
|
std::string generateSnippet(const std::string& body, const std::string& query) {
|
|
// Try to find the exact query with word boundaries first
|
|
size_t pos = findWholeWord(body, query);
|
|
if (pos != std::string::npos) {
|
|
size_t start = findSentenceStart(body, pos);
|
|
if (start + 120 <= body.size()) {
|
|
return body.substr(start, 120);
|
|
}
|
|
return body.substr(start);
|
|
} else {
|
|
// If not found, try individual words
|
|
std::vector<std::string> words;
|
|
std::istringstream iss(query);
|
|
std::string word;
|
|
while (iss >> word) {
|
|
words.push_back(word);
|
|
}
|
|
if (!words.empty()) {
|
|
for (const std::string& w : words) {
|
|
pos = findWholeWord(body, w);
|
|
if (pos != std::string::npos) {
|
|
size_t start = findSentenceStart(body, pos);
|
|
if (start + 120 <= body.size()) {
|
|
return body.substr(start, 120);
|
|
}
|
|
return body.substr(start);
|
|
}
|
|
}
|
|
}
|
|
|
|
// If nothing found, return beginning of body
|
|
if (body.size() <= 120) {
|
|
return body;
|
|
}
|
|
return body.substr(0, 120);
|
|
}
|
|
}
|
|
|
|
// Split string by whitespace
|
|
std::vector<std::string> split(const std::string& s) {
|
|
std::vector<std::string> words;
|
|
std::istringstream iss(s);
|
|
std::string word;
|
|
while (iss >> word) {
|
|
words.push_back(word);
|
|
}
|
|
return words;
|
|
}
|
|
|
|
// Recursive crawl function
|
|
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
|
|
if (visited.count(currentURL)) return;
|
|
visited.insert(currentURL);
|
|
|
|
std::ifstream fileStream(currentURL.c_str());
|
|
if (!fileStream.is_open()) {
|
|
std::cerr << "Failed to open " << currentURL << std::endl;
|
|
return;
|
|
}
|
|
std::string content((std::istreambuf_iterator<char>(fileStream)), std::istreambuf_iterator<char>());
|
|
fileStream.close();
|
|
|
|
docContents[currentURL] = content;
|
|
docLengths[currentURL] = content.length();
|
|
|
|
std::map<std::string, int> freq = extractWordFrequencies(content);
|
|
for (std::map<std::string, int>::const_iterator it = freq.begin(); it != freq.end(); ++it) {
|
|
invertedIndex[it->first][currentURL] = it->second;
|
|
}
|
|
|
|
std::list<std::string> links = extractLinksFromHTML(content);
|
|
outgoingLinksCount[currentURL] = links.size();
|
|
std::string currentDir = getDirectory(currentURL);
|
|
|
|
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
|
|
std::string targetURL = resolvePath(currentDir, *it);
|
|
backlinks[targetURL].insert(currentURL);
|
|
}
|
|
|
|
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
|
|
std::string targetURL = resolvePath(currentDir, *it);
|
|
if (!visited.count(targetURL)) {
|
|
crawl(targetURL, visited);
|
|
}
|
|
}
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
if (argc != 3) {
|
|
std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
std::string seedURL = argv[1];
|
|
std::string inputFile = argv[2];
|
|
std::set<std::string> visited;
|
|
crawl(seedURL, visited);
|
|
|
|
double totalLength = 0.0;
|
|
for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
|
|
totalLength += it->second;
|
|
}
|
|
|
|
std::ifstream inputStream(inputFile.c_str());
|
|
if (!inputStream.is_open()) {
|
|
std::cerr << "Failed to open " << inputFile << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
std::vector<std::string> queries;
|
|
std::string line;
|
|
while (std::getline(inputStream, line)) {
|
|
queries.push_back(line);
|
|
}
|
|
inputStream.close();
|
|
|
|
for (size_t i = 0; i < queries.size(); ++i) {
|
|
std::string query = queries[i];
|
|
std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
|
|
std::ofstream outStream(outputFile.c_str());
|
|
if (!outStream.is_open()) {
|
|
std::cerr << "Failed to open " << outputFile << std::endl;
|
|
continue;
|
|
}
|
|
|
|
bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
|
|
std::string phrase;
|
|
if (isPhraseSearch) {
|
|
phrase = query.substr(1, query.size() - 2);
|
|
query = phrase;
|
|
}
|
|
|
|
std::vector<std::string> words = split(query);
|
|
std::set<std::string> candidates;
|
|
if (!words.empty()) {
|
|
std::string firstWord = words[0];
|
|
if (invertedIndex.count(firstWord)) {
|
|
std::map<std::string, int> docs = invertedIndex[firstWord];
|
|
for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
|
|
candidates.insert(it->first);
|
|
}
|
|
}
|
|
for (size_t j = 1; j < words.size(); ++j) {
|
|
std::string word = words[j];
|
|
if (invertedIndex.count(word)) {
|
|
std::set<std::string> temp;
|
|
std::map<std::string, int> docs = invertedIndex[word];
|
|
for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
|
|
if (candidates.count(it->first)) {
|
|
temp.insert(it->first);
|
|
}
|
|
}
|
|
candidates = temp;
|
|
} else {
|
|
candidates.clear();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (isPhraseSearch) {
|
|
std::set<std::string> filtered;
|
|
for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
|
|
if (phraseExists(docContents[*it], phrase)) {
|
|
filtered.insert(*it);
|
|
}
|
|
}
|
|
candidates = filtered;
|
|
}
|
|
|
|
if (candidates.empty()) {
|
|
outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
|
|
outStream.close();
|
|
continue;
|
|
}
|
|
|
|
std::vector<std::pair<std::string, double> > scores;
|
|
for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
|
|
std::string doc = *it;
|
|
double densityScore = 0.0;
|
|
for (size_t j = 0; j < words.size(); ++j) {
|
|
std::string word = words[j];
|
|
if (invertedIndex[word].count(doc)) {
|
|
int freq = invertedIndex[word][doc];
|
|
double totalOccurrences = 0.0;
|
|
std::map<std::string, int> docs = invertedIndex[word];
|
|
for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
|
|
totalOccurrences += docIt->second;
|
|
}
|
|
double keywordDensityAcrossAll = totalOccurrences / totalLength;
|
|
densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
|
|
}
|
|
}
|
|
|
|
double backlinksScore = 0.0;
|
|
if (backlinks.count(doc)) {
|
|
std::set<std::string> linkers = backlinks[doc];
|
|
for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
|
|
backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
|
|
}
|
|
}
|
|
|
|
double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
|
|
scores.push_back(std::pair<std::string, double>(doc, pageScore));
|
|
}
|
|
|
|
std::sort(scores.begin(), scores.end(),
|
|
[](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
|
|
return a.second > b.second;
|
|
});
|
|
|
|
outStream << "Matching documents: " << std::endl;
|
|
for (size_t j = 0; j < scores.size(); ++j) {
|
|
std::string doc = scores[j].first;
|
|
std::string content = docContents[doc];
|
|
std::string title = getTitle(content);
|
|
std::string description = getDescription(content);
|
|
std::string body = getBody(content);
|
|
std::string snippet = generateSnippet(body, query);
|
|
|
|
outStream << std::endl << "Title: " << title << std::endl;
|
|
outStream << "URL: " << doc << std::endl;
|
|
outStream << "Description: " << description << std::endl;
|
|
outStream << "Snippet: " << snippet << std::endl;
|
|
}
|
|
outStream.close();
|
|
}
|
|
|
|
return 0;
|
|
} |