// Simple search engine: crawls local HTML files starting from a seed URL,
// builds an inverted index with backlink information, and answers queries
// ranked by keyword density and backlink authority.
#include <iostream>
|
|
#include <fstream>
|
|
#include <string>
|
|
#include <map>
|
|
#include <set>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <regex>
|
|
#include <algorithm>
|
|
#include <cctype>
|
|
#include <sstream>
|
|
|
|
// Global data structures
|
|
std::map<std::string, std::map<std::string, int> > invertedIndex;
|
|
std::map<std::string, int> docLengths;
|
|
std::map<std::string, int> outgoingLinksCount;
|
|
std::map<std::string, std::set<std::string> > backlinks;
|
|
std::map<std::string, std::string> docContents;
|
|
|
|
// Provided function to extract links from HTML
|
|
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
|
|
std::list<std::string> links;
|
|
std::regex linkRegex("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
|
|
std::smatch match;
|
|
std::string::const_iterator start = fileContent.begin();
|
|
while (std::regex_search(start, fileContent.end(), match, linkRegex)) {
|
|
if (match.size() > 1) {
|
|
links.push_back(match[1].str());
|
|
}
|
|
start = match.suffix().first;
|
|
}
|
|
return links;
|
|
}
|
|
|
|
// Resolve relative path based on current directory
|
|
std::string resolvePath(const std::string& currentDir, const std::string& link) {
|
|
if (link.find("../") == 0) {
|
|
std::string dir = currentDir;
|
|
if (!dir.empty() && dir.back() == '/') {
|
|
dir.pop_back(); // Remove trailing slash if present
|
|
}
|
|
size_t pos = dir.find_last_of('/');
|
|
if (pos != std::string::npos) {
|
|
dir = dir.substr(0, pos);
|
|
}
|
|
std::string remaining = link.substr(3);
|
|
return resolvePath(dir, remaining);
|
|
} else if (link.find("./") == 0) {
|
|
std::string remaining = link.substr(2);
|
|
return resolvePath(currentDir, remaining);
|
|
} else if (link.empty()) {
|
|
return currentDir; // Handle empty links if applicable
|
|
} else {
|
|
// Ensure exactly one slash between currentDir and link
|
|
std::string dir = currentDir;
|
|
std::string lnk = link;
|
|
if (!dir.empty() && dir.back() == '/') {
|
|
dir.pop_back(); // Remove trailing slash from currentDir
|
|
}
|
|
if (!lnk.empty() && lnk.front() == '/') {
|
|
lnk.erase(0, 1); // Remove leading slash from link
|
|
}
|
|
return dir + "/" + lnk;
|
|
}
|
|
}
|
|
|
|
// Get directory from URL
|
|
std::string getDirectory(const std::string& url) {
|
|
size_t pos = url.find_last_of('/');
|
|
if (pos != std::string::npos) {
|
|
return url.substr(0, pos + 1);
|
|
}
|
|
return "";
|
|
}
|
|
|
|
// Extract word frequencies with word boundaries
|
|
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
|
|
std::map<std::string, int> freq;
|
|
size_t i = 0;
|
|
while (i < content.size()) {
|
|
if (std::isalnum(content[i])) {
|
|
if (i == 0 || !std::isalnum(content[i - 1])) {
|
|
size_t j = i;
|
|
while (j < content.size() && std::isalnum(content[j])) {
|
|
j++;
|
|
}
|
|
if (j == content.size() || !std::isalnum(content[j])) {
|
|
std::string word = content.substr(i, j - i);
|
|
freq[word]++;
|
|
i = j;
|
|
} else {
|
|
i = j;
|
|
}
|
|
} else {
|
|
i++;
|
|
}
|
|
} else {
|
|
i++;
|
|
}
|
|
}
|
|
return freq;
|
|
}
|
|
|
|
// Check if phrase exists with word boundaries
|
|
bool phraseExists(const std::string& content, const std::string& phrase) {
|
|
size_t pos = 0;
|
|
while ((pos = content.find(phrase, pos)) != std::string::npos) {
|
|
bool before = (pos == 0 || !std::isalnum(content[pos - 1]));
|
|
bool after = (pos + phrase.size() == content.size() || !std::isalnum(content[pos + phrase.size()]));
|
|
if (before && after) {
|
|
return true;
|
|
}
|
|
pos += 1;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Helper function to find a whole word with word boundaries
|
|
size_t findWholeWord(const std::string& str, const std::string& word) {
|
|
size_t pos = 0;
|
|
while ((pos = str.find(word, pos)) != std::string::npos) {
|
|
bool before = (pos == 0 || !std::isalnum(str[pos - 1]));
|
|
bool after = (pos + word.size() == str.size() || !std::isalnum(str[pos + word.size()]));
|
|
if (before && after) {
|
|
return pos;
|
|
}
|
|
pos += 1;
|
|
}
|
|
return std::string::npos;
|
|
}
|
|
|
|
// Extract title from <head>
|
|
std::string getTitle(const std::string& content) {
|
|
size_t start = content.find("<title>");
|
|
if (start == std::string::npos) return "";
|
|
start += 7;
|
|
size_t end = content.find("</title>", start);
|
|
if (end == std::string::npos) return "";
|
|
return content.substr(start, end - start);
|
|
}
|
|
|
|
// Extract description from <meta>
|
|
std::string getDescription(const std::string& content) {
|
|
size_t pos = content.find("<meta name=\"description\" content=\"");
|
|
if (pos == std::string::npos) return "";
|
|
pos += 34;
|
|
size_t end = content.find("\"", pos);
|
|
if (end == std::string::npos) return "";
|
|
return content.substr(pos, end - pos);
|
|
}
|
|
|
|
// Extract <body> content
|
|
std::string getBody(const std::string& content) {
|
|
size_t start = content.find("<body>");
|
|
if (start == std::string::npos) return "";
|
|
start += 6;
|
|
size_t end = content.find("</body>", start);
|
|
if (end == std::string::npos) return "";
|
|
return content.substr(start, end - start);
|
|
}
|
|
|
|
// Find sentence start position
|
|
size_t findSentenceStart(const std::string& body, size_t pos) {
|
|
size_t periodPos = body.rfind(".", pos);
|
|
if (periodPos == std::string::npos) {
|
|
return 0;
|
|
}
|
|
size_t start = periodPos + 1;
|
|
while (start < body.size() && std::isspace(body[start])) {
|
|
start++;
|
|
}
|
|
return start;
|
|
}
|
|
|
|
// Generate 120-character snippet
|
|
std::string generateSnippet(const std::string& body, const std::string& query) {
|
|
// Try to find the exact query with word boundaries first
|
|
size_t pos = findWholeWord(body, query);
|
|
if (pos != std::string::npos) {
|
|
size_t start = findSentenceStart(body, pos);
|
|
if (start + 120 <= body.size()) {
|
|
return body.substr(start, 120);
|
|
}
|
|
return body.substr(start);
|
|
} else {
|
|
// If not found, try individual words
|
|
std::vector<std::string> words;
|
|
std::istringstream iss(query);
|
|
std::string word;
|
|
while (iss >> word) {
|
|
words.push_back(word);
|
|
}
|
|
if (!words.empty()) {
|
|
for (const std::string& w : words) {
|
|
pos = findWholeWord(body, w);
|
|
if (pos != std::string::npos) {
|
|
size_t start = findSentenceStart(body, pos);
|
|
if (start + 120 <= body.size()) {
|
|
return body.substr(start, 120);
|
|
}
|
|
return body.substr(start);
|
|
}
|
|
}
|
|
}
|
|
|
|
// If nothing found, return beginning of body
|
|
if (body.size() <= 120) {
|
|
return body;
|
|
}
|
|
return body.substr(0, 120);
|
|
}
|
|
}
|
|
|
|
// Split string by whitespace
|
|
std::vector<std::string> split(const std::string& s) {
|
|
std::vector<std::string> words;
|
|
std::istringstream iss(s);
|
|
std::string word;
|
|
while (iss >> word) {
|
|
words.push_back(word);
|
|
}
|
|
return words;
|
|
}
|
|
|
|
// Recursive crawl function
|
|
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
|
|
if (visited.count(currentURL)) return;
|
|
visited.insert(currentURL);
|
|
|
|
std::ifstream fileStream(currentURL.c_str());
|
|
if (!fileStream.is_open()) {
|
|
std::cerr << "Failed to open " << currentURL << std::endl;
|
|
return;
|
|
}
|
|
std::string content((std::istreambuf_iterator<char>(fileStream)), std::istreambuf_iterator<char>());
|
|
fileStream.close();
|
|
|
|
docContents[currentURL] = content;
|
|
docLengths[currentURL] = content.length();
|
|
|
|
std::map<std::string, int> freq = extractWordFrequencies(content);
|
|
for (std::map<std::string, int>::const_iterator it = freq.begin(); it != freq.end(); ++it) {
|
|
invertedIndex[it->first][currentURL] = it->second;
|
|
}
|
|
|
|
std::list<std::string> links = extractLinksFromHTML(content);
|
|
outgoingLinksCount[currentURL] = links.size();
|
|
std::string currentDir = getDirectory(currentURL);
|
|
|
|
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
|
|
std::string targetURL = resolvePath(currentDir, *it);
|
|
backlinks[targetURL].insert(currentURL);
|
|
}
|
|
|
|
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
|
|
std::string targetURL = resolvePath(currentDir, *it);
|
|
if (!visited.count(targetURL)) {
|
|
crawl(targetURL, visited);
|
|
}
|
|
}
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
if (argc != 3) {
|
|
std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
std::string seedURL = argv[1];
|
|
std::string inputFile = argv[2];
|
|
std::set<std::string> visited;
|
|
crawl(seedURL, visited);
|
|
|
|
double totalLength = 0.0;
|
|
for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
|
|
totalLength += it->second;
|
|
}
|
|
|
|
std::ifstream inputStream(inputFile.c_str());
|
|
if (!inputStream.is_open()) {
|
|
std::cerr << "Failed to open " << inputFile << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
std::vector<std::string> queries;
|
|
std::string line;
|
|
while (std::getline(inputStream, line)) {
|
|
queries.push_back(line);
|
|
}
|
|
inputStream.close();
|
|
|
|
for (size_t i = 0; i < queries.size(); ++i) {
|
|
std::string query = queries[i];
|
|
std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
|
|
std::ofstream outStream(outputFile.c_str());
|
|
if (!outStream.is_open()) {
|
|
std::cerr << "Failed to open " << outputFile << std::endl;
|
|
continue;
|
|
}
|
|
|
|
bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
|
|
std::string phrase;
|
|
if (isPhraseSearch) {
|
|
phrase = query.substr(1, query.size() - 2);
|
|
query = phrase;
|
|
}
|
|
|
|
std::vector<std::string> words = split(query);
|
|
std::set<std::string> candidates;
|
|
if (!words.empty()) {
|
|
std::string firstWord = words[0];
|
|
if (invertedIndex.count(firstWord)) {
|
|
std::map<std::string, int> docs = invertedIndex[firstWord];
|
|
for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
|
|
candidates.insert(it->first);
|
|
}
|
|
}
|
|
for (size_t j = 1; j < words.size(); ++j) {
|
|
std::string word = words[j];
|
|
if (invertedIndex.count(word)) {
|
|
std::set<std::string> temp;
|
|
std::map<std::string, int> docs = invertedIndex[word];
|
|
for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
|
|
if (candidates.count(it->first)) {
|
|
temp.insert(it->first);
|
|
}
|
|
}
|
|
candidates = temp;
|
|
} else {
|
|
candidates.clear();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (isPhraseSearch) {
|
|
std::set<std::string> filtered;
|
|
for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
|
|
if (phraseExists(docContents[*it], phrase)) {
|
|
filtered.insert(*it);
|
|
}
|
|
}
|
|
candidates = filtered;
|
|
}
|
|
|
|
if (candidates.empty()) {
|
|
outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
|
|
outStream.close();
|
|
continue;
|
|
}
|
|
|
|
std::vector<std::pair<std::string, double> > scores;
|
|
for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
|
|
std::string doc = *it;
|
|
double densityScore = 0.0;
|
|
for (size_t j = 0; j < words.size(); ++j) {
|
|
std::string word = words[j];
|
|
if (invertedIndex[word].count(doc)) {
|
|
int freq = invertedIndex[word][doc];
|
|
double totalOccurrences = 0.0;
|
|
std::map<std::string, int> docs = invertedIndex[word];
|
|
for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
|
|
totalOccurrences += docIt->second;
|
|
}
|
|
double keywordDensityAcrossAll = totalOccurrences / totalLength;
|
|
densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
|
|
}
|
|
}
|
|
|
|
double backlinksScore = 0.0;
|
|
if (backlinks.count(doc)) {
|
|
std::set<std::string> linkers = backlinks[doc];
|
|
for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
|
|
backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
|
|
}
|
|
}
|
|
|
|
double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
|
|
scores.push_back(std::pair<std::string, double>(doc, pageScore));
|
|
}
|
|
|
|
std::sort(scores.begin(), scores.end(),
|
|
[](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
|
|
return a.second > b.second;
|
|
});
|
|
|
|
outStream << "Matching documents: " << std::endl;
|
|
for (size_t j = 0; j < scores.size(); ++j) {
|
|
std::string doc = scores[j].first;
|
|
std::string content = docContents[doc];
|
|
std::string title = getTitle(content);
|
|
std::string description = getDescription(content);
|
|
std::string body = getBody(content);
|
|
std::string snippet = generateSnippet(body, query);
|
|
|
|
outStream << std::endl << "Title: " << title << std::endl;
|
|
outStream << "URL: " << doc << std::endl;
|
|
outStream << "Description: " << description << std::endl;
|
|
outStream << "Snippet: " << snippet << std::endl;
|
|
}
|
|
outStream.close();
|
|
}
|
|
|
|
return 0;
|
|
} |