solve hw 7

This commit is contained in:
JamesFlare1212
2025-03-27 23:45:56 -04:00
parent 48d8b8fcfd
commit ff290ace15
5 changed files with 478 additions and 6 deletions

View File

@@ -1,7 +1,7 @@
HOMEWORK 7: Search Engine
NAME: < insert name >
NAME: Jinshan Zhou
COLLABORATORS AND OTHER RESOURCES:
@@ -10,13 +10,13 @@ List the names of everyone you talked to about this assignment
LMS, etc.), and all of the resources (books, online reference
material, etc.) you consulted in completing this assignment.
< insert collaborators / resources >
Some examples about recursively listing files under a path.
Remember: Your implementation for this assignment must be done on your
own, as described in "Academic Integrity for Homework" handout.
ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: < insert # hours >
ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: 12hr
MISC. COMMENTS TO GRADER:
@@ -33,5 +33,7 @@ What parts of the assignment did you find challenging? Is there anything that
finally "clicked" for you in the process of working on this assignment? How well
did the development and testing process go for you?
< insert reflection >
The program made a lot of mistakes at the beginning and I didn't know why. After
some deep research, I found it was caused by a missing "/" when joining paths.
I fixed it and everything works fine. The snippet also tricked me a bit, but it
isn't that hard (since it is easy to debug).

View File

@@ -0,0 +1,49 @@
html_files
├── file1.html
├── file2.html
├── index.html
└── subdir1
├── file3.html
└── subdir2
├── file4.html
├── file5.html
├── file6.html
├── file7.html
└── subdir3
├── file10.html
├── file8.html
├── file9.html
├── subdir4
│   ├── file11.html
│   ├── file12.html
│   └── file13.html
└── subdir5
├── file14.html
├── file15.html
├── file16.html
└── subdir6
└── subdir7
├── file17.html
├── file18.html
├── file19.html
└── subdir8
├── file20.html
├── file21.html
├── file22.html
├── file23.html
└── subdir9
├── file24.html
├── file25.html
└── subdir10
├── file26.html
├── file27.html
├── file28.html
└── subdir11
├── file29.html
├── file30.html
└── subdir12
├── file31.html
└── subdir13
└── file32.html
14 directories, 33 files

405
hws/search_engine/main.cpp Normal file
View File

@@ -0,0 +1,405 @@
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <vector>
#include <list>
#include <regex>
#include <algorithm>
#include <cctype>
#include <sstream>
// Global data structures shared between crawl() and the query loop in main().
// word -> (document URL -> number of occurrences of that word in the document)
std::map<std::string, std::map<std::string, int> > invertedIndex;
// document URL -> total character count of the file (used for keyword density)
std::map<std::string, int> docLengths;
// document URL -> number of <a href> links it contains (used for backlink score)
std::map<std::string, int> outgoingLinksCount;
// document URL -> set of documents that link TO it
std::map<std::string, std::set<std::string> > backlinks;
// document URL -> full raw file contents (for phrase search and snippets)
std::map<std::string, std::string> docContents;
// Provided function to extract links from HTML
// Collect the href target of every <a ... href="..."> tag in the document,
// in order of appearance.
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
    static const std::regex anchorPattern("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
    std::list<std::string> result;
    std::sregex_iterator it(fileContent.begin(), fileContent.end(), anchorPattern);
    std::sregex_iterator end;
    for (; it != end; ++it) {
        // Capture group 1 is the quoted href value.
        result.push_back((*it)[1].str());
    }
    return result;
}
// Resolve relative path based on current directory
// Resolve a (possibly relative) link against the current directory.
// Handles leading "../" (pop one directory component) and "./" (no-op)
// recursively; the base case joins the two parts with exactly one '/'.
std::string resolvePath(const std::string& currentDir, const std::string& link) {
    if (link.rfind("../", 0) == 0) {
        // Strip one directory component from currentDir, then recurse on the rest.
        std::string parent = currentDir;
        if (!parent.empty() && parent.back() == '/') {
            parent.pop_back();
        }
        size_t slash = parent.find_last_of('/');
        if (slash != std::string::npos) {
            parent = parent.substr(0, slash);
        }
        return resolvePath(parent, link.substr(3));
    }
    if (link.rfind("./", 0) == 0) {
        // "./" refers to the current directory; just drop it.
        return resolvePath(currentDir, link.substr(2));
    }
    if (link.empty()) {
        return currentDir;
    }
    // Join with exactly one slash between the two pieces.
    std::string base = currentDir;
    std::string rest = link;
    if (!base.empty() && base.back() == '/') {
        base.pop_back();
    }
    if (!rest.empty() && rest.front() == '/') {
        rest.erase(0, 1);
    }
    return base + "/" + rest;
}
// Get directory from URL
// Return the directory portion of a URL, including the trailing '/'.
// A URL with no '/' at all yields the empty string.
std::string getDirectory(const std::string& url) {
    size_t lastSlash = url.find_last_of('/');
    return (lastSlash == std::string::npos) ? "" : url.substr(0, lastSlash + 1);
}
// Extract word frequencies with word boundaries
// Count each maximal run of alphanumeric characters in `content` as one word.
// Returns a map from word (case-sensitive, exact bytes) to occurrence count.
//
// Fix: std::isalnum has undefined behavior for negative char values (any
// non-ASCII byte on platforms where char is signed), so every character is
// cast to unsigned char first. The original's inner boundary re-checks were
// redundant — after skipping a run, `i` always lands on a word start — so the
// scan is simplified to: skip non-alnum bytes, otherwise consume a whole run.
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
    std::map<std::string, int> freq;
    size_t i = 0;
    while (i < content.size()) {
        if (std::isalnum(static_cast<unsigned char>(content[i]))) {
            // Consume the whole alphanumeric run [i, j).
            size_t j = i;
            while (j < content.size() && std::isalnum(static_cast<unsigned char>(content[j]))) {
                j++;
            }
            freq[content.substr(i, j - i)]++;
            i = j;
        } else {
            i++;
        }
    }
    return freq;
}
// Check if phrase exists with word boundaries
// Return true iff `phrase` occurs in `content` delimited by non-alphanumeric
// characters (or the string edges) on both sides — i.e. as whole words, not
// as a substring of a longer word.
//
// Fix: cast to unsigned char before std::isalnum — passing a negative char
// (any non-ASCII byte with signed char) is undefined behavior.
bool phraseExists(const std::string& content, const std::string& phrase) {
    size_t pos = 0;
    while ((pos = content.find(phrase, pos)) != std::string::npos) {
        bool before = (pos == 0 ||
                       !std::isalnum(static_cast<unsigned char>(content[pos - 1])));
        bool after = (pos + phrase.size() == content.size() ||
                      !std::isalnum(static_cast<unsigned char>(content[pos + phrase.size()])));
        if (before && after) {
            return true;
        }
        pos += 1;  // keep scanning past this (partial-word) hit
    }
    return false;
}
// Helper function to find a whole word with word boundaries
// Return the position of the first whole-word occurrence of `word` in `str`
// (bounded by non-alphanumeric characters or string edges), or npos.
//
// Fix: cast to unsigned char before std::isalnum — passing a negative char
// (any non-ASCII byte with signed char) is undefined behavior.
size_t findWholeWord(const std::string& str, const std::string& word) {
    size_t pos = 0;
    while ((pos = str.find(word, pos)) != std::string::npos) {
        bool before = (pos == 0 ||
                       !std::isalnum(static_cast<unsigned char>(str[pos - 1])));
        bool after = (pos + word.size() == str.size() ||
                      !std::isalnum(static_cast<unsigned char>(str[pos + word.size()])));
        if (before && after) {
            return pos;
        }
        pos += 1;  // keep scanning past this (partial-word) hit
    }
    return std::string::npos;
}
// Extract title from <head>
// Return the text between <title> and </title>, or "" if either tag is absent.
std::string getTitle(const std::string& content) {
    const std::string openTag = "<title>";
    size_t begin = content.find(openTag);
    if (begin == std::string::npos) return "";
    begin += openTag.size();
    size_t close = content.find("</title>", begin);
    if (close == std::string::npos) return "";
    return content.substr(begin, close - begin);
}
// Extract description from <meta>
// Return the content attribute of <meta name="description" content="...">,
// or "" if the tag (in exactly this attribute order/quoting) is absent.
//
// Fix: the skip distance is derived from the marker string itself instead of
// the magic constant 34, so the two can never drift out of sync.
std::string getDescription(const std::string& content) {
    static const std::string marker = "<meta name=\"description\" content=\"";
    size_t pos = content.find(marker);
    if (pos == std::string::npos) return "";
    pos += marker.size();
    size_t end = content.find('"', pos);
    if (end == std::string::npos) return "";
    return content.substr(pos, end - pos);
}
// Extract <body> content
// Return the text between <body> and </body>, or "" if either tag is absent.
std::string getBody(const std::string& content) {
    const std::string openTag = "<body>";
    size_t begin = content.find(openTag);
    if (begin == std::string::npos) return "";
    begin += openTag.size();
    size_t close = content.find("</body>", begin);
    if (close == std::string::npos) return "";
    return content.substr(begin, close - begin);
}
// Find sentence start position
// Return the start of the sentence containing position `pos`: the first
// non-whitespace character after the nearest '.' at or before `pos`,
// or 0 when there is no earlier period.
//
// Fixes: cast to unsigned char before std::isspace (UB for negative char
// values), and search for the single character '.' rather than a one-char
// C string.
size_t findSentenceStart(const std::string& body, size_t pos) {
    size_t periodPos = body.rfind('.', pos);
    if (periodPos == std::string::npos) {
        return 0;
    }
    size_t start = periodPos + 1;
    while (start < body.size() && std::isspace(static_cast<unsigned char>(body[start]))) {
        start++;
    }
    return start;
}
// Generate 120-character snippet
// Build a snippet of at most 120 characters for the result listing.
// Preference order: sentence containing the whole query as a whole word,
// then sentence containing any individual query word, then the start of body.
std::string generateSnippet(const std::string& body, const std::string& query) {
    // substr's count is clamped to the remaining length, so this yields at
    // most 120 characters starting at the enclosing sentence.
    auto clip = [&body](size_t at) {
        return body.substr(findSentenceStart(body, at), 120);
    };
    size_t hit = findWholeWord(body, query);
    if (hit != std::string::npos) {
        return clip(hit);
    }
    // Fall back to the first query word that appears anywhere in the body.
    std::istringstream tokens(query);
    std::string token;
    while (tokens >> token) {
        hit = findWholeWord(body, token);
        if (hit != std::string::npos) {
            return clip(hit);
        }
    }
    // Nothing matched: show the opening of the body (clamped to its length).
    return body.substr(0, 120);
}
// Split string by whitespace
// Tokenize `s` on runs of whitespace, preserving token order.
std::vector<std::string> split(const std::string& s) {
    std::vector<std::string> tokens;
    std::istringstream stream(s);
    for (std::string tok; stream >> tok; ) {
        tokens.push_back(tok);
    }
    return tokens;
}
// Recursive crawl function
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
if (visited.count(currentURL)) return;
visited.insert(currentURL);
std::ifstream fileStream(currentURL.c_str());
if (!fileStream.is_open()) {
std::cerr << "Failed to open " << currentURL << std::endl;
return;
}
std::string content((std::istreambuf_iterator<char>(fileStream)), std::istreambuf_iterator<char>());
fileStream.close();
docContents[currentURL] = content;
docLengths[currentURL] = content.length();
std::map<std::string, int> freq = extractWordFrequencies(content);
for (std::map<std::string, int>::const_iterator it = freq.begin(); it != freq.end(); ++it) {
invertedIndex[it->first][currentURL] = it->second;
}
std::list<std::string> links = extractLinksFromHTML(content);
outgoingLinksCount[currentURL] = links.size();
std::string currentDir = getDirectory(currentURL);
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
std::string targetURL = resolvePath(currentDir, *it);
backlinks[targetURL].insert(currentURL);
}
for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
std::string targetURL = resolvePath(currentDir, *it);
if (!visited.count(targetURL)) {
crawl(targetURL, visited);
}
}
}
// Entry point. Usage: <seedURL> <inputFile>.
// Crawls the link graph starting at seedURL, reads one query per line from
// inputFile, and writes the ranked results of query i to out<i+1>.txt.
int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
        return 1;
    }
    std::string seedURL = argv[1];
    std::string inputFile = argv[2];
    // Build all global index structures by crawling from the seed.
    std::set<std::string> visited;
    crawl(seedURL, visited);
    // Total character count across all crawled documents (denominator of the
    // corpus-wide keyword density below).
    double totalLength = 0.0;
    for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
        totalLength += it->second;
    }
    // Read the query list, one query per line.
    std::ifstream inputStream(inputFile.c_str());
    if (!inputStream.is_open()) {
        std::cerr << "Failed to open " << inputFile << std::endl;
        return 1;
    }
    std::vector<std::string> queries;
    std::string line;
    while (std::getline(inputStream, line)) {
        queries.push_back(line);
    }
    inputStream.close();
    for (size_t i = 0; i < queries.size(); ++i) {
        std::string query = queries[i];
        // Results of query i go to out<i+1>.txt.
        std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
        std::ofstream outStream(outputFile.c_str());
        if (!outStream.is_open()) {
            std::cerr << "Failed to open " << outputFile << std::endl;
            continue;
        }
        // A query wrapped in double quotes is an exact-phrase search.
        bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
        std::string phrase;
        if (isPhraseSearch) {
            // Strip the surrounding quotes; the words are still ANDed below.
            phrase = query.substr(1, query.size() - 2);
            query = phrase;
        }
        std::vector<std::string> words = split(query);
        // Candidate documents: those containing ALL query words
        // (intersection of the inverted-index entries, seeded by word 0).
        std::set<std::string> candidates;
        if (!words.empty()) {
            std::string firstWord = words[0];
            if (invertedIndex.count(firstWord)) {
                std::map<std::string, int> docs = invertedIndex[firstWord];
                for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                    candidates.insert(it->first);
                }
            }
            for (size_t j = 1; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex.count(word)) {
                    // Keep only candidates that also contain this word.
                    std::set<std::string> temp;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                        if (candidates.count(it->first)) {
                            temp.insert(it->first);
                        }
                    }
                    candidates = temp;
                } else {
                    // A word absent from every document empties the intersection.
                    candidates.clear();
                    break;
                }
            }
        }
        // Phrase search additionally requires the exact phrase (with word
        // boundaries) somewhere in the raw document text.
        if (isPhraseSearch) {
            std::set<std::string> filtered;
            for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
                if (phraseExists(docContents[*it], phrase)) {
                    filtered.insert(*it);
                }
            }
            candidates = filtered;
        }
        if (candidates.empty()) {
            outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
            outStream.close();
            continue;
        }
        // Score each candidate: 0.5 * keyword-density score + 0.5 * backlink score.
        std::vector<std::pair<std::string, double> > scores;
        for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
            std::string doc = *it;
            double densityScore = 0.0;
            for (size_t j = 0; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex[word].count(doc)) {
                    int freq = invertedIndex[word][doc];
                    // Occurrences of this word across the whole corpus.
                    double totalOccurrences = 0.0;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
                        totalOccurrences += docIt->second;
                    }
                    double keywordDensityAcrossAll = totalOccurrences / totalLength;
                    // In-document frequency normalized by document length and
                    // by the corpus-wide density of the same word.
                    densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
                }
            }
            // Each page linking here contributes 1/(1 + its outgoing link count).
            double backlinksScore = 0.0;
            if (backlinks.count(doc)) {
                std::set<std::string> linkers = backlinks[doc];
                for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
                    backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
                }
            }
            double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
            scores.push_back(std::pair<std::string, double>(doc, pageScore));
        }
        // Rank highest score first.
        std::sort(scores.begin(), scores.end(),
            [](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
                return a.second > b.second;
            });
        // Emit title / URL / description / snippet for every match, best first.
        outStream << "Matching documents: " << std::endl;
        for (size_t j = 0; j < scores.size(); ++j) {
            std::string doc = scores[j].first;
            std::string content = docContents[doc];
            std::string title = getTitle(content);
            std::string description = getDescription(content);
            std::string body = getBody(content);
            std::string snippet = generateSnippet(body, query);
            outStream << std::endl << "Title: " << title << std::endl;
            outStream << "URL: " << doc << std::endl;
            outStream << "Description: " << description << std::endl;
            outStream << "Snippet: " << snippet << std::endl;
        }
        outStream.close();
    }
    return 0;
}