solve hw 7
.vscode/launch.json (vendored): 12 changes
@@ -106,18 +106,30 @@
             "preLaunchTask": "C/C++: g++ build active file"
         },
         {
+<<<<<<< HEAD
             "name": "lab07",
+=======
+            "name": "nysearch",
+>>>>>>> 6b2a5ae (solve hw 7)
             "type": "cppdbg",
             "request": "launch",
             "program": "${fileDirname}/${fileBasenameNoExtension}",
             "args": [
+<<<<<<< HEAD
+=======
+                "html_files/index.html",
+>>>>>>> 6b2a5ae (solve hw 7)
                 "input.txt"
             ],
             "cwd": "${fileDirname}",
             "environment": [],
             "MIMode": "gdb",
             "miDebuggerPath": "/usr/bin/gdb",
+<<<<<<< HEAD
             "preLaunchTask": "C/C++: g++ build single active file"
+=======
+            "preLaunchTask": "C/C++: g++ build active file"
+>>>>>>> 6b2a5ae (solve hw 7)
         }
     ]
 }
.vscode/settings.json (vendored): 6 changes
@@ -73,6 +73,10 @@
     "queue": "cpp",
     "stack": "cpp",
     "set": "cpp",
-    "climits": "cpp"
+    "climits": "cpp",
+    "map": "cpp",
+    "unordered_set": "cpp",
+    "regex": "cpp",
+    "cinttypes": "cpp"
   }
 }
@@ -1,7 +1,7 @@
 HOMEWORK 7: Search Engine
 
 
-NAME: < insert name >
+NAME: Jinshan Zhou
 
 
 COLLABORATORS AND OTHER RESOURCES:
@@ -10,13 +10,13 @@ List the names of everyone you talked to about this assignment
 LMS, etc.), and all of the resources (books, online reference
 material, etc.) you consulted in completing this assignment.
 
-< insert collaborators / resources >
+some examples about recursively listing files under a path
 
 Remember: Your implementation for this assignment must be done on your
 own, as described in "Academic Integrity for Homework" handout.
 
 
-ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: < insert # hours >
+ESTIMATE OF # OF HOURS SPENT ON THIS ASSIGNMENT: 12hr
 
 
 MISC. COMMENTS TO GRADER:
@@ -33,5 +33,7 @@ What parts of the assignment did you find challenging? Is there anything that
 finally "clicked" for you in the process of working on this assignment? How well
 did the development and testing process go for you?
 
-< insert reflection >
+The program made a lot of mistakes at the beginning and I didn't know why. After
+some digging, I found it was caused by a missing "/" when joining paths. Once
+I fixed that, everything worked fine. The snippet logic also tricked me a bit,
+but it wasn't that hard (since it was easy to debug).
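
Editorial aside on the missing-"/" bug described in the reflection: the fix is to guarantee exactly one separator when a directory is concatenated with a relative link. A minimal sketch of that idea (joinPath is an illustrative name, not from the source; the actual handling lives in resolvePath in main.cpp below):

#include <string>

// Join a directory and a relative path with exactly one '/' between them.
// (Hypothetical helper; main.cpp folds this logic into resolvePath.)
std::string joinPath(std::string dir, std::string rel) {
    while (!dir.empty() && dir.back() == '/') dir.pop_back();   // drop trailing slashes
    while (!rel.empty() && rel.front() == '/') rel.erase(0, 1); // drop leading slashes
    return dir.empty() ? rel : dir + "/" + rel;
}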
hws/search_engine/dir_tree.txt (new file): 49 lines
@@ -0,0 +1,49 @@
html_files
├── file1.html
├── file2.html
├── index.html
└── subdir1
    ├── file3.html
    └── subdir2
        ├── file4.html
        ├── file5.html
        ├── file6.html
        ├── file7.html
        └── subdir3
            ├── file10.html
            ├── file8.html
            ├── file9.html
            ├── subdir4
            │   ├── file11.html
            │   ├── file12.html
            │   └── file13.html
            └── subdir5
                ├── file14.html
                ├── file15.html
                ├── file16.html
                └── subdir6
                    └── subdir7
                        ├── file17.html
                        ├── file18.html
                        ├── file19.html
                        └── subdir8
                            ├── file20.html
                            ├── file21.html
                            ├── file22.html
                            ├── file23.html
                            └── subdir9
                                ├── file24.html
                                ├── file25.html
                                └── subdir10
                                    ├── file26.html
                                    ├── file27.html
                                    ├── file28.html
                                    └── subdir11
                                        ├── file29.html
                                        ├── file30.html
                                        └── subdir12
                                            ├── file31.html
                                            └── subdir13
                                                └── file32.html

14 directories, 33 files
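
For the "recursively listing files under a path" idea cited as a resource in the README, a minimal C++17 sketch (std::filesystem is an assumption here and is not used by main.cpp, which discovers pages by following hyperlinks instead; compile with -std=c++17):

#include <filesystem>
#include <iostream>

// Print every regular file under the given root, recursively.
void listFiles(const std::filesystem::path& root) {
    for (const auto& entry : std::filesystem::recursive_directory_iterator(root)) {
        if (entry.is_regular_file()) {
            std::cout << entry.path().string() << "\n";
        }
    }
}

int main() {
    listFiles("html_files"); // e.g. the directory shown in dir_tree.txt above
}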
hws/search_engine/main.cpp (new file): 405 lines
@@ -0,0 +1,405 @@
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <set>
#include <vector>
#include <list>
#include <regex>
#include <algorithm>
#include <cctype>
#include <sstream>

// Global data structures
std::map<std::string, std::map<std::string, int> > invertedIndex;
std::map<std::string, int> docLengths;
std::map<std::string, int> outgoingLinksCount;
std::map<std::string, std::set<std::string> > backlinks;
std::map<std::string, std::string> docContents;

// Provided function to extract links from HTML
std::list<std::string> extractLinksFromHTML(const std::string& fileContent) {
    std::list<std::string> links;
    std::regex linkRegex("<a\\s+[^>]*href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");
    std::smatch match;
    std::string::const_iterator start = fileContent.begin();
    while (std::regex_search(start, fileContent.end(), match, linkRegex)) {
        if (match.size() > 1) {
            links.push_back(match[1].str());
        }
        start = match.suffix().first;
    }
    return links;
}

// Resolve relative path based on current directory
std::string resolvePath(const std::string& currentDir, const std::string& link) {
    if (link.find("../") == 0) {
        std::string dir = currentDir;
        if (!dir.empty() && dir.back() == '/') {
            dir.pop_back(); // Remove trailing slash if present
        }
        size_t pos = dir.find_last_of('/');
        if (pos != std::string::npos) {
            dir = dir.substr(0, pos);
        }
        std::string remaining = link.substr(3);
        return resolvePath(dir, remaining);
    } else if (link.find("./") == 0) {
        std::string remaining = link.substr(2);
        return resolvePath(currentDir, remaining);
    } else if (link.empty()) {
        return currentDir; // Handle empty links if applicable
    } else {
        // Ensure exactly one slash between currentDir and link
        std::string dir = currentDir;
        std::string lnk = link;
        if (!dir.empty() && dir.back() == '/') {
            dir.pop_back(); // Remove trailing slash from currentDir
        }
        if (!lnk.empty() && lnk.front() == '/') {
            lnk.erase(0, 1); // Remove leading slash from link
        }
        return dir + "/" + lnk;
    }
}

// Get directory from URL
std::string getDirectory(const std::string& url) {
    size_t pos = url.find_last_of('/');
    if (pos != std::string::npos) {
        return url.substr(0, pos + 1);
    }
    return "";
}

// Extract word frequencies with word boundaries
// (casts to unsigned char keep std::isalnum well-defined for non-ASCII bytes)
std::map<std::string, int> extractWordFrequencies(const std::string& content) {
    std::map<std::string, int> freq;
    size_t i = 0;
    while (i < content.size()) {
        if (std::isalnum(static_cast<unsigned char>(content[i]))) {
            if (i == 0 || !std::isalnum(static_cast<unsigned char>(content[i - 1]))) {
                size_t j = i;
                while (j < content.size() && std::isalnum(static_cast<unsigned char>(content[j]))) {
                    j++;
                }
                if (j == content.size() || !std::isalnum(static_cast<unsigned char>(content[j]))) {
                    std::string word = content.substr(i, j - i);
                    freq[word]++;
                    i = j;
                } else {
                    i = j;
                }
            } else {
                i++;
            }
        } else {
            i++;
        }
    }
    return freq;
}
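
// Example (editorial sketch): extractWordFrequencies("cat, cat dog!") yields
// {"cat": 2, "dog": 1} -- a token is a maximal run of alphanumeric characters,
// and the boundary checks keep substrings like "at" inside "cat" from counting.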

// Check if phrase exists with word boundaries
bool phraseExists(const std::string& content, const std::string& phrase) {
    size_t pos = 0;
    while ((pos = content.find(phrase, pos)) != std::string::npos) {
        bool before = (pos == 0 || !std::isalnum(static_cast<unsigned char>(content[pos - 1])));
        bool after = (pos + phrase.size() == content.size() ||
                      !std::isalnum(static_cast<unsigned char>(content[pos + phrase.size()])));
        if (before && after) {
            return true;
        }
        pos += 1;
    }
    return false;
}

// Helper function to find a whole word with word boundaries
size_t findWholeWord(const std::string& str, const std::string& word) {
    size_t pos = 0;
    while ((pos = str.find(word, pos)) != std::string::npos) {
        bool before = (pos == 0 || !std::isalnum(static_cast<unsigned char>(str[pos - 1])));
        bool after = (pos + word.size() == str.size() ||
                      !std::isalnum(static_cast<unsigned char>(str[pos + word.size()])));
        if (before && after) {
            return pos;
        }
        pos += 1;
    }
    return std::string::npos;
}

// Extract title from <head>
std::string getTitle(const std::string& content) {
    size_t start = content.find("<title>");
    if (start == std::string::npos) return "";
    start += 7; // length of "<title>"
    size_t end = content.find("</title>", start);
    if (end == std::string::npos) return "";
    return content.substr(start, end - start);
}

// Extract description from <meta>
std::string getDescription(const std::string& content) {
    size_t pos = content.find("<meta name=\"description\" content=\"");
    if (pos == std::string::npos) return "";
    pos += 34; // length of the marker string above
    size_t end = content.find("\"", pos);
    if (end == std::string::npos) return "";
    return content.substr(pos, end - pos);
}

// Extract <body> content
std::string getBody(const std::string& content) {
    size_t start = content.find("<body>");
    if (start == std::string::npos) return "";
    start += 6; // length of "<body>"
    size_t end = content.find("</body>", start);
    if (end == std::string::npos) return "";
    return content.substr(start, end - start);
}

// Find sentence start position
size_t findSentenceStart(const std::string& body, size_t pos) {
    size_t periodPos = body.rfind(".", pos);
    if (periodPos == std::string::npos) {
        return 0;
    }
    size_t start = periodPos + 1;
    while (start < body.size() && std::isspace(static_cast<unsigned char>(body[start]))) {
        start++;
    }
    return start;
}

// Generate 120-character snippet
std::string generateSnippet(const std::string& body, const std::string& query) {
    // Try to find the exact query with word boundaries first
    size_t pos = findWholeWord(body, query);
    if (pos != std::string::npos) {
        size_t start = findSentenceStart(body, pos);
        if (start + 120 <= body.size()) {
            return body.substr(start, 120);
        }
        return body.substr(start);
    } else {
        // If not found, try individual words
        std::vector<std::string> words;
        std::istringstream iss(query);
        std::string word;
        while (iss >> word) {
            words.push_back(word);
        }
        if (!words.empty()) {
            for (const std::string& w : words) {
                pos = findWholeWord(body, w);
                if (pos != std::string::npos) {
                    size_t start = findSentenceStart(body, pos);
                    if (start + 120 <= body.size()) {
                        return body.substr(start, 120);
                    }
                    return body.substr(start);
                }
            }
        }

        // If nothing found, return beginning of body
        if (body.size() <= 120) {
            return body;
        }
        return body.substr(0, 120);
    }
}
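
// Editorial note: generateSnippet starts at the beginning of the sentence
// containing the first whole-word match (whole query first, then each query
// word in turn) and emits at most 120 characters, falling back to the start
// of the body when nothing matches.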

// Split string by whitespace
std::vector<std::string> split(const std::string& s) {
    std::vector<std::string> words;
    std::istringstream iss(s);
    std::string word;
    while (iss >> word) {
        words.push_back(word);
    }
    return words;
}

// Recursive crawl function
void crawl(const std::string& currentURL, std::set<std::string>& visited) {
    if (visited.count(currentURL)) return;
    visited.insert(currentURL);

    std::ifstream fileStream(currentURL.c_str());
    if (!fileStream.is_open()) {
        std::cerr << "Failed to open " << currentURL << std::endl;
        return;
    }
    std::string content((std::istreambuf_iterator<char>(fileStream)), std::istreambuf_iterator<char>());
    fileStream.close();

    docContents[currentURL] = content;
    docLengths[currentURL] = content.length();

    std::map<std::string, int> freq = extractWordFrequencies(content);
    for (std::map<std::string, int>::const_iterator it = freq.begin(); it != freq.end(); ++it) {
        invertedIndex[it->first][currentURL] = it->second;
    }

    std::list<std::string> links = extractLinksFromHTML(content);
    outgoingLinksCount[currentURL] = links.size();
    std::string currentDir = getDirectory(currentURL);

    // Register backlinks for every outgoing link before recursing, so a
    // target's in-links are counted even when it was already visited.
    for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
        std::string targetURL = resolvePath(currentDir, *it);
        backlinks[targetURL].insert(currentURL);
    }

    for (std::list<std::string>::const_iterator it = links.begin(); it != links.end(); ++it) {
        std::string targetURL = resolvePath(currentDir, *it);
        if (!visited.count(targetURL)) {
            crawl(targetURL, visited);
        }
    }
}
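
// Ranking notes (editorial summary of the scoring in main below): for each
// candidate document d and query word w,
//   density(d) += tf(w, d) / (len(d) * (total_tf(w) / total_len)),
// each page p that links to d adds 1 / (1 + outdegree(p)) to the backlinks
// score, and the final score is 0.5 * density + 0.5 * backlinks.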

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <seedURL> <inputFile>" << std::endl;
        return 1;
    }

    std::string seedURL = argv[1];
    std::string inputFile = argv[2];
    std::set<std::string> visited;
    crawl(seedURL, visited);

    double totalLength = 0.0;
    for (std::map<std::string, int>::const_iterator it = docLengths.begin(); it != docLengths.end(); ++it) {
        totalLength += it->second;
    }

    std::ifstream inputStream(inputFile.c_str());
    if (!inputStream.is_open()) {
        std::cerr << "Failed to open " << inputFile << std::endl;
        return 1;
    }

    std::vector<std::string> queries;
    std::string line;
    while (std::getline(inputStream, line)) {
        queries.push_back(line);
    }
    inputStream.close();

    for (size_t i = 0; i < queries.size(); ++i) {
        std::string query = queries[i];
        std::string outputFile = "out" + std::to_string(i + 1) + ".txt";
        std::ofstream outStream(outputFile.c_str());
        if (!outStream.is_open()) {
            std::cerr << "Failed to open " << outputFile << std::endl;
            continue;
        }

        bool isPhraseSearch = (query.size() >= 2 && query[0] == '"' && query[query.size() - 1] == '"');
        std::string phrase;
        if (isPhraseSearch) {
            phrase = query.substr(1, query.size() - 2);
            query = phrase;
        }

        // Candidate set: documents containing every query word (AND semantics)
        std::vector<std::string> words = split(query);
        std::set<std::string> candidates;
        if (!words.empty()) {
            std::string firstWord = words[0];
            if (invertedIndex.count(firstWord)) {
                std::map<std::string, int> docs = invertedIndex[firstWord];
                for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                    candidates.insert(it->first);
                }
            }
            for (size_t j = 1; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex.count(word)) {
                    std::set<std::string> temp;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator it = docs.begin(); it != docs.end(); ++it) {
                        if (candidates.count(it->first)) {
                            temp.insert(it->first);
                        }
                    }
                    candidates = temp;
                } else {
                    candidates.clear();
                    break;
                }
            }
        }

        if (isPhraseSearch) {
            std::set<std::string> filtered;
            for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
                if (phraseExists(docContents[*it], phrase)) {
                    filtered.insert(*it);
                }
            }
            candidates = filtered;
        }

        if (candidates.empty()) {
            outStream << "Your search - " << queries[i] << " - did not match any documents." << std::endl;
            outStream.close();
            continue;
        }

        std::vector<std::pair<std::string, double> > scores;
        for (std::set<std::string>::const_iterator it = candidates.begin(); it != candidates.end(); ++it) {
            std::string doc = *it;
            double densityScore = 0.0;
            for (size_t j = 0; j < words.size(); ++j) {
                std::string word = words[j];
                if (invertedIndex[word].count(doc)) {
                    int freq = invertedIndex[word][doc];
                    double totalOccurrences = 0.0;
                    std::map<std::string, int> docs = invertedIndex[word];
                    for (std::map<std::string, int>::const_iterator docIt = docs.begin(); docIt != docs.end(); ++docIt) {
                        totalOccurrences += docIt->second;
                    }
                    double keywordDensityAcrossAll = totalOccurrences / totalLength;
                    densityScore += freq / (docLengths[doc] * keywordDensityAcrossAll);
                }
            }

            double backlinksScore = 0.0;
            if (backlinks.count(doc)) {
                std::set<std::string> linkers = backlinks[doc];
                for (std::set<std::string>::const_iterator linkIt = linkers.begin(); linkIt != linkers.end(); ++linkIt) {
                    backlinksScore += 1.0 / (1 + outgoingLinksCount[*linkIt]);
                }
            }

            double pageScore = 0.5 * densityScore + 0.5 * backlinksScore;
            scores.push_back(std::pair<std::string, double>(doc, pageScore));
        }

        std::sort(scores.begin(), scores.end(),
            [](const std::pair<std::string, double>& a, const std::pair<std::string, double>& b) {
                return a.second > b.second;
            });

        outStream << "Matching documents: " << std::endl;
        for (size_t j = 0; j < scores.size(); ++j) {
            std::string doc = scores[j].first;
            std::string content = docContents[doc];
            std::string title = getTitle(content);
            std::string description = getDescription(content);
            std::string body = getBody(content);
            std::string snippet = generateSnippet(body, query);

            outStream << std::endl << "Title: " << title << std::endl;
            outStream << "URL: " << doc << std::endl;
            outStream << "Description: " << description << std::endl;
            outStream << "Snippet: " << snippet << std::endl;
        }
        outStream.close();
    }

    return 0;
}
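
For reference, a plausible way to build and run this outside VS Code, matching the launch configuration above (the exact flags are assumptions; C++11 or later is required for std::to_string and the sort lambda):

g++ -std=c++11 main.cpp -o nysearch
./nysearch html_files/index.html input.txt

Each query line in input.txt then produces a ranked results file out<N>.txt (out1.txt, out2.txt, ...).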