265 lines
8.3 KiB
C++
265 lines
8.3 KiB
C++
#include "Utils.h"

#include <algorithm> // For std::min
#include <cctype>
#include <cstring>
#include <iostream> // For potential cerr usage, although not directly in these functions
#include <limits>   // For std::numeric_limits
|
|
|
|
// Parses a double-quoted string that starts at str[pos].
//
// On success, `value` receives the characters between the opening and the
// closing quote, `pos` is moved just past the closing quote, and true is
// returned. Backslash escapes are honoured when looking for the closing quote,
// so an escaped quote (\") does not terminate the value; the escape sequences
// themselves are copied verbatim (they are not decoded).
//
// Returns false (with `value` empty) when str[pos] is not a '"' or when the
// string is not terminated. In the unterminated case `pos` is left at
// str.length().
bool parseQuotedStringValue(const std::string& str, size_t& pos, std::string& value) {
    const size_t strLen = str.length();
    value.clear();
    if (pos >= strLen || str[pos] != '"') return false;

    ++pos;  // step over the opening quote
    const size_t startPos = pos;
    while (pos < strLen && str[pos] != '"') {
        // A backslash consumes the following character as well, so that
        // \" and \\ are treated as part of the value.
        pos += (str[pos] == '\\') ? 2 : 1;
    }
    if (pos >= strLen) {
        // Unterminated string; a trailing backslash may have pushed pos past the end.
        pos = strLen;
        return false;
    }

    value.assign(str, startPos, pos - startPos);
    ++pos;  // step over the closing quote
    return true;
}
|
|
|
|
// Reads a bare (unquoted) token from `str`, beginning at `pos`.
//
// The token extends until the first ',', '}', ']' or whitespace character, or
// until the end of the string. On success the token is stored in `value`,
// `pos` is left on the character that ended the token (or at str.length()),
// and true is returned. When no character could be consumed, `value` is empty,
// `pos` is unchanged and false is returned.
bool parseUnquotedValue(const std::string& str, size_t& pos, std::string& value) {
    const size_t limit = str.length();
    value.clear();

    const size_t begin = pos;
    for (; pos < limit; ++pos) {
        const char ch = str[pos];
        const bool isDelimiter = (ch == ',' || ch == '}' || ch == ']');
        if (isDelimiter || std::isspace(static_cast<unsigned char>(ch))) {
            break;
        }
    }

    if (pos == begin) {
        return false;
    }
    value.assign(str, begin, pos - begin);
    return true;
}
|
|
|
|
bool extractValue(const std::string& line, const std::string& key, std::string& value) {
|
|
const std::string searchKey = "\"" + key + "\":";
|
|
const char* found_pos = strstr(line.c_str(), searchKey.c_str());
|
|
if (!found_pos) return false;
|
|
|
|
size_t pos = (found_pos - line.c_str()) + searchKey.length();
|
|
const size_t lineLen = line.length();
|
|
|
|
while (pos < lineLen && std::isspace(static_cast<unsigned char>(line[pos]))) {
|
|
++pos;
|
|
}
|
|
if (pos >= lineLen) return false;
|
|
|
|
if (line[pos] == '"') {
|
|
return parseQuotedStringValue(line, pos, value);
|
|
} else {
|
|
return parseUnquotedValue(line, pos, value);
|
|
}
|
|
}
|
|
|
|
// Extracts the brace-delimited object that follows the first textual
// occurrence of `"key":` in `line`.
//
// Whitespace after the colon is skipped; the next character must be '{'.
// Braces are matched while ignoring any that appear inside double-quoted
// strings (a quote preceded by an unescaped backslash does not open or close
// a string). On success `subObj` receives the text from the opening brace
// through its matching closing brace, inclusive, and true is returned.
//
// Returns false and leaves `subObj` untouched when the key is missing, the
// value does not start with '{', or the braces never balance.
bool extractSubObject(const std::string& line, const std::string& key, std::string& subObj) {
    const std::string searchKey = "\"" + key + "\":";

    // std::string::find works on the full length of the string (an embedded
    // NUL does not cut the search short) and yields an index directly.
    const size_t keyPos = line.find(searchKey);
    if (keyPos == std::string::npos) return false;

    const size_t lineLen = line.length();
    size_t pos = keyPos + searchKey.length();
    while (pos < lineLen && std::isspace(static_cast<unsigned char>(line[pos]))) ++pos;

    if (pos >= lineLen || line[pos] != '{') return false;

    const size_t startBracePos = pos;
    int braceCount = 1;      // the opening brace at startBracePos
    bool inString = false;
    bool escaped = false;    // true when the previous character was an unescaped backslash

    for (++pos; pos < lineLen && braceCount > 0; ++pos) {
        const char c = line[pos];
        if (c == '"' && !escaped) {
            inString = !inString;
        } else if (!inString) {
            if (c == '{') {
                ++braceCount;
            } else if (c == '}') {
                --braceCount;
            }
        }
        // A backslash that is itself escaped (\\) must not escape what follows.
        escaped = (!escaped && c == '\\');
    }

    if (braceCount != 0) return false;

    // pos is one past the matching closing brace here.
    subObj.assign(line, startBracePos, pos - startBracePos);
    return true;
}
|
|
|
|
// Parses a base-10 integer of the form  -?[0-9]+  into `result`.
//
// The whole string must match: no leading '+', no whitespace, no trailing
// characters. Returns true and stores the value on success. Returns false and
// leaves `result` at 0 when the string is empty, contains a non-digit, has no
// digits after the sign, or names a value that does not fit in a `long`.
bool parseLongLong(const std::string& s, long& result) {
    result = 0;
    if (s.empty()) return false;

    size_t i = 0;
    const bool negative = (s[0] == '-');
    if (negative) ++i;
    if (i == s.size()) return false;  // a lone "-"

    const long minVal = std::numeric_limits<long>::min();
    const long maxVal = std::numeric_limits<long>::max();

    // Accumulate as a non-positive number: the negative range is at least as
    // large as the positive one, so the minimum value can be parsed too.
    long acc = 0;
    for (; i < s.size(); ++i) {
        const char ch = s[i];
        if (ch < '0' || ch > '9') return false;
        const long digit = ch - '0';
        // acc * 10 - digit would drop below minVal -> out of range.
        if (acc < (minVal + digit) / 10) return false;
        acc = acc * 10 - digit;
    }

    if (negative) {
        result = acc;
        return true;
    }
    if (acc < -maxVal) return false;  // magnitude exceeds the positive range
    result = -acc;
    return true;
}
|
|
|
|
|
|
bool parseLineForHashtags(const std::string& line, int inputOrder, StringInterner& interner,
|
|
VideoInfo& outVideo, std::string& outText)
|
|
{
|
|
outText.clear();
|
|
|
|
std::string id_str, coverUrl_str, webVideoUrl_str, playCount_str;
|
|
|
|
if (!extractValue(line, "id", id_str) || id_str.empty()) return false;
|
|
|
|
long playCount = 0;
|
|
if (extractValue(line, "playCount", playCount_str)) {
|
|
parseLongLong(playCount_str, playCount);
|
|
}
|
|
|
|
extractValue(line, "text", outText);
|
|
|
|
extractValue(line, "webVideoUrl", webVideoUrl_str);
|
|
std::string videoMetaSub;
|
|
if (extractSubObject(line, "videoMeta", videoMetaSub)) {
|
|
extractValue(videoMetaSub, "coverUrl", coverUrl_str);
|
|
}
|
|
|
|
outVideo = VideoInfo(
|
|
interner.intern(std::move(id_str)),
|
|
interner.intern(std::move(coverUrl_str)),
|
|
interner.intern(std::move(webVideoUrl_str)),
|
|
playCount,
|
|
inputOrder
|
|
);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool parseLineForSounds(const std::string& line, int inputOrder, StringInterner& interner,
|
|
VideoInfo& outVideo,
|
|
const std::string*& outMusicIdPtr,
|
|
const std::string*& outMusicNamePtr,
|
|
const std::string*& outMusicAuthorPtr)
|
|
{
|
|
std::string id_str, coverUrl_str, webVideoUrl_str, playCount_str;
|
|
std::string musicId_str, musicName_str, musicAuthor_str;
|
|
|
|
if (!extractValue(line, "id", id_str) || id_str.empty()) return false;
|
|
|
|
long playCount = 0;
|
|
if (extractValue(line, "playCount", playCount_str)) {
|
|
parseLongLong(playCount_str, playCount);
|
|
}
|
|
|
|
std::string musicMetaSub;
|
|
if (extractSubObject(line, "musicMeta", musicMetaSub)) {
|
|
extractValue(musicMetaSub, "musicId", musicId_str);
|
|
extractValue(musicMetaSub, "musicName", musicName_str);
|
|
extractValue(musicMetaSub, "musicAuthor", musicAuthor_str);
|
|
}
|
|
|
|
if (musicId_str.empty()) {
|
|
return false;
|
|
}
|
|
|
|
extractValue(line, "webVideoUrl", webVideoUrl_str);
|
|
std::string videoMetaSub;
|
|
if (extractSubObject(line, "videoMeta", videoMetaSub)) {
|
|
extractValue(videoMetaSub, "coverUrl", coverUrl_str);
|
|
}
|
|
|
|
outVideo = VideoInfo(
|
|
interner.intern(std::move(id_str)),
|
|
interner.intern(std::move(coverUrl_str)),
|
|
interner.intern(std::move(webVideoUrl_str)),
|
|
playCount,
|
|
inputOrder
|
|
);
|
|
outMusicIdPtr = interner.intern(std::move(musicId_str));
|
|
outMusicNamePtr = interner.intern(std::move(musicName_str));
|
|
outMusicAuthorPtr = interner.intern(std::move(musicAuthor_str));
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
void extractHashtags(const std::string& text,
|
|
std::unordered_map<const std::string*, HashtagInfo, StringPtrHash, StringPtrEqual>& hashtagData,
|
|
StringInterner& interner,
|
|
const VideoInfo& video)
|
|
{
|
|
const size_t textLen = text.length();
|
|
const char* textData = text.data();
|
|
size_t pos = 0;
|
|
std::string tag_buffer;
|
|
tag_buffer.reserve(50);
|
|
|
|
while (pos < textLen) {
|
|
while (pos < textLen && textData[pos] != '#') {
|
|
pos++;
|
|
}
|
|
if (pos >= textLen) break;
|
|
|
|
size_t start = pos + 1;
|
|
if (start >= textLen) break;
|
|
|
|
size_t end = start;
|
|
|
|
while (end < textLen && (std::isalnum(static_cast<unsigned char>(textData[end])) || textData[end] == '_')) {
|
|
end++;
|
|
}
|
|
|
|
if (end > start) {
|
|
tag_buffer.assign(textData + start, end - start);
|
|
const std::string* hashtagPtr = interner.intern(tag_buffer);
|
|
|
|
typedef std::unordered_map<const std::string*, HashtagInfo, StringPtrHash, StringPtrEqual> HashtagMapType;
|
|
HashtagMapType::iterator it = hashtagData.find(hashtagPtr);
|
|
|
|
if (it == hashtagData.end()) {
|
|
std::pair<HashtagMapType::iterator, bool> emplace_result =
|
|
hashtagData.emplace(hashtagPtr, HashtagInfo(hashtagPtr));
|
|
it = emplace_result.first;
|
|
}
|
|
|
|
it->second.usageCount++;
|
|
it->second.totalViews += video.playCount;
|
|
it->second.topVideos.add(video);
|
|
}
|
|
|
|
pos = end;
|
|
}
|
|
}
|
|
|
|
void extractSortAndPrintTop3Videos(std::ofstream& fout, TopKVideoHolder& topVideos) {
|
|
std::vector<VideoInfo> sortedTopVideos = topVideos.getSortedVideos();
|
|
|
|
int videosToPrint = std::min(static_cast<int>(sortedTopVideos.size()), TOP_K_CANDIDATES);
|
|
for (int i = 0; i < videosToPrint; ++i) {
|
|
const VideoInfo& video = sortedTopVideos[i];
|
|
|
|
fout << "cover url: " << (video.coverUrl && !video.coverUrl->empty() ? *video.coverUrl : "null") << "\n";
|
|
fout << "web video url: " << (video.webVideoUrl && !video.webVideoUrl->empty() ? *video.webVideoUrl : "null") << "\n";
|
|
}
|
|
} |