#include "Utils.h"

#include <algorithm>  // For std::min
#include <cctype>     // For std::isspace / std::isalnum
#include <cstring>
#include <fstream>
#include <iostream>   // For potential cerr usage, although not directly in these functions
#include <limits>     // For std::numeric_limits (overflow checks in parseLongLong)
#include <string>
#include <unordered_map>
#include <utility>    // For std::move
#include <vector>

// NOTE(review): the copy of this file that was reviewed had every `<...>` token
// stripped (include targets, static_cast targets, container template arguments).
// They have been reconstructed from how the values are used below. The
// reconstructed types -- in particular the map type taken by extractHashtags --
// must be confirmed against the declarations in Utils.h.

namespace {

// std::isspace / std::isalnum have undefined behaviour for negative char
// values, so every character is widened through unsigned char first.
inline bool isSpaceChar(char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
}

inline bool isHashtagChar(char c) {
    return std::isalnum(static_cast<unsigned char>(c)) != 0 || c == '_';
}

// Finds the first occurrence of `"key":` in `line` and returns the index of the
// first non-whitespace character after the colon. Returns std::string::npos
// when the key is absent or nothing but whitespace follows it.
size_t findValueStart(const std::string& line, const std::string& key) {
    const std::string searchKey = "\"" + key + "\":";
    const size_t keyPos = line.find(searchKey);
    if (keyPos == std::string::npos) return std::string::npos;

    const size_t lineLen = line.length();
    size_t pos = keyPos + searchKey.length();
    while (pos < lineLen && isSpaceChar(line[pos])) {
        ++pos;
    }
    return pos < lineLen ? pos : std::string::npos;
}

}  // namespace

// Parses a double-quoted string that starts at str[pos].
//
// On success `value` receives the raw characters between the quotes (escape
// sequences are kept verbatim, not decoded), `pos` is advanced past the closing
// quote and true is returned. Returns false, with `value` empty, when str[pos]
// is not a quote or the string is unterminated.
bool parseQuotedStringValue(const std::string& str, size_t& pos, std::string& value) {
    value.clear();
    const size_t strLen = str.length();
    if (pos >= strLen || str[pos] != '"') return false;

    const size_t startPos = ++pos;
    while (pos < strLen && str[pos] != '"') {
        // A backslash escapes the following character, so an escaped quote
        // (\") must not be mistaken for the end of the string.
        pos += (str[pos] == '\\') ? 2 : 1;
    }
    if (pos >= strLen) {
        pos = strLen;  // Unterminated: never leave pos beyond the end.
        return false;
    }

    value.assign(str, startPos, pos - startPos);
    ++pos;  // Consume the closing quote.
    return true;
}

// Parses an unquoted token (number, true/false/null, ...) starting at str[pos].
//
// The token ends at the first ',', '}', ']' or whitespace character, or at the
// end of the string. `pos` is left on that terminator. Returns false, with
// `value` empty, when the token would be empty.
bool parseUnquotedValue(const std::string& str, size_t& pos, std::string& value) {
    value.clear();
    const size_t strLen = str.length();
    const size_t startPos = pos;

    while (pos < strLen) {
        const char c = str[pos];
        if (c == ',' || c == '}' || c == ']' || isSpaceChar(c)) break;
        ++pos;
    }
    if (pos == startPos) return false;

    value.assign(str, startPos, pos - startPos);
    return true;
}

// Extracts the scalar value that follows the first `"key":` found in `line`.
//
// This is a plain substring search, not a structural JSON lookup: the first
// textual match wins regardless of nesting depth. Quoted values are returned
// without their quotes. Returns false when the key is missing or no value
// could be parsed; `value` is only modified once the key has been found.
bool extractValue(const std::string& line, const std::string& key, std::string& value) {
    size_t pos = findValueStart(line, key);
    if (pos == std::string::npos) return false;

    return line[pos] == '"' ? parseQuotedStringValue(line, pos, value)
                            : parseUnquotedValue(line, pos, value);
}

// Extracts the `{...}` object that follows the first `"key":` found in `line`.
//
// On success `subObj` receives the object text including both braces. Braces
// inside string literals are ignored. Returns false, leaving `subObj`
// untouched, when the key is missing, the value is not an object, or the
// braces are unbalanced.
bool extractSubObject(const std::string& line, const std::string& key, std::string& subObj) {
    const size_t startBracePos = findValueStart(line, key);
    if (startBracePos == std::string::npos || line[startBracePos] != '{') return false;

    const size_t lineLen = line.length();
    size_t pos = startBracePos + 1;
    int braceCount = 1;
    bool inString = false;
    char prevChar = 0;

    while (pos < lineLen && braceCount > 0) {
        const char c = line[pos];
        if (c == '"' && prevChar != '\\') {
            inString = !inString;
        } else if (!inString) {
            if (c == '{') {
                ++braceCount;
            } else if (c == '}') {
                --braceCount;
            }
        }
        // An escaped backslash (\\) must not escape the character after it,
        // so the "previous char" is reset once the pair has been consumed.
        prevChar = (prevChar == '\\' && c == '\\') ? 0 : c;
        ++pos;
    }

    if (braceCount != 0) return false;
    subObj.assign(line, startBracePos, pos - startBracePos);
    return true;
}

// Parses a base-10 integer with an optional leading '-' into `result`.
//
// Returns false and sets `result` to 0 when `s` is empty, contains anything
// other than digits after the optional sign, or does not fit in a long.
bool parseLongLong(const std::string& s, long& result) {
    result = 0;
    if (s.empty()) return false;

    size_t i = 0;
    const bool negative = (s[0] == '-');
    if (negative) i = 1;
    if (i >= s.size()) return false;  // A lone "-" is not a number.

    // Accumulate as a non-positive number: the negative range of long is one
    // larger than the positive range, so this covers the minimum value too
    // and never performs a signed overflow.
    const long limit = negative ? std::numeric_limits<long>::min()
                                : -std::numeric_limits<long>::max();
    long acc = 0;
    for (; i < s.size(); ++i) {
        const char c = s[i];
        if (c < '0' || c > '9') return false;
        const long digit = c - '0';
        // Overflow iff acc * 10 - digit < limit.
        if (acc < (limit + digit) / 10) return false;
        acc = acc * 10 - digit;
    }

    result = negative ? acc : -acc;
    return true;
}

namespace {

// Reads the "playCount" field of `line`; yields 0 when it is absent or invalid.
long extractPlayCount(const std::string& line) {
    long playCount = 0;
    std::string raw;
    if (extractValue(line, "playCount", raw)) {
        parseLongLong(raw, playCount);  // Leaves playCount at 0 on failure.
    }
    return playCount;
}

// Extracts the URL fields of `line` and builds the VideoInfo for it.
// `id` is consumed (moved into the interner).
VideoInfo buildVideoInfo(const std::string& line, std::string& id, long playCount,
                         int inputOrder, StringInterner& interner) {
    std::string webVideoUrl;
    extractValue(line, "webVideoUrl", webVideoUrl);

    std::string coverUrl;
    std::string videoMetaSub;
    if (extractSubObject(line, "videoMeta", videoMetaSub)) {
        extractValue(videoMetaSub, "coverUrl", coverUrl);
    }

    // Intern in separate statements: the evaluation order of function
    // arguments is unspecified, so this keeps the interning order fixed.
    const std::string* idPtr = interner.intern(std::move(id));
    const std::string* coverPtr = interner.intern(std::move(coverUrl));
    const std::string* webPtr = interner.intern(std::move(webVideoUrl));
    return VideoInfo(idPtr, coverPtr, webPtr, playCount, inputOrder);
}

// Writes "<label><url>\n", substituting "null" for a missing or empty url.
void printUrlLine(std::ostream& out, const char* label, const std::string* url) {
    out << label;
    if (url && !url->empty()) {
        out << *url;
    } else {
        out << "null";
    }
    out << "\n";
}

}  // namespace

// Parses one input line for the hashtag pass.
//
// Fills `outVideo` with the video's id, cover URL, web URL, play count and
// `inputOrder`, and `outText` with the raw "text" field (empty when absent).
// Returns false, with `outText` empty and `outVideo` untouched, when the line
// has no non-empty "id".
bool parseLineForHashtags(const std::string& line, int inputOrder, StringInterner& interner,
                          VideoInfo& outVideo, std::string& outText) {
    outText.clear();

    std::string id_str;
    if (!extractValue(line, "id", id_str) || id_str.empty()) return false;

    const long playCount = extractPlayCount(line);
    extractValue(line, "text", outText);

    outVideo = buildVideoInfo(line, id_str, playCount, inputOrder, interner);
    return true;
}

// Parses one input line for the sounds pass.
//
// Fills `outVideo` like parseLineForHashtags does and sets the three music
// pointers to the interned "musicId", "musicName" and "musicAuthor" values read
// from the "musicMeta" sub-object. Returns false, leaving every output
// untouched, when the line has no non-empty "id" or no non-empty music id.
bool parseLineForSounds(const std::string& line, int inputOrder, StringInterner& interner,
                        VideoInfo& outVideo, const std::string*& outMusicIdPtr,
                        const std::string*& outMusicNamePtr,
                        const std::string*& outMusicAuthorPtr) {
    std::string id_str;
    if (!extractValue(line, "id", id_str) || id_str.empty()) return false;

    const long playCount = extractPlayCount(line);

    std::string musicId_str, musicName_str, musicAuthor_str;
    std::string musicMetaSub;
    if (extractSubObject(line, "musicMeta", musicMetaSub)) {
        extractValue(musicMetaSub, "musicId", musicId_str);
        extractValue(musicMetaSub, "musicName", musicName_str);
        extractValue(musicMetaSub, "musicAuthor", musicAuthor_str);
    }
    if (musicId_str.empty()) return false;

    outVideo = buildVideoInfo(line, id_str, playCount, inputOrder, interner);
    outMusicIdPtr = interner.intern(std::move(musicId_str));
    outMusicNamePtr = interner.intern(std::move(musicName_str));
    outMusicAuthorPtr = interner.intern(std::move(musicAuthor_str));
    return true;
}

// Scans `text` for hashtags ('#' followed by one or more alphanumeric or '_'
// characters) and records `video` against each occurrence.
//
// For every hashtag found, its entry in `hashtagData` (created on first sight)
// has usageCount incremented, video.playCount added to totalViews and the
// video offered to topVideos. A hashtag that occurs several times in the same
// text is counted once per occurrence.
//
// NOTE(review): the template arguments of the map were missing from the
// reviewed source; key and mapped type are inferred from the find/emplace
// calls below. If Utils.h declares custom hash/equality arguments, add them
// here so the definition matches the declaration.
void extractHashtags(const std::string& text,
                     std::unordered_map<const std::string*, HashtagInfo>& hashtagData,
                     StringInterner& interner, const VideoInfo& video) {
    const size_t textLen = text.length();
    std::string tag_buffer;
    tag_buffer.reserve(50);

    size_t pos = 0;
    while (pos < textLen) {
        const size_t hashPos = text.find('#', pos);
        if (hashPos == std::string::npos) break;

        const size_t start = hashPos + 1;
        if (start >= textLen) break;  // Trailing '#' with nothing after it.

        size_t end = start;
        while (end < textLen && isHashtagChar(text[end])) {
            ++end;
        }

        if (end > start) {
            tag_buffer.assign(text, start, end - start);
            const std::string* hashtagPtr = interner.intern(tag_buffer);

            auto it = hashtagData.find(hashtagPtr);
            if (it == hashtagData.end()) {
                it = hashtagData.emplace(hashtagPtr, HashtagInfo(hashtagPtr)).first;
            }
            it->second.usageCount++;
            it->second.totalViews += video.playCount;
            it->second.topVideos.add(video);
        }
        pos = end;
    }
}

// Writes the cover URL and web video URL of the best videos held by
// `topVideos` to `fout`, in the order returned by getSortedVideos(). At most
// TOP_K_CANDIDATES videos are printed; a missing or empty URL is printed as
// "null".
void extractSortAndPrintTop3Videos(std::ofstream& fout, TopKVideoHolder& topVideos) {
    const std::vector<VideoInfo> sortedTopVideos = topVideos.getSortedVideos();
    // Explicit <int> so the call does not depend on TOP_K_CANDIDATES' exact type.
    const int videosToPrint =
        std::min<int>(static_cast<int>(sortedTopVideos.size()), TOP_K_CANDIDATES);

    for (int i = 0; i < videosToPrint; ++i) {
        const VideoInfo& video = sortedTopVideos[i];
        printUrlLine(fout, "cover url: ", video.coverUrl);
        printUrlLine(fout, "web video url: ", video.webVideoUrl);
    }
}