登录
注册
开源
企业版
高校版
搜索
帮助中心
使用条款
关于我们
开源
企业版
高校版
私有云
模力方舟
AI 队友
登录
注册
Gitee 年度开源项目评选结果正式揭晓,速戳👉
代码拉取完成,页面将自动刷新
捐赠
捐赠前请先登录
取消
前往登录
扫描微信二维码支付
取消
支付完成
支付提示
将跳转至支付宝完成支付
确定
取消
Watch
不关注
关注所有动态
仅关注版本发行动态
关注但不提醒动态
1
Star
0
Fork
0
NXX
/
3123003122
代码
Issues
1
Pull Requests
0
Wiki
统计
流水线
服务
质量分析
Jenkins for Gitee
腾讯云托管
腾讯云 Serverless
悬镜安全
阿里云 SAE
Codeblitz
SBOM
我知道了,不再自动展开
更新失败,请稍后重试!
移除标识
内容风险标识
本任务被
标识为内容中包含有代码安全 Bug 、隐私泄露等敏感信息,仓库外成员不可访问
第2次作业
待办的
#ICZFN6
NXX
拥有者
创建于
2025-09-23 23:52
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>

using namespace std;

// A single UTF-8 encoded character held as up to four raw bytes.
// Bytes that the encoding does not use are left at zero.
struct UTF8Char {
    uint8_t byte1;
    uint8_t byte2;
    uint8_t byte3;
    uint8_t byte4;

    // Byte-wise lexicographic order; lets UTF8Char serve as a std::map key.
    bool operator<(const UTF8Char& other) const {
        return std::tie(byte1, byte2, byte3, byte4) <
               std::tie(other.byte1, other.byte2, other.byte3, other.byte4);
    }

    bool operator==(const UTF8Char& other) const {
        return std::tie(byte1, byte2, byte3, byte4) ==
               std::tie(other.byte1, other.byte2, other.byte3, other.byte4);
    }
};

// Everything the similarity metrics need to know about one input file.
struct FileData {
    vector<UTF8Char> bytes;               // all characters, in file order
    int cnt = 0;                          // number of characters read
    vector<vector<UTF8Char>> lines;       // characters grouped by line; a line keeps its trailing '\n'
    vector<map<UTF8Char, int>> lineMaps;  // per-line character counts, filled by countUTF8Characters
};

// Reads one UTF-8 character from `file`.
// The lead byte decides how many continuation bytes (0-3) are consumed.
// Returns an all-zero UTF8Char when nothing could be read. If the stream ends
// in the middle of a multi-byte sequence, the missing bytes stay zero instead
// of being filled with a stale copy of the previously read byte.
UTF8Char read_utf8_char(ifstream& file) {
    UTF8Char utf8_char = {0, 0, 0, 0};
    char c;
    if (!file.get(c)) return utf8_char;
    utf8_char.byte1 = static_cast<uint8_t>(c);

    int continuation_bytes = 0;
    if ((utf8_char.byte1 & 0xE0) == 0xC0) {
        continuation_bytes = 1;
    } else if ((utf8_char.byte1 & 0xF0) == 0xE0) {
        continuation_bytes = 2;
    } else if ((utf8_char.byte1 & 0xF8) == 0xF0) {
        continuation_bytes = 3;
    }

    uint8_t* const slots[] = {&utf8_char.byte2, &utf8_char.byte3, &utf8_char.byte4};
    for (int i = 0; i < continuation_bytes; ++i) {
        if (!file.get(c)) break;  // truncated sequence
        *slots[i] = static_cast<uint8_t>(c);
    }
    return utf8_char;
}

// True when `c` is a line feed (0x0A).
bool is_newline(const UTF8Char& c) {
    return c.byte1 == 0x0A && c.byte2 == 0 && c.byte3 == 0 && c.byte4 == 0;
}

// True when `c` is a carriage return (0x0D).
bool is_newlines(const UTF8Char& c) {
    return c.byte1 == 0x0D && c.byte2 == 0 && c.byte3 == 0 && c.byte4 == 0;
}

// True when `c` is an ASCII space. Despite the name, punctuation is not
// checked; only 0x20 is recognised.
bool is_whitespace_or_punctuation(const UTF8Char& c) {
    return c.byte1 == 0x20 && c.byte2 == 0 && c.byte3 == 0 && c.byte4 == 0;
}

// True for either line terminator character (LF or CR).
static bool is_line_break(const UTF8Char& c) {
    return is_newline(c) || is_newlines(c);
}

// Loads `filename` in binary mode and splits it into lines at each LF.
// Returns an empty FileData (cnt == 0, no lines) when the file cannot be opened.
FileData read_file_data(const string& filename) {
    FileData data;
    ifstream file(filename, ios::binary);
    if (!file.is_open()) {
        return data;
    }

    vector<UTF8Char> current_line;
    while (true) {
        const UTF8Char c = read_utf8_char(file);
        // An all-zero first byte together with a failed stream means nothing
        // was read (end of input). A zero byte read from a healthy stream is a
        // real NUL in the file and must not end the scan early.
        if (c.byte1 == 0 && !file) break;

        data.bytes.push_back(c);
        current_line.push_back(c);
        if (is_newline(c)) {
            data.lines.push_back(current_line);
            current_line.clear();
        }
        data.cnt++;
    }
    // A final line without a trailing LF still counts as a line.
    if (!current_line.empty()) {
        data.lines.push_back(current_line);
    }
    return data;
}

// Rebuilds fileData.lineMaps: one character-frequency map per entry of
// fileData.lines. LF and CR are excluded so that CRLF and LF files compare
// equal and a blank line produces an empty map.
void countUTF8Characters(FileData& fileData) {
    fileData.lineMaps.clear();
    fileData.lineMaps.reserve(fileData.lines.size());
    for (const auto& line : fileData.lines) {
        map<UTF8Char, int> charCount;
        for (const auto& ch : line) {
            if (!is_line_break(ch)) {
                charCount[ch]++;
            }
        }
        fileData.lineMaps.push_back(charCount);
    }
}

// Cosine similarity of two character-frequency vectors.
// Two empty vectors count as identical (1.0); exactly one empty gives 0.0.
static double line_cosine(const map<UTF8Char, int>& lhs, const map<UTF8Char, int>& rhs) {
    if (lhs.empty() && rhs.empty()) return 1.0;
    if (lhs.empty() || rhs.empty()) return 0.0;

    double dot = 0.0;
    for (const auto& entry : lhs) {
        const auto match = rhs.find(entry.first);
        if (match != rhs.end()) {
            dot += static_cast<double>(entry.second) * match->second;
        }
    }

    double squares_lhs = 0.0;
    for (const auto& entry : lhs) {
        squares_lhs += static_cast<double>(entry.second) * entry.second;
    }
    double squares_rhs = 0.0;
    for (const auto& entry : rhs) {
        squares_rhs += static_cast<double>(entry.second) * entry.second;
    }

    const double norm_lhs = sqrt(squares_lhs);
    const double norm_rhs = sqrt(squares_rhs);
    // Guards against maps that hold only zero counts.
    if (norm_lhs == 0 || norm_rhs == 0) return 0.0;
    return dot / (norm_lhs * norm_rhs);
}

// Character-frequency cosine similarity.
// Lines are paired by index; the result is the arithmetic mean of the
// per-line cosine over the lines both files have. Returns 0.0 when either
// file has no lineMaps (countUTF8Characters must have been called first).
double calculateCosineSimilarity(const FileData& file1, const FileData& file2) {
    const size_t paired = min(file1.lineMaps.size(), file2.lineMaps.size());
    if (paired == 0) {
        return 0.0;
    }

    double sum = 0.0;
    for (size_t i = 0; i < paired; ++i) {
        sum += line_cosine(file1.lineMaps[i], file2.lineMaps[i]);
    }
    return sum / static_cast<double>(paired);
}

// Copy of `line` without LF / CR characters.
static vector<UTF8Char> strip_line_breaks(const vector<UTF8Char>& line) {
    vector<UTF8Char> content;
    content.reserve(line.size());
    for (const auto& ch : line) {
        if (!is_line_break(ch)) {
            content.push_back(ch);
        }
    }
    return content;
}

// Length of the longest common subsequence of `a` and `b`.
// Uses two rolling rows sized by the shorter input, so memory is
// O(min(|a|, |b|)) instead of a full |a| x |b| table.
static size_t lcs_length(const vector<UTF8Char>& a, const vector<UTF8Char>& b) {
    const vector<UTF8Char>& longer = a.size() >= b.size() ? a : b;
    const vector<UTF8Char>& shorter = a.size() >= b.size() ? b : a;

    vector<size_t> prev(shorter.size() + 1, 0);
    vector<size_t> curr(shorter.size() + 1, 0);
    for (size_t i = 1; i <= longer.size(); ++i) {
        for (size_t j = 1; j <= shorter.size(); ++j) {
            if (longer[i - 1] == shorter[j - 1]) {
                curr[j] = prev[j - 1] + 1;
            } else {
                curr[j] = max(prev[j], curr[j - 1]);
            }
        }
        prev.swap(curr);
    }
    return prev[shorter.size()];
}

// Per-line longest-common-subsequence similarity.
// Lines are paired by index. For each pair the score is
// LCS length / shorter line length, computed on the line content with LF / CR
// removed. Two blank lines score 1.0 (matching the cosine metric); a blank
// line against a non-blank one scores 0.0.
// The sum is divided by the LARGER line count, so lines that exist in only
// one file pull the result down. Returns 0.0 when either file has no lines.
double calculateDoublePointerSimilarity(const FileData& file1, const FileData& file2) {
    if (file1.lines.empty() || file2.lines.empty()) {
        return 0.0;
    }

    const size_t paired = min(file1.lines.size(), file2.lines.size());
    const size_t longest = max(file1.lines.size(), file2.lines.size());

    double sum = 0.0;
    for (size_t i = 0; i < paired; ++i) {
        const vector<UTF8Char> content1 = strip_line_breaks(file1.lines[i]);
        const vector<UTF8Char> content2 = strip_line_breaks(file2.lines[i]);

        if (content1.empty() && content2.empty()) {
            sum += 1.0;
            continue;
        }
        if (content1.empty() || content2.empty()) {
            continue;  // contributes 0.0
        }

        const size_t shorter = min(content1.size(), content2.size());
        sum += static_cast<double>(lcs_length(content1, content2)) /
               static_cast<double>(shorter);
    }
    return sum / static_cast<double>(longest);
}
最终相似度融合 double calculateFinalSimilarity(const FileData& file1, const FileData& file2) { double cosineSim = calculateCosineSimilarity(file1, file2); double pointerSim = calculateDoublePointerSimilarity(file1, file2); // 权重融合:字符统计相似度×15% + 双指针相似度×85% double finalSimilarity = cosineSim * 0.15 + pointerSim * 0.85; return finalSimilarity; } int main(int argc, char* argv[]) { if (argc != 4) { cerr << "用法: " << argv[0] << " <原文文件> <抄袭版文件> <输出文件>" << endl; return 1; } // 读取文件 FileData originalFile = read_file_data(argv[1]); FileData plagiarizedFile = read_file_data(argv[2]); string outputFile = argv[3]; // 字符统计 countUTF8Characters(originalFile); countUTF8Characters(plagiarizedFile); // 计算相似度 double cosineSimilarity = calculateCosineSimilarity(originalFile, plagiarizedFile); double pointerSimilarity = calculateDoublePointerSimilarity(originalFile, plagiarizedFile); double finalSimilarity = calculateFinalSimilarity(originalFile, plagiarizedFile); // 输出到文件 ofstream outFile(outputFile); if (outFile.is_open()) { outFile.precision(2); outFile << fixed << finalSimilarity; outFile.close(); } else { return 1; } // 控制台输出(调试信息) /* printf("字符统计余弦相似度: %.2f\n", cosineSimilarity); printf("双指针区间匹配相似度: %.2f\n", pointerSimilarity); printf("最终相似度: %.2f\n", finalSimilarity); */ return 0; } /* C:\Users\THINKPAD\source\repos\Project1\Project1\main.cpp C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\source\repos\Project1\Project1\1.txt C:\Users\THINKPAD\source\repos\Project1\Project1\2.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\source\repos\Project1\Project1\1.txt C:\Users\THINKPAD\source\repos\Project1\Project1\2.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_del.txt 
C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_add.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_dis_1.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_dis_10.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_dis_15.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt */
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>

using namespace std;

// A single UTF-8 encoded character held as up to four raw bytes.
// Bytes that the encoding does not use are left at zero.
struct UTF8Char {
    uint8_t byte1;
    uint8_t byte2;
    uint8_t byte3;
    uint8_t byte4;

    // Byte-wise lexicographic order; lets UTF8Char serve as a std::map key.
    bool operator<(const UTF8Char& other) const {
        return std::tie(byte1, byte2, byte3, byte4) <
               std::tie(other.byte1, other.byte2, other.byte3, other.byte4);
    }

    bool operator==(const UTF8Char& other) const {
        return std::tie(byte1, byte2, byte3, byte4) ==
               std::tie(other.byte1, other.byte2, other.byte3, other.byte4);
    }
};

// Everything the similarity metrics need to know about one input file.
struct FileData {
    vector<UTF8Char> bytes;               // all characters, in file order
    int cnt = 0;                          // number of characters read
    vector<vector<UTF8Char>> lines;       // characters grouped by line; a line keeps its trailing '\n'
    vector<map<UTF8Char, int>> lineMaps;  // per-line character counts, filled by countUTF8Characters
};

// Reads one UTF-8 character from `file`.
// The lead byte decides how many continuation bytes (0-3) are consumed.
// Returns an all-zero UTF8Char when nothing could be read. If the stream ends
// in the middle of a multi-byte sequence, the missing bytes stay zero instead
// of being filled with a stale copy of the previously read byte.
UTF8Char read_utf8_char(ifstream& file) {
    UTF8Char utf8_char = {0, 0, 0, 0};
    char c;
    if (!file.get(c)) return utf8_char;
    utf8_char.byte1 = static_cast<uint8_t>(c);

    int continuation_bytes = 0;
    if ((utf8_char.byte1 & 0xE0) == 0xC0) {
        continuation_bytes = 1;
    } else if ((utf8_char.byte1 & 0xF0) == 0xE0) {
        continuation_bytes = 2;
    } else if ((utf8_char.byte1 & 0xF8) == 0xF0) {
        continuation_bytes = 3;
    }

    uint8_t* const slots[] = {&utf8_char.byte2, &utf8_char.byte3, &utf8_char.byte4};
    for (int i = 0; i < continuation_bytes; ++i) {
        if (!file.get(c)) break;  // truncated sequence
        *slots[i] = static_cast<uint8_t>(c);
    }
    return utf8_char;
}

// True when `c` is a line feed (0x0A).
bool is_newline(const UTF8Char& c) {
    return c.byte1 == 0x0A && c.byte2 == 0 && c.byte3 == 0 && c.byte4 == 0;
}

// True when `c` is a carriage return (0x0D).
bool is_newlines(const UTF8Char& c) {
    return c.byte1 == 0x0D && c.byte2 == 0 && c.byte3 == 0 && c.byte4 == 0;
}

// True when `c` is an ASCII space. Despite the name, punctuation is not
// checked; only 0x20 is recognised.
bool is_whitespace_or_punctuation(const UTF8Char& c) {
    return c.byte1 == 0x20 && c.byte2 == 0 && c.byte3 == 0 && c.byte4 == 0;
}

// True for either line terminator character (LF or CR).
static bool is_line_break(const UTF8Char& c) {
    return is_newline(c) || is_newlines(c);
}

// Loads `filename` in binary mode and splits it into lines at each LF.
// Returns an empty FileData (cnt == 0, no lines) when the file cannot be opened.
FileData read_file_data(const string& filename) {
    FileData data;
    ifstream file(filename, ios::binary);
    if (!file.is_open()) {
        return data;
    }

    vector<UTF8Char> current_line;
    while (true) {
        const UTF8Char c = read_utf8_char(file);
        // An all-zero first byte together with a failed stream means nothing
        // was read (end of input). A zero byte read from a healthy stream is a
        // real NUL in the file and must not end the scan early.
        if (c.byte1 == 0 && !file) break;

        data.bytes.push_back(c);
        current_line.push_back(c);
        if (is_newline(c)) {
            data.lines.push_back(current_line);
            current_line.clear();
        }
        data.cnt++;
    }
    // A final line without a trailing LF still counts as a line.
    if (!current_line.empty()) {
        data.lines.push_back(current_line);
    }
    return data;
}

// Rebuilds fileData.lineMaps: one character-frequency map per entry of
// fileData.lines. LF and CR are excluded so that CRLF and LF files compare
// equal and a blank line produces an empty map.
void countUTF8Characters(FileData& fileData) {
    fileData.lineMaps.clear();
    fileData.lineMaps.reserve(fileData.lines.size());
    for (const auto& line : fileData.lines) {
        map<UTF8Char, int> charCount;
        for (const auto& ch : line) {
            if (!is_line_break(ch)) {
                charCount[ch]++;
            }
        }
        fileData.lineMaps.push_back(charCount);
    }
}

// Cosine similarity of two character-frequency vectors.
// Two empty vectors count as identical (1.0); exactly one empty gives 0.0.
static double line_cosine(const map<UTF8Char, int>& lhs, const map<UTF8Char, int>& rhs) {
    if (lhs.empty() && rhs.empty()) return 1.0;
    if (lhs.empty() || rhs.empty()) return 0.0;

    double dot = 0.0;
    for (const auto& entry : lhs) {
        const auto match = rhs.find(entry.first);
        if (match != rhs.end()) {
            dot += static_cast<double>(entry.second) * match->second;
        }
    }

    double squares_lhs = 0.0;
    for (const auto& entry : lhs) {
        squares_lhs += static_cast<double>(entry.second) * entry.second;
    }
    double squares_rhs = 0.0;
    for (const auto& entry : rhs) {
        squares_rhs += static_cast<double>(entry.second) * entry.second;
    }

    const double norm_lhs = sqrt(squares_lhs);
    const double norm_rhs = sqrt(squares_rhs);
    // Guards against maps that hold only zero counts.
    if (norm_lhs == 0 || norm_rhs == 0) return 0.0;
    return dot / (norm_lhs * norm_rhs);
}

// Character-frequency cosine similarity.
// Lines are paired by index; the result is the arithmetic mean of the
// per-line cosine over the lines both files have. Returns 0.0 when either
// file has no lineMaps (countUTF8Characters must have been called first).
double calculateCosineSimilarity(const FileData& file1, const FileData& file2) {
    const size_t paired = min(file1.lineMaps.size(), file2.lineMaps.size());
    if (paired == 0) {
        return 0.0;
    }

    double sum = 0.0;
    for (size_t i = 0; i < paired; ++i) {
        sum += line_cosine(file1.lineMaps[i], file2.lineMaps[i]);
    }
    return sum / static_cast<double>(paired);
}

// Copy of `line` without LF / CR characters.
static vector<UTF8Char> strip_line_breaks(const vector<UTF8Char>& line) {
    vector<UTF8Char> content;
    content.reserve(line.size());
    for (const auto& ch : line) {
        if (!is_line_break(ch)) {
            content.push_back(ch);
        }
    }
    return content;
}

// Length of the longest common subsequence of `a` and `b`.
// Uses two rolling rows sized by the shorter input, so memory is
// O(min(|a|, |b|)) instead of a full |a| x |b| table.
static size_t lcs_length(const vector<UTF8Char>& a, const vector<UTF8Char>& b) {
    const vector<UTF8Char>& longer = a.size() >= b.size() ? a : b;
    const vector<UTF8Char>& shorter = a.size() >= b.size() ? b : a;

    vector<size_t> prev(shorter.size() + 1, 0);
    vector<size_t> curr(shorter.size() + 1, 0);
    for (size_t i = 1; i <= longer.size(); ++i) {
        for (size_t j = 1; j <= shorter.size(); ++j) {
            if (longer[i - 1] == shorter[j - 1]) {
                curr[j] = prev[j - 1] + 1;
            } else {
                curr[j] = max(prev[j], curr[j - 1]);
            }
        }
        prev.swap(curr);
    }
    return prev[shorter.size()];
}

// Per-line longest-common-subsequence similarity.
// Lines are paired by index. For each pair the score is
// LCS length / shorter line length, computed on the line content with LF / CR
// removed. Two blank lines score 1.0 (matching the cosine metric); a blank
// line against a non-blank one scores 0.0.
// The sum is divided by the LARGER line count, so lines that exist in only
// one file pull the result down. Returns 0.0 when either file has no lines.
double calculateDoublePointerSimilarity(const FileData& file1, const FileData& file2) {
    if (file1.lines.empty() || file2.lines.empty()) {
        return 0.0;
    }

    const size_t paired = min(file1.lines.size(), file2.lines.size());
    const size_t longest = max(file1.lines.size(), file2.lines.size());

    double sum = 0.0;
    for (size_t i = 0; i < paired; ++i) {
        const vector<UTF8Char> content1 = strip_line_breaks(file1.lines[i]);
        const vector<UTF8Char> content2 = strip_line_breaks(file2.lines[i]);

        if (content1.empty() && content2.empty()) {
            sum += 1.0;
            continue;
        }
        if (content1.empty() || content2.empty()) {
            continue;  // contributes 0.0
        }

        const size_t shorter = min(content1.size(), content2.size());
        sum += static_cast<double>(lcs_length(content1, content2)) /
               static_cast<double>(shorter);
    }
    return sum / static_cast<double>(longest);
}
最终相似度融合 double calculateFinalSimilarity(const FileData& file1, const FileData& file2) { double cosineSim = calculateCosineSimilarity(file1, file2); double pointerSim = calculateDoublePointerSimilarity(file1, file2); // 权重融合:字符统计相似度×15% + 双指针相似度×85% double finalSimilarity = cosineSim * 0.15 + pointerSim * 0.85; return finalSimilarity; } int main(int argc, char* argv[]) { if (argc != 4) { cerr << "用法: " << argv[0] << " <原文文件> <抄袭版文件> <输出文件>" << endl; return 1; } // 读取文件 FileData originalFile = read_file_data(argv[1]); FileData plagiarizedFile = read_file_data(argv[2]); string outputFile = argv[3]; // 字符统计 countUTF8Characters(originalFile); countUTF8Characters(plagiarizedFile); // 计算相似度 double cosineSimilarity = calculateCosineSimilarity(originalFile, plagiarizedFile); double pointerSimilarity = calculateDoublePointerSimilarity(originalFile, plagiarizedFile); double finalSimilarity = calculateFinalSimilarity(originalFile, plagiarizedFile); // 输出到文件 ofstream outFile(outputFile); if (outFile.is_open()) { outFile.precision(2); outFile << fixed << finalSimilarity; outFile.close(); } else { return 1; } // 控制台输出(调试信息) /* printf("字符统计余弦相似度: %.2f\n", cosineSimilarity); printf("双指针区间匹配相似度: %.2f\n", pointerSimilarity); printf("最终相似度: %.2f\n", finalSimilarity); */ return 0; } /* C:\Users\THINKPAD\source\repos\Project1\Project1\main.cpp C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\source\repos\Project1\Project1\1.txt C:\Users\THINKPAD\source\repos\Project1\Project1\2.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\source\repos\Project1\Project1\1.txt C:\Users\THINKPAD\source\repos\Project1\Project1\2.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_del.txt 
C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_add.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_dis_1.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_dis_10.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt C:\Users\THINKPAD\source\repos\Project1\Debug\Project1.exe C:\Users\THINKPAD\Downloads\测试文本(1)\orig.txt C:\Users\THINKPAD\Downloads\测试文本(1)\orig_0.8_dis_15.txt C:\Users\THINKPAD\source\repos\Project1\Project1\out.txt */
评论 (
0
)
登录
后才可以发表评论
状态
待办的
待办的
进行中
已完成
已关闭
负责人
未设置
标签
未设置
标签管理
里程碑
未关联里程碑
未关联里程碑
Pull Requests
未关联
未关联
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
未关联
未关联
master
开始日期   -   截止日期
-
置顶选项
不置顶
置顶等级:高
置顶等级:中
置顶等级:低
优先级
不指定
严重
主要
次要
不重要
参与者(1)
C++
1
https://gitee.com/NXX_code/3123003122.git
git@gitee.com:NXX_code/3123003122.git
NXX_code
3123003122
3123003122
点此查找更多帮助
搜索帮助
Git 命令在线学习
如何在 Gitee 导入 GitHub 仓库
Git 仓库基础操作
企业版和社区版功能对比
SSH 公钥设置
如何处理代码冲突
仓库体积过大,如何减小?
如何找回被删除的仓库数据
Gitee 产品配额说明
GitHub仓库快速导入Gitee及同步更新
什么是 Release(发行版)
将 PHP 项目自动发布到 packagist.org
评论
仓库举报
回到顶部
登录提示
该操作需登录 Gitee 帐号,请先登录后再操作。
立即登录
没有帐号,去注册