#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#pragma GCC optimize("Ofast")
// Shorthand typedefs for long integer types
typedef long long ll;
typedef unsigned long long ull;
// Struct definitions
// Record for one non-stop word
typedef struct NonStopWord
{
char word[100];
int count;
int status; // 0 = slot empty, 1 = slot occupied (word stored)
} NonStopWord;
// Hash table of non-stop words
typedef struct NonStopWordHashTable
{
int tableSize;
NonStopWord *table;
} NonStopWordHashTable;
// Stop-word tree, implemented as a prefix trie
typedef struct StopWordsTree
{
int cnt;
struct StopWordsTree *chilren[26];
} StopWordsTree;
// Feature-vector tree, implemented as a prefix trie
typedef struct FeatureVectorTree
{
int cnt;
int id;
struct FeatureVectorTree *chilren[26];
} FeatureVectorTree;
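// Both tries index children by letter ('a'..'z'); cnt == 1 marks the node that ends a stored word.
// In the feature trie, id is the word's rank in the sorted frequency list and doubles as its
// column index in the weight vectors.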
// Global variable definitions
// Array of non-stop words collected from the corpus
NonStopWord nonStopWords[133333] = {0};
// Number of distinct non-stop words in the corpus
int nonStopWordsNum = 0;
// Number of original web pages
int pageNum = 0;
// Number of sample web pages
int samplePageNum = 0;
// Flag set when a form-feed (page separator) has been read
int pageFlag = 0;
// Buffer holding the current word
char word[100] = {0};
// 2-D arrays of page identifiers
// Identifiers of the original pages
char webId[16000][50] = {0};
// Identifiers of the sample pages
char sampleWebId[16000][50] = {0};
// 2-D arrays of per-page weight vectors
// Weight vectors of the original pages
int weight[16000][8000] = {0};
// Weight vectors of the sample pages
int sampleWeight[16000][8000] = {0};
// Hash value of each feature; every row of the hash file is packed into one 64-bit integer
uint64_t lowerHashValue[10000] = {0};
// Fingerprints of the original pages, each packed into one 64-bit integer
uint64_t fingerprint[20000] = {0};
// Fingerprints of the sample pages, each packed into one 64-bit integer
uint64_t sampleFingerprint[20000] = {0};
// Temporary array holding the output results
int result[4][20000] = {0};
// Number of pages at Hamming distance 0, 1, 2 and 3 respectively
int printPageNum0 = 0;
int printPageNum1 = 0;
int printPageNum2 = 0;
int printPageNum3 = 0;
// Stop-word file pointer
FILE *StopWordsFile;
// Existing web-page file pointer
FILE *WebFile;
// Sample web-page file pointer
FILE *SampleFile;
// Hash-value file pointer
FILE *HashFile;
// Result (output) file pointer
FILE *ResultFile;
// Root of the stop-word trie
StopWordsTree *StopWordsRoot = NULL;
// Root of the feature-vector trie
FeatureVectorTree *FeatureVectorRoot = NULL;
// Function declarations
// Read one word
void GetWord(FILE *file);
// Read the page identifiers
void GetWebId(FILE *file);
// Build the stop-word trie
void CreateStopWordsTree();
// Check whether the current word is a stop word
int IsStopWord();
// Count non-stop-word frequencies
void NonStopWordsCnt();
// Sort non-stop words by frequency
void NonStopWordsSort();
// Build the feature-vector trie (the first N records after sorting form the feature vector)
void CreateFeatureVectorTree(int N);
// Count the frequency of every feature (word) of the feature vector in each page (text)
// and compute the page fingerprints
void WebFingerprintCnt(int N, int M);
// Compute Hamming distances and output the results
void HammingDistanceCnt(int M);
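// Overall pipeline: read the page identifiers, count non-stop-word frequencies over the whole
// corpus, take the N most frequent words as features, weight each page by how often it uses each
// feature, fold the features' M hash bits into an M-bit fingerprint per page (SimHash-style), and
// finally compare sample and original pages by the Hamming distance of their fingerprints.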
// Main program
int main(int argc, char **argv)
{
// Expected command-line form: simtool N M
if (argc == 3)
{
// Convert the argument strings to integers to obtain N and M
int N = atoi(argv[1]);
int M = atoi(argv[2]);
// Step 0: open all required files
StopWordsFile = fopen("stopwords.txt", "r");
if (StopWordsFile == NULL)
{
printf("停用词文件打开失败!\n");
return 1;
}
WebFile = fopen("article.txt", "rb");
if (WebFile == NULL)
{
printf("已有网页文件打开失败!\n");
return 1;
}
SampleFile = fopen("sample.txt", "r");
if (SampleFile == NULL)
{
printf("样本网页文件打开失败!\n");
return 1;
}
HashFile = fopen("hashvalue.txt", "rb");
if (HashFile == NULL)
{
printf("Hash表文件打开失败!\n");
return 1;
}
ResultFile = fopen("result.txt", "wb");
if (ResultFile == NULL)
{
printf("输出结果文件打开失败!\n");
return 1;
}
// Step 1: read the identifiers of all pages
GetWebId(WebFile);
GetWebId(SampleFile);
// Step 2: build the sorted non-stop-word array (the first N records after sorting form the feature vector)
CreateStopWordsTree();
NonStopWordsCnt();
NonStopWordsSort();
/*
Step 3: count the frequency of every feature (word) of the feature vector in each page (text)
to obtain the weight vectors, and compute the page fingerprints
*/
CreateFeatureVectorTree(N);
//!!! Roughly 0.9 s elapsed up to this point (about 3 s in total) !!!
WebFingerprintCnt(N, M);
// Step 4: compute the Hamming distance of every page
HammingDistanceCnt(M);
// Step 5: close the files
fclose(StopWordsFile);
fclose(WebFile);
fclose(SampleFile);
fclose(HashFile);
fclose(ResultFile);
}
return 0;
}
// Function implementations
// Read the page identifiers
void GetWebId(FILE *file)
{
if (file == WebFile)
{
int num = 0;
int flag = 0;
// Read the first page
fgets(webId[num++], sizeof(webId[0]), file); // each ID row is only 50 bytes, so pass its real size
strtok(webId[num - 1], "\n");
strtok(webId[num - 1], "\r");
char tmp[200] = {0};
// Read the following pages, using the form-feed character as the page separator
while (fgets(tmp, 200, file) != NULL)
{
if (flag == 1)
{
snprintf(webId[num++], sizeof(webId[0]), "%s", tmp); // bounded copy; tmp is larger than an ID row
strtok(webId[num - 1], "\n");
strtok(webId[num - 1], "\r");
flag = 0;
}
if (strstr(tmp, "\f"))
{
flag = 1;
}
}
fseek(file, 0, SEEK_SET);
}
else if (file == SampleFile)
{
int num = 0;
int flag = 0;
char tmp[200] = {0};
fgets(tmp, 10, file);
memset(tmp, 0, sizeof(tmp));
fgets(sampleWebId[num++], sizeof(sampleWebId[0]), file);
strtok(sampleWebId[num - 1], "\n");
strtok(sampleWebId[num - 1], "\r");
while (fgets(tmp, 200, file) != NULL)
{
if (flag == 1)
{
snprintf(sampleWebId[num++], sizeof(sampleWebId[0]), "%s", tmp);
strtok(sampleWebId[num - 1], "\n");
strtok(sampleWebId[num - 1], "\r");
flag = 0;
}
if (strstr(tmp, "\f"))
{
flag = 1;
}
}
fseek(file, 0, SEEK_SET);
}
}
// Read one word, converting it to lower case as it is read
void GetWord(FILE *file)
{
int i = 0;
int ch; // fgetc returns int; a plain char makes the EOF comparison unreliable
while ((ch = fgetc(file)) != EOF)
{
if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
{
if (ch >= 'A' && ch <= 'Z')
{
ch = ch | 0x20;
}
word[i++] = ch;
}
else
{
if (ch == '\f')
{
pageFlag = 1;
}
if (i > 0)
{
word[i] = '\0';
return;
}
}
}
if (i > 0)
{
word[i] = '\0';
}
}
// Build the stop-word trie (prefix trie)
void CreateStopWordsTree()
{
StopWordsRoot = (StopWordsTree *)malloc(sizeof(StopWordsTree));
StopWordsRoot->cnt = 0;
for (int i = 0; i < 26; i++)
{
StopWordsRoot->chilren[i] = NULL;
}
StopWordsTree *p = StopWordsRoot;
while (fscanf(StopWordsFile, "%s", word) != EOF)
{
p = StopWordsRoot;
for (int i = 0; i < strlen(word); i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
p->chilren[index] = (StopWordsTree *)malloc(sizeof(StopWordsTree));
p->chilren[index]->cnt = 0;
for (int j = 0; j < 26; j++)
{
p->chilren[index]->chilren[j] = NULL;
}
}
p = p->chilren[index];
}
p->cnt = 1;
memset(word, 0, sizeof(word));
}
memset(word, 0, sizeof(word));
}
// Return 1 if the current word is a stop word, 0 otherwise
int IsStopWord()
{
StopWordsTree *p = StopWordsRoot;
for (int i = 0; i < strlen(word); i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
return 0;
}
p = p->chilren[index];
}
if (p->cnt == 1)
{
return 1;
}
else
{
return 0;
}
}
// Non-stop-word frequency counting, implemented with a hash table
// Hash function modelled on MurmurHash64A (simplified: it mixes the key one byte at a time)
uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
{
const uint64_t m = 0xc6a4a7935bd1e995LLU;
const int r = 47;
uint64_t h = seed ^ (len * m);
const unsigned char *data = (const unsigned char *)key;
for (int i = 0; i < len; i++)
{
h = (h * m) ^ data[i];
h = (h ^ (h >> r)) * m;
}
return h;
}
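// The table uses open addressing with linear probing: a word hashes to a start slot via
// MurmurHash64A modulo the table size, then walks forward until it finds either its own entry
// (increment the count) or an empty slot (insert a new record).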
void NonStopWordsCnt()
{
NonStopWordHashTable *hashTable = (NonStopWordHashTable *)malloc(sizeof(NonStopWordHashTable));
hashTable->tableSize = 133333;
hashTable->table = nonStopWords;
int len = 0;
GetWord(WebFile);
len = strlen(word);
while (len > 0)
{
if (IsStopWord() == 0)
{
uint64_t hash = 0;
hash = MurmurHash64A(word, len, 0) % hashTable->tableSize;
while (hashTable->table[hash].status == 1 && strcmp(hashTable->table[hash].word, word) != 0)
{
hash = (hash + 1) % hashTable->tableSize;
}
if (hashTable->table[hash].status == 0)
{
strcpy(hashTable->table[hash].word, word);
hashTable->table[hash].count = 1;
hashTable->table[hash].status = 1;
nonStopWordsNum++;
}
else if (hashTable->table[hash].status == 1)
{
hashTable->table[hash].count++;
}
}
memset(word, 0, sizeof(word));
GetWord(WebFile);
len = strlen(word);
}
memset(word, 0, sizeof(word));
}
// Sort non-stop words by descending frequency; ties are broken by ascending lexicographic order
// Comparison function for qsort
int cmp(const void *a, const void *b)
{
NonStopWord *c = (NonStopWord *)a;
NonStopWord *d = (NonStopWord *)b;
if (c->count != d->count)
{
return d->count - c->count;
}
else
{
return strcmp(c->word, d->word);
}
}
// Sorting wrapper: qsort over the whole table (empty slots have count 0 and sink to the end)
void NonStopWordsSort()
{
qsort(nonStopWords, 133333, sizeof(NonStopWord), cmp);
}
// Build the feature-vector trie (prefix trie)
void CreateFeatureVectorTree(int N)
{
FeatureVectorRoot = (FeatureVectorTree *)malloc(sizeof(FeatureVectorTree));
FeatureVectorRoot->cnt = 0;
FeatureVectorRoot->id = 0;
for (int i = 0; i < 26; i++)
{
FeatureVectorRoot->chilren[i] = NULL;
}
FeatureVectorTree *p = FeatureVectorRoot;
for (int i = 0; i < N; i++)
{
p = FeatureVectorRoot;
for (int j = 0; j < strlen(nonStopWords[i].word); j++)
{
int index = nonStopWords[i].word[j] - 'a';
if (p->chilren[index] == NULL)
{
p->chilren[index] = (FeatureVectorTree *)malloc(sizeof(FeatureVectorTree));
p->chilren[index]->cnt = 0;
p->chilren[index]->id = 0; // initialize id so intermediate nodes never carry garbage
for (int k = 0; k < 26; k++)
{
p->chilren[index]->chilren[k] = NULL;
}
}
p = p->chilren[index];
}
p->cnt = 1;
p->id = i;
}
}
// The following two steps are performed together:
// count the frequency of every feature (word) of the feature vector in each page (text) to obtain its weight vector,
// and compute the page fingerprint
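// Fingerprint scheme (SimHash-style voting): for every feature occurrence, finger[i] is
// incremented if bit i of that feature's hash is 1 and decremented otherwise; when a page ends,
// bit i of the page fingerprint is set iff finger[i] > 0.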
void WebFingerprintCnt(int N, int M)
{
int i = 0;
int j = 0;
ll finger[64] = {0};
char tmp[600] = {0};
char c = 0;
// Read N rows of M hash bits ('0'/'1' characters); pack each row into lowerHashValue as a binary number
for (i = 0; i < N; i++)
{
fread(tmp, sizeof(char), M, HashFile);
if (tmp[M - 1] != '\n')
{
fread(&c, sizeof(char), 1, HashFile);
while (c != '\n')
{
fread(&c, sizeof(char), 1, HashFile);
}
}
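// Character j of the row maps to bit (M - j - 1), so the first character becomes the most
// significant of the M bits that are used.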
for (j = 0; j < M; j++)
{
if (tmp[j] == '1')
{
lowerHashValue[i] += (1ULL << (M - j - 1));
}
}
}
// Process the original pages
fseek(WebFile, 0, SEEK_SET);
pageNum = 1;
while (!feof(WebFile))
{
pageFlag = 0;
GetWord(WebFile);
int len = strlen(word);
if (pageFlag == 1)
{
// A form feed marks a page boundary: derive the fingerprint of the page just finished
for (i = 0; i < M; i++)
{
if (finger[i] > 0)
{
fingerprint[pageNum - 1] += (1ULL << (M - i - 1));
}
}
memset(finger, 0, sizeof(finger));
pageNum++;
pageFlag = 0;
}
if (len > 0)
{
FeatureVectorTree *p = FeatureVectorRoot;
int len1 = 0;
for (i = 0; i < len; i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
break;
}
p = p->chilren[index];
len1++;
}
if (p->cnt == 1 && len1 == len)
{
weight[pageNum - 1][p->id]++;
// Every time a weight changes, adjust all M finger counters
for (i = 0; i < M; i++)
{
if ((lowerHashValue[p->id] & (1ULL << (M - i - 1))) != 0)
{
finger[i]++;
}
else
{
finger[i]--;
}
}
}
}
memset(word, 0, sizeof(word));
}
// Process the sample pages
fseek(SampleFile, 0, SEEK_SET);
memset(finger, 0, sizeof(finger));
samplePageNum = 1;
while (!feof(SampleFile))
{
pageFlag = 0;
GetWord(SampleFile);
int len = strlen(word);
if (pageFlag == 1)
{
for (i = 0; i < M; i++)
{
if (finger[i] > 0)
{
sampleFingerprint[samplePageNum - 1] += (1ULL << (M - i - 1));
}
}
memset(finger, 0, sizeof(finger));
if (!feof(SampleFile))
{
samplePageNum++;
}
pageFlag = 0;
}
if (strlen(word) > 0 && strcmp(word, "sample") != 0)
{
FeatureVectorTree *p = FeatureVectorRoot;
int len1 = 0;
for (i = 0; i < len; i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
break;
}
p = p->chilren[index];
len1++;
}
if (p->cnt == 1 && len1 == len)
{
sampleWeight[samplePageNum - 1][p->id]++;
for (i = 0; i < M; i++)
{
if ((lowerHashValue[p->id] & (1ULL << (M - i - 1))) != 0)
{
finger[i]++;
}
else
{
finger[i]--;
}
}
}
}
memset(word, 0, sizeof(word));
}
}
// Compute the Hamming distance of every sample/original page pair and output the results, using XOR of the binary fingerprints
void HammingDistanceCnt(int M)
{
int i = 0;
int j = 0;
int distance = 0;
for (i = 0; i < samplePageNum; i++)
{
for (j = 0; j < pageNum; j++)
{
uint64_t tmp = sampleFingerprint[i] ^ fingerprint[j];
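// Count the set bits of the XOR with Kernighan's trick: each tmp &= tmp - 1 clears the lowest
// set bit, so the loop body runs once per differing bit.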
while (tmp != 0)
{
distance++;
tmp = tmp & (tmp - 1);
}
if (distance == 3)
{
result[3][printPageNum3++] = j;
}
else if (distance == 2)
{
result[2][printPageNum2++] = j;
}
else if (distance == 1)
{
result[1][printPageNum1++] = j;
}
else if (distance == 0)
{
result[0][printPageNum0++] = j;
}
distance = 0;
}
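// For the first sample page (i == 0) the distance groups are echoed to stdout in addition to
// being written to result.txt; later sample pages are written to the file only.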
if (i == 0)
{
printf("%s\n", sampleWebId[i]);
int k = 0;
if (printPageNum0 > 0)
{
printf("0:");
for (k = 0; k < printPageNum0; k++)
{
printf("%s ", webId[result[0][k]]);
}
printf("\n");
}
if (printPageNum1 > 0)
{
printf("1:");
for (k = 0; k < printPageNum1; k++)
{
printf("%s ", webId[result[1][k]]);
}
printf("\n");
}
if (printPageNum2 > 0)
{
printf("2:");
for (k = 0; k < printPageNum2; k++)
{
printf("%s ", webId[result[2][k]]);
}
printf("\n");
}
if (printPageNum3 > 0)
{
printf("3:");
for (k = 0; k < printPageNum3; k++)
{
printf("%s ", webId[result[3][k]]);
}
printf("\n");
}
fwrite(sampleWebId[i], sizeof(char), strlen(sampleWebId[i]), ResultFile);
fwrite("\n", sizeof(char), 1, ResultFile);
if (printPageNum0 > 0)
{
fwrite("0:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum0; k++)
{
fwrite(webId[result[0][k]], sizeof(char), strlen(webId[result[0][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
if (printPageNum1 > 0)
{
fwrite("1:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum1; k++)
{
fwrite(webId[result[1][k]], sizeof(char), strlen(webId[result[1][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
if (printPageNum2 > 0)
{
fwrite("2:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum2; k++)
{
fwrite(webId[result[2][k]], sizeof(char), strlen(webId[result[2][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
if (printPageNum3 > 0)
{
fwrite("3:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum3; k++)
{
fwrite(webId[result[3][k]], sizeof(char), strlen(webId[result[3][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
printPageNum0 = 0;
printPageNum1 = 0;
printPageNum2 = 0;
printPageNum3 = 0;
}
else
{
fwrite(sampleWebId[i], sizeof(char), strlen(sampleWebId[i]), ResultFile);
fwrite("\n", sizeof(char), 1, ResultFile);
int k = 0;
if (printPageNum0 > 0)
{
fwrite("0:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum0; k++)
{
fwrite(webId[result[0][k]], sizeof(char), strlen(webId[result[0][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
if (printPageNum1 > 0)
{
fwrite("1:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum1; k++)
{
fwrite(webId[result[1][k]], sizeof(char), strlen(webId[result[1][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
if (printPageNum2 > 0)
{
fwrite("2:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum2; k++)
{
fwrite(webId[result[2][k]], sizeof(char), strlen(webId[result[2][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
if (printPageNum3 > 0)
{
fwrite("3:", sizeof(char), 2, ResultFile);
for (k = 0; k < printPageNum3; k++)
{
fwrite(webId[result[3][k]], sizeof(char), strlen(webId[result[3][k]]), ResultFile);
fwrite(" ", sizeof(char), 1, ResultFile);
}
fwrite("\n", sizeof(char), 1, ResultFile);
}
printPageNum0 = 0;
printPageNum1 = 0;
printPageNum2 = 0;
printPageNum3 = 0;
}
}
}