代码拉取完成,页面将自动刷新
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// 结构定义
// 非停用词单词信息体
typedef struct NonStopWord
{
char word[50];
int count;
} NonStopWord;
// 停用词树,用前缀树实现
typedef struct StopWordsTree
{
int cnt;
struct StopWordsTree *chilren[26];
} StopWordsTree;
// 特征向量树,用前缀树实现
typedef struct FeatureVectorTree
{
int cnt;
int id;
struct FeatureVectorTree *chilren[26];
} FeatureVectorTree;
// 变量定义
// 数据库的非停用词单词数组
NonStopWord nonStopWords[80000] = {0};
// 数据库的非停用词数量
int nonStopWordsNum = 0;
// 原网页页数
int pageNum = 0;
// 样本网页页数
int samplePageNum = 0;
// 读到换页符的标记
int pageFlag = 0;
// 单词数组
char word[100] = {0};
// 网页标识信息的二维数组
// 原网页标识信息
char webId[8000][10] = {0};
// 样本网页标识信息
char sampleWebId[8000][10] = {0};
// 各网页权重向量构成的二维数组
// 原网页权重向量
int weight[8000][10000] = {0};
// 样本网页权重向量
int sampleWeight[8000][10000] = {0};
// 各网页指纹构成的二维数组
// 原网页指纹
int fingerprint[8000][130] = {0};
// 样本网页指纹
int sampleFingerprint[8000][130] = {0};
// 样本网页对原网页的汉明距离构成的二维数组
// 横坐标是样本网页编号数(第1个网页编号0,以此类推),纵坐标是原网页编号数
int hammingDistance[8000][8000] = {0};
// 临时储存输出结果
int tempResult[5][8000] = {0};
// 临时储存输出结果汉明距离为0的原网页数量
int tempResult0Num = 0;
// 临时储存输出结果汉明距离为1的原网页数量
int tempResult1Num = 0;
// 临时储存输出结果汉明距离为2的原网页数量
int tempResult2Num = 0;
// 临时储存输出结果汉明距离为3的原网页数量
int tempResult3Num = 0;
// 停用词文件指针
FILE *StopWordsFile;
// 已有网页文件指针
FILE *WebFile;
// 样本网页文件指针
FILE *SampleFile;
// Hash表文件指针
FILE *HashFile;
// 输出结果文件指针
FILE *ResultFile;
// 停用词树根节点
StopWordsTree *StopWordsRoot = NULL;
// 特征向量树根节点
FeatureVectorTree *FeatureVectorRoot = NULL;
// 功能函数声明
// 读取一个单词
void GetWord(FILE *file);
// 读取网页标识信息
void GetWebId(FILE *file);
// 创建停用词树
void CreateStopWordsTree();
// 判断是否是停用词
int IsStopWord();
// 非停用词词频统计
void NonStopWordsCnt();
// 非停用词词频排序
void NonStopWordsSort();
// 创建特征向量树(排序后前N个信息体就是特征向量)
void CreateFeatureVectorTree(int N);
// 统计每个网页(文本)的特征向量中每个特征(单词)的频度
void WebFeatureVectorCnt(FILE *file);
// 计算网页指纹
void WebFingerprintCnt(int N, int M);
// 计算汉明距离
void HammingDistanceCnt(int M);
// 输出结果
void OutputResult();
// 主程序实现
int main(int argc, char **argv)
{
// 命令行输入形式应该是:simtool N M
if (argc == 3)
{
// 字符串转成整数,得到N、M的值
int N = atoi(argv[1]);
int M = atoi(argv[2]);
// 步骤0:打开所有需要的文件
StopWordsFile = fopen("stopwords.txt", "r");
if (StopWordsFile == NULL)
{
printf("停用词文件打开失败!\n");
return 1;
}
WebFile = fopen("article.txt", "r");
if (WebFile == NULL)
{
printf("已有网页文件打开失败!\n");
return 1;
}
SampleFile = fopen("sample.txt", "r");
if (SampleFile == NULL)
{
printf("样本网页文件打开失败!\n");
return 1;
}
HashFile = fopen("hashvalue.txt", "r");
if (HashFile == NULL)
{
printf("Hash表文件打开失败!\n");
return 1;
}
ResultFile = fopen("result.txt", "w");
if (ResultFile == NULL)
{
printf("输出结果文件打开失败!\n");
return 1;
}
// 步骤1:获取各网页标识信息
GetWebId(WebFile);
GetWebId(SampleFile);
// 步骤2:得到排序后的非停用词单词数组(排序后前N个信息体就是特征向量)
CreateStopWordsTree();
NonStopWordsCnt();
NonStopWordsSort();
// 步骤3:统计每个网页(文本)的特征向量中每个特征(单词)的频度,得到权重向量
CreateFeatureVectorTree(N);
WebFeatureVectorCnt(WebFile);
WebFeatureVectorCnt(SampleFile);
// 步骤4:计算各网页的指纹
WebFingerprintCnt(N, M);
// 步骤5:计算各网页的汉明距离
HammingDistanceCnt(M);
// 步骤6:按要求输出结果
OutputResult();
// 步骤7:关闭文件
fclose(StopWordsFile);
fclose(WebFile);
fclose(SampleFile);
fclose(HashFile);
fclose(ResultFile);
}
return 0;
}
// 功能函数实现
// 读取网页标识信息
void GetWebId(FILE *file)
{
if (file == WebFile)
{
int num = 0;
int flag = 0;
// 读取第一个网页
fgets(webId[num++], 200, file);
strtok(webId[num - 1], "\n");
strtok(webId[num - 1], "\r");
char tmp[200] = {0};
// 以换页符为标志读取后续的网页
while (fgets(tmp, 200, file) != NULL)
{
if (flag == 1)
{
strcpy(webId[num++], tmp);
strtok(webId[num - 1], "\n");
strtok(webId[num - 1], "\r");
flag = 0;
}
if (strstr(tmp, "\f"))
{
flag = 1;
}
}
fseek(file, 0, SEEK_SET);
}
else if (file == SampleFile)
{
int num = 0;
int flag = 0;
char tmp[200] = {0};
fgets(tmp, 10, file);
memset(tmp, 0, sizeof(tmp));
fgets(sampleWebId[num++], 200, file);
strtok(sampleWebId[num - 1], "\n");
strtok(sampleWebId[num - 1], "\r");
while (fgets(tmp, 200, file) != NULL)
{
if (flag == 1)
{
strcpy(sampleWebId[num++], tmp);
strtok(sampleWebId[num - 1], "\n");
strtok(sampleWebId[num - 1], "\r");
flag = 0;
}
if (strstr(tmp, "\f"))
{
flag = 1;
}
}
fseek(file, 0, SEEK_SET);
}
}
// 读取一个单词,同时要把单词转换成小写
void GetWord(FILE *file)
{
int i = 0;
char ch;
while ((ch = fgetc(file)) != EOF)
{
if (isalpha(ch))
{
word[i++] = tolower(ch);
}
else
{
if (ch == '\f')
{
pageFlag = 1;
}
if (i > 0)
{
word[i] = '\0';
return;
}
}
}
if (i > 0)
{
word[i] = '\0';
}
}
// 创建停用词树,用前缀树实现
void CreateStopWordsTree()
{
StopWordsRoot = (StopWordsTree *)malloc(sizeof(StopWordsTree));
StopWordsRoot->cnt = 0;
for (int i = 0; i < 26; i++)
{
StopWordsRoot->chilren[i] = NULL;
}
StopWordsTree *p = StopWordsRoot;
while (fscanf(StopWordsFile, "%s", word) != EOF)
{
p = StopWordsRoot;
for (int i = 0; i < strlen(word); i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
p->chilren[index] = (StopWordsTree *)malloc(sizeof(StopWordsTree));
p->chilren[index]->cnt = 0;
for (int j = 0; j < 26; j++)
{
p->chilren[index]->chilren[j] = NULL;
}
}
p = p->chilren[index];
}
p->cnt = 1;
memset(word, 0, sizeof(word));
}
memset(word, 0, sizeof(word));
}
// 判断是否是停用词,是返回1,否返回0
int IsStopWord()
{
StopWordsTree *p = StopWordsRoot;
for (int i = 0; i < strlen(word); i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
return 0;
}
p = p->chilren[index];
}
if (p->cnt == 1)
{
return 1;
}
else
{
return 0;
}
}
// 非停用词词频统计
void NonStopWordsCnt()
{
GetWord(WebFile);
while (strlen(word) > 0)
{
if (IsStopWord() == 0)
{
int i;
for (i = 0; i < nonStopWordsNum; i++)
{
if (strcmp(nonStopWords[i].word, word) == 0)
{
nonStopWords[i].count++;
break;
}
}
if (i == nonStopWordsNum)
{
strcpy(nonStopWords[nonStopWordsNum].word, word);
nonStopWords[nonStopWordsNum].count = 1;
nonStopWordsNum++;
}
}
memset(word, 0, sizeof(word));
GetWord(WebFile);
}
memset(word, 0, sizeof(word));
}
// 非停用词词频排序,降序排列,词频相同的按字典序升序排列
// 这是qsort函数的比较函数
int cmp(const void *a, const void *b)
{
NonStopWord *c = (NonStopWord *)a;
NonStopWord *d = (NonStopWord *)b;
if (c->count != d->count)
{
return d->count - c->count;
}
else
{
return strcmp(c->word, d->word);
}
}
// 这是快速排序函数
void NonStopWordsSort()
{
int i, j;
qsort(nonStopWords, nonStopWordsNum, sizeof(NonStopWord), cmp);
}
// 创建特征向量树,用前缀树实现
void CreateFeatureVectorTree(int N)
{
FeatureVectorRoot = (FeatureVectorTree *)malloc(sizeof(FeatureVectorTree));
FeatureVectorRoot->cnt = 0;
FeatureVectorRoot->id = 0;
for (int i = 0; i < 26; i++)
{
FeatureVectorRoot->chilren[i] = NULL;
}
FeatureVectorTree *p = FeatureVectorRoot;
for (int i = 0; i < N; i++)
{
p = FeatureVectorRoot;
for (int j = 0; j < strlen(nonStopWords[i].word); j++)
{
int index = nonStopWords[i].word[j] - 'a';
if (p->chilren[index] == NULL)
{
p->chilren[index] = (FeatureVectorTree *)malloc(sizeof(FeatureVectorTree));
p->chilren[index]->cnt = 0;
for (int k = 0; k < 26; k++)
{
p->chilren[index]->chilren[k] = NULL;
}
}
p = p->chilren[index];
}
p->cnt = 1;
p->id = i;
}
}
// 统计每个网页(文本)的特征向量中每个特征(单词)的频度,得到权重向量
void WebFeatureVectorCnt(FILE *file)
{
if (file == WebFile)
{
fseek(file, 0, SEEK_SET);
pageNum = 1;
while (!feof(file))
{
pageFlag = 0;
GetWord(file);
if (pageFlag == 1)
{
pageNum++;
pageFlag = 0;
}
if (strlen(word) > 0)
{
FeatureVectorTree *p = FeatureVectorRoot;
int len = 0;
for (int i = 0; i < strlen(word); i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
break;
}
p = p->chilren[index];
len++;
}
if (p->cnt == 1 && len == strlen(word))
{
weight[pageNum - 1][p->id]++;
}
}
memset(word, 0, sizeof(word));
}
}
else if (file == SampleFile)
{
fseek(file, 0, SEEK_SET);
samplePageNum = 1;
while (!feof(file))
{
pageFlag = 0;
GetWord(file);
if (pageFlag == 1)
{
if (!feof(file))
{
samplePageNum++;
}
pageFlag = 0;
}
if (strlen(word) > 0 && strcmp(word, "sample") != 0)
{
FeatureVectorTree *p = FeatureVectorRoot;
int len = 0;
for (int i = 0; i < strlen(word); i++)
{
int index = word[i] - 'a';
if (p->chilren[index] == NULL)
{
break;
}
p = p->chilren[index];
len++;
}
if (p->cnt == 1 && len == strlen(word))
{
sampleWeight[samplePageNum - 1][p->id]++;
}
}
memset(word, 0, sizeof(word));
}
}
}
// 计算网页指纹
void WebFingerprintCnt(int N, int M)
{
int i = 0;
int j = 0;
int k = 0;
int finger = 0;
char tempHash[500] = {0};
for (i = 0; i < pageNum; i++)
{
for (j = 0; j < N; j++)
{
fgets(tempHash, 500, HashFile);
for (k = 0; k < M; k++)
{
if (tempHash[k] == '1')
{
fingerprint[i][k] += weight[i][j];
}
else if (tempHash[k] == '0')
{
fingerprint[i][k] -= weight[i][j];
}
}
memset(tempHash, 0, sizeof(tempHash));
}
fseek(HashFile, 0, SEEK_SET);
}
for (i = 0; i < pageNum; i++)
{
for (j = 0; j < M; j++)
{
if (fingerprint[i][j] > 0)
{
fingerprint[i][j] = 1;
}
else
{
fingerprint[i][j] = 0;
}
}
}
fseek(HashFile, 0, SEEK_SET);
for (i = 0; i < samplePageNum; i++)
{
for (j = 0; j < N; j++)
{
fgets(tempHash, 500, HashFile);
for (k = 0; k < M; k++)
{
if (tempHash[k] == '1')
{
sampleFingerprint[i][k] += sampleWeight[i][j];
}
else if (tempHash[k] == '0')
{
sampleFingerprint[i][k] -= sampleWeight[i][j];
}
}
memset(tempHash, 0, sizeof(tempHash));
}
fseek(HashFile, 0, SEEK_SET);
}
for (i = 0; i < samplePageNum; i++)
{
for (j = 0; j < M; j++)
{
if (sampleFingerprint[i][j] > 0)
{
sampleFingerprint[i][j] = 1;
}
else
{
sampleFingerprint[i][j] = 0;
}
}
}
}
// 计算各网页的汉明距离
void HammingDistanceCnt(int M)
{
int i = 0;
int j = 0;
int k = 0;
int distance = 0;
for (i = 0; i < samplePageNum; i++)
{
for (j = 0; j < pageNum; j++)
{
distance = 0;
for (k = 0; k < M; k++)
{
if (sampleFingerprint[i][k] != fingerprint[j][k])
{
distance++;
}
}
hammingDistance[i][j] = distance;
}
}
}
// 按要求输出结果
void OutputResult()
{
int i = 0;
int j = 0;
for (i = 0; i < samplePageNum; i++)
{
if (i == 0)
{
printf("%s\n", sampleWebId[i]);
}
fprintf(ResultFile, "%s\n", sampleWebId[i]);
for (j = 0; j < pageNum; j++)
{
if (hammingDistance[i][j] == 0)
{
tempResult[0][tempResult0Num++] = j + 1;
}
else if (hammingDistance[i][j] == 1)
{
tempResult[1][tempResult1Num++] = j + 1;
}
else if (hammingDistance[i][j] == 2)
{
tempResult[2][tempResult2Num++] = j + 1;
}
else if (hammingDistance[i][j] == 3)
{
tempResult[3][tempResult3Num++] = j + 1;
}
}
if (tempResult0Num > 0)
{
if (i == 0)
{
printf("0:");
}
fprintf(ResultFile, "0:");
for (j = 0; j < tempResult0Num; j++)
{
if (i == 0)
{
printf("%s ", webId[tempResult[0][j] - 1]);
}
fprintf(ResultFile, "%s ", webId[tempResult[0][j] - 1]);
}
if (i == 0)
{
printf("\n");
}
fprintf(ResultFile, "\n");
}
if (tempResult1Num > 0)
{
if (i == 0)
{
printf("1:");
}
fprintf(ResultFile, "1:");
for (j = 0; j < tempResult1Num; j++)
{
if (i == 0)
{
printf("%s ", webId[tempResult[1][j] - 1]);
}
fprintf(ResultFile, "%s ", webId[tempResult[1][j] - 1]);
}
if (i == 0)
{
printf("\n");
}
fprintf(ResultFile, "\n");
}
if (tempResult2Num > 0)
{
if (i == 0)
{
printf("2:");
}
fprintf(ResultFile, "2:");
for (j = 0; j < tempResult2Num; j++)
{
if (i == 0)
{
printf("%s ", webId[tempResult[2][j] - 1]);
}
fprintf(ResultFile, "%s ", webId[tempResult[2][j] - 1]);
}
if (i == 0)
{
printf("\n");
}
fprintf(ResultFile, "\n");
}
if (tempResult3Num > 0)
{
if (i == 0)
{
printf("3:");
}
fprintf(ResultFile, "3:");
for (j = 0; j < tempResult3Num; j++)
{
if (i == 0)
{
printf("%s ", webId[tempResult[3][j] - 1]);
}
fprintf(ResultFile, "%s ", webId[tempResult[3][j] - 1]);
}
if (i == 0)
{
printf("\n");
}
fprintf(ResultFile, "\n");
}
memset(tempResult, 0, sizeof(tempResult));
tempResult0Num = 0;
tempResult1Num = 0;
tempResult2Num = 0;
tempResult3Num = 0;
}
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。