5 Star 7 Fork 0

zjzdy / Offline_Small_Search_pkg_build

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
build.cpp 33.02 KB
一键复制 编辑 原始数据 按行查看 历史
zjzdy 提交于 2016-07-28 17:28 . v0.8
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054
#include "build.h"
/* libmagic handle used by getMimeTypeForFile(); opened in build::build()
   on non-Windows builds only. */
#ifndef __WIN32
magic_t magic;
#endif
/* Number of non-redirect articles seen per MIME type; filled by
   ArticleSource::getNextArticle() and serialized for the "/M/Counter" entry. */
std::map<std::string, unsigned int> counters;
/* Cache of MIME types detected by libmagic, keyed by file name. */
std::map<std::string, std::string> fileMimeTypes;
/* File extension -> MIME type table, populated in build::build(). */
std::map<std::string, std::string> extMimeTypes;
/* Buffer (and its size) backing the blob most recently returned by
   ArticleSource::getData(). */
char *data = NULL;
unsigned int dataSize = 0;
/* Names of the metadata entries still to be emitted as articles. */
std::queue<std::string> metadataQueue;
/* Input directory, stored without a trailing separator by the
   ArticleSource constructor. */
std::string directoryPath = "";
/* Values served for the "/M/..." metadata entries in getData(). */
std::string language = "zh_CN";
std::string creator = "oss_pkg_build";
std::string publisher = "";
std::string title = "";
std::string description = "";
/* Returned by ArticleSource::getMainPage(). */
std::string welcome = "";
/* Output path with everything up to the last separator removed; stored in
   value slot 1 of every indexed document. */
std::string code_name;
/* Number of documents added to the Xapian database during the current build. */
unsigned long long count_index = 0;
/* Xapian database being written, the shared term generator, and an English
   stemmer used by add_to_index(). */
Xapian::WritableDatabase db;
Xapian::TermGenerator termgen;
Xapian::Stem enstem("en");
/* Article that stands for one metadata entry (e.g. "Title", "Date").
   The identifier is the metadata name prefixed with "/M/", the URL is the
   bare name, and the content is declared as plain text. */
class MetadataArticle : public Article {
public:
    MetadataArticle(std::string &id) {
        url = id;
        ns = 'M';
        mimeType = "text/plain";
        aid = std::string("/M/") + id;
    }
};
/*
#include <str.h>
static string format_doc_termlist(const Xapian::Document & doc)
{
string output;
Xapian::TermIterator it;
for (it = doc.termlist_begin(); it != doc.termlist_end(); ++it) {
if (!output.empty()) output += ' ';
output += *it;
if (it.positionlist_count() != 0) {
// If we've got a position list, only display the wdf if it's not
// the length of the positionlist.
if (it.get_wdf() != it.positionlist_count()) {
output += ':';
output += str(it.get_wdf());
}
char ch = '[';
Xapian::PositionIterator posit;
for (posit = it.positionlist_begin(); posit != it.positionlist_end(); posit++) {
output += ch;
ch = ',';
output += str(*posit);
}
output += ']';
} else if (it.get_wdf() != 0) {
// If no position list, display any non-zero wdfs.
output += ':';
output += str(it.get_wdf());
}
}
return output;
}
*/
/* Decompress an STL string using zlib and return the original data. */
inline std::string inflateString(const std::string& str) {
z_stream zs; // z_stream is zlib's control structure
memset(&zs, 0, sizeof(zs));
if (inflateInit(&zs) != Z_OK)
throw(std::runtime_error("inflateInit failed while decompressing."));
zs.next_in = (Bytef*)str.data();
zs.avail_in = str.size();
int ret;
char outbuffer[32768];
std::string outstring;
// get the decompressed bytes blockwise using repeated calls to inflate
do {
zs.next_out = reinterpret_cast<Bytef*>(outbuffer);
zs.avail_out = sizeof(outbuffer);
ret = inflate(&zs, 0);
if (outstring.size() < zs.total_out) {
outstring.append(outbuffer,
zs.total_out - outstring.size());
}
} while (ret == Z_OK);
inflateEnd(&zs);
if (ret != Z_STREAM_END) { // an error occurred that was not EOF
std::ostringstream oss;
oss << "Exception during zlib decompression: (" << ret << ") "
<< zs.msg;
throw(std::runtime_error(oss.str()));
}
return outstring;
}
inline bool seemsToBeHtml(const std::string &path) {
if (path.find_last_of(".") != std::string::npos) {
std::string mimeType = path.substr(path.find_last_of(".")+1);
if (extMimeTypes.find(mimeType) != extMimeTypes.end()) {
return "text/html" == extMimeTypes[mimeType];
}
}
return false;
}
inline std::string getFileContent(const std::string &path) {
try
{
std::ifstream in(path.c_str(), ::std::ios::binary);
if (in) {
std::string contents;
in.seekg(0, std::ios::end);
contents.resize(in.tellg());
in.seekg(0, std::ios::beg);
in.read(&contents[0], contents.size());
in.close();
return(contents);
}
std::cerr << "读取文件: 无法打开文件: " << path << std::endl;
throw(errno);
}
catch(...)
{
QFile file(QString::fromLocal8Bit(QByteArray::fromStdString(path)));
file.open(QFile::ReadOnly);
QByteArray byte = file.readAll();
file.close();
return byte.toStdString();
}
}
/* Reports whether `path` can be opened for reading. */
inline bool fileExists(const std::string &path) {
    std::fstream probe(path.c_str(), std::ios::in);
    const bool opened = probe.is_open();
    probe.close();
    return opened;
}
/* base64 */
/* Standard base64 alphabet, indexed by 6-bit value. */
static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

/* Encodes `in_len` bytes starting at `bytes_to_encode` as base64 text,
   padding the final group with '=' so the output length is a multiple of 4. */
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
    std::string encoded;
    unsigned int offset = 0;

    /* Full 3-byte groups map to exactly four output characters. */
    for (; in_len - offset >= 3; offset += 3) {
        const unsigned char first = bytes_to_encode[offset];
        const unsigned char second = bytes_to_encode[offset + 1];
        const unsigned char third = bytes_to_encode[offset + 2];
        encoded += base64_chars[first >> 2];
        encoded += base64_chars[((first & 0x03) << 4) | (second >> 4)];
        encoded += base64_chars[((second & 0x0f) << 2) | (third >> 6)];
        encoded += base64_chars[third & 0x3f];
    }

    /* One or two trailing bytes: zero-fill the group, emit only the
       characters that carry data, then pad. */
    const unsigned int leftover = in_len - offset;
    if (leftover != 0) {
        const unsigned char first = bytes_to_encode[offset];
        const unsigned char second = (leftover == 2) ? bytes_to_encode[offset + 1] : 0;
        const unsigned char sextets[3] = {
            static_cast<unsigned char>(first >> 2),
            static_cast<unsigned char>(((first & 0x03) << 4) | (second >> 4)),
            static_cast<unsigned char>((second & 0x0f) << 2)
        };
        for (unsigned int k = 0; k <= leftover; ++k) {
            encoded += base64_chars[sextets[k]];
        }
        encoded.append(3 - leftover, '=');
    }
    return encoded;
}
/* Value of one hexadecimal digit, or -1 if `c` is not a hex digit. */
static inline int hexDigitValue(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

/* Decodes percent-escapes ("%41" -> 'A') in a URL.
 *
 * Each "%XY" with two hex digits is replaced by the byte it encodes. Decoded
 * bytes are not re-scanned, so "%2541" becomes "%41". A '%' that is not
 * followed by two hex digits is kept literally.
 *
 * The previous implementation parsed with sscanf("%x") into a `char` through
 * an `unsigned int*`, which wrote past the variable, and it substituted an
 * uninitialized byte when the escape was malformed.
 */
inline std::string decodeUrl(const std::string &encodedUrl) {
    std::string decodedUrl = encodedUrl;
    std::string::size_type pos = 0;
    while ((pos = decodedUrl.find('%', pos)) != std::string::npos &&
           pos + 2 < decodedUrl.length()) {
        const int high = hexDigitValue(decodedUrl[pos + 1]);
        const int low = hexDigitValue(decodedUrl[pos + 2]);
        if (high >= 0 && low >= 0) {
            decodedUrl.replace(pos, 3, 1, static_cast<char>((high << 4) | low));
        }
        /* Step past the decoded byte (or the literal '%'). */
        ++pos;
    }
    return decodedUrl;
}
/* Cuts `path` at its last SEPARATOR character.
   removePreSeparator:  if the path ends with a separator, drop that separator
                        first and cut at the one before it.
   removePostSeparator: when true, the separator at the cut point is removed
                        as well; when false it is kept. */
inline std::string removeLastPathElement(const std::string path, const bool removePreSeparator, const bool removePostSeparator) {
    std::string trimmed = path;
    size_t cutAt = trimmed.find_last_of(SEPARATOR);
    const bool endsWithSeparator = (cutAt == trimmed.length() - 1);
    if (removePreSeparator && endsWithSeparator) {
        trimmed = trimmed.substr(0, cutAt);
        cutAt = trimmed.find_last_of(SEPARATOR);
    }
    const size_t keepLength = removePostSeparator ? cutAt : cutAt + 1;
    return trimmed.substr(0, keepLength);
}
/* Split a string into tokens.
   Every character of `delims` is a separator; runs of separators are
   collapsed, so no empty token is ever produced. */
std::vector<std::string> split(const std::string & str,
                               const std::string & delims=" *-")
{
    std::vector<std::string> tokens;
    std::string::size_type tokenStart = str.find_first_not_of(delims, 0);
    while (tokenStart != std::string::npos)
    {
        const std::string::size_type tokenEnd = str.find_first_of(delims, tokenStart);
        tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
        if (tokenEnd == std::string::npos) {
            break;
        }
        tokenStart = str.find_first_not_of(delims, tokenEnd);
    }
    return tokens;
}

/* Convenience overloads accepting C strings for either argument. */
std::vector<std::string> split(const char* lhs, const char* rhs){
    return split(std::string(lhs), std::string(rhs));
}

std::vector<std::string> split(const char* lhs, const std::string& rhs){
    const char* delimiters = rhs.c_str();
    return split(lhs, delimiters);
}

std::vector<std::string> split(const std::string& lhs, const char* rhs){
    const char* text = lhs.c_str();
    return split(text, rhs);
}
/* Warning: the relative path must be with slashes */
/* Resolves `relativePath` against the location of `path`.
 *
 * If `path` ends with '/', it is used as the base directory; otherwise its
 * last element is removed first. Each ".." in `relativePath` removes one
 * element from the base, "." and empty elements are ignored, and any other
 * element is appended. The result has no trailing '/'.
 *
 * An empty `path` is handled explicitly: the previous code indexed
 * path[path.length()-1], which is out of range for an empty string.
 */
inline std::string computeAbsolutePath(const std::string path, const std::string relativePath) {
    /* Start from the directory part of the path, keeping its trailing separator */
    std::string absolutePath;
    if (!path.empty() && path[path.length()-1] == '/') {
        absolutePath = path;
    } else {
        absolutePath = removeLastPathElement(path, false, false);
    }

    /* Go through relative path */
    std::stringstream relativePathStream(relativePath);
    std::string relativePathItem;
    while (std::getline(relativePathStream, relativePathItem, '/')) {
        if (relativePathItem == "..") {
            absolutePath = removeLastPathElement(absolutePath, true, false);
        } else if (!relativePathItem.empty() && relativePathItem != ".") {
            absolutePath += relativePathItem;
            absolutePath += "/";
        }
    }

    /* Remove the wrong trailing / */
    if (absolutePath.empty()) {
        return absolutePath;
    }
    return absolutePath.substr(0, absolutePath.length()-1);
}
/* Warning: the relative path must be with slashes */
/* Builds the relative path that leads from the file `path` to `absolutePath`.
 *
 * Both arguments are split on '/'. The shared leading elements are dropped,
 * one "../" is emitted for every remaining directory of `path` (its last
 * element is the file itself and is not counted), and the remaining elements
 * of `absolutePath` are appended.
 *
 * Two latent hazards of the previous version are removed:
 *  - `pathParts.size()-1` wrapped around when `path` had no elements, which
 *    made the "../" loop effectively unbounded;
 *  - the common-prefix loop advanced only on non-empty elements and so could
 *    never terminate on a pair of equal empty elements.
 */
std::string computeRelativePath(const std::string path, const std::string absolutePath) {
    const std::vector<std::string> pathParts = split(path, "/");
    const std::vector<std::string> absolutePathParts = split(absolutePath, "/");

    /* Length of the common leading run */
    std::size_t commonCount = 0;
    while (commonCount < pathParts.size() &&
           commonCount < absolutePathParts.size() &&
           pathParts[commonCount] == absolutePathParts[commonCount]) {
        ++commonCount;
    }

    std::string relativePath;
    /* `i + 1 < size` instead of `i < size - 1` avoids unsigned wrap-around */
    for (std::size_t i = commonCount; i + 1 < pathParts.size(); ++i) {
        relativePath += "../";
    }
    for (std::size_t i = commonCount; i < absolutePathParts.size(); ++i) {
        relativePath += absolutePathParts[i];
        if (i + 1 < absolutePathParts.size()) {
            relativePath += "/";
        }
    }
    return relativePath;
}
/* Tells whether `url` points inside the package (true) or to an external
 * resource (false).
 *
 * External means: a protocol-relative URL starting with "//", any URL
 * containing "://", or a "tel:" / "geo:" URL. Everything else is local.
 *
 * The "//" test used to run only when the URL also contained a ':', so a
 * plain "//host/file.js" was reported as local and later rewritten as if it
 * were a root-relative path.
 */
static bool isLocalUrl(const std::string url) {
    if (url.compare(0, 2, "//") == 0) {
        return false;
    }
    if (url.find(":") == std::string::npos) {
        return true;
    }
    const bool external =
        url.find("://") != std::string::npos ||
        url.find("tel:") == 0 ||
        url.find("geo:") == 0;
    return !external;
}
static std::string extractRedirectUrlFromHtml(const GumboVector* head_children) {
std::string url;
for (int i = 0; i < head_children->length; ++i) {
GumboNode* child = (GumboNode*)(head_children->data[i]);
if (child->type == GUMBO_NODE_ELEMENT &&
child->v.element.tag == GUMBO_TAG_META) {
GumboAttribute* attribute;
if (attribute = gumbo_get_attribute(&child->v.element.attributes, "http-equiv")) {
if (!strcmp(attribute->value, "refresh")) {
if (attribute = gumbo_get_attribute(&child->v.element.attributes, "content")) {
std::string targetUrl = attribute->value;
std::size_t found = targetUrl.find("URL=") != std::string::npos ? targetUrl.find("URL=") : targetUrl.find("url=");
if (found!=std::string::npos) {
url = targetUrl.substr(found+4);
} else {
throw std::string("Unable to find the redirect/refresh target url from the HTML DOM");
}
}
}
}
}
}
return url;
}
/* Walks the DOM below `node` and records in `links` every local URL found in
   an "href" attribute or, when there is no href, in a "src" attribute.
   Non-element nodes are ignored. */
static void getLinks(GumboNode* node, std::map<std::string, bool> &links) {
    if (node->type == GUMBO_NODE_ELEMENT) {
        GumboAttribute* target = gumbo_get_attribute(&node->v.element.attributes, "href");
        if (target == NULL) {
            target = gumbo_get_attribute(&node->v.element.attributes, "src");
        }
        if (target != NULL && isLocalUrl(target->value)) {
            links[target->value] = true;
        }

        /* Recurse into child nodes */
        GumboVector* kids = &node->v.element.children;
        for (unsigned int idx = 0; idx < kids->length; ++idx) {
            getLinks(static_cast<GumboNode*>(kids->data[idx]), links);
        }
    }
}
/* Replaces the first occurrence of `search` in `subject` with `replace`.
   `subject` is left untouched when `search` does not occur. */
static void replaceStringInPlaceOnce(std::string& subject, const std::string& search,
                                     const std::string& replace) {
    const size_t firstHit = subject.find(search, 0);
    if (firstHit != std::string::npos) {
        subject.replace(firstHit, search.length(), replace);
    }
}
/* Replaces every occurrence of `search` in `subject` with `replace`.
 *
 * Scanning resumes after the inserted text, so occurrences created by the
 * replacement itself are not replaced again. An empty `search` is a no-op;
 * without that guard the loop never terminates, because an empty string
 * matches at every position.
 */
static void replaceStringInPlace(std::string& subject, const std::string& search,
                                 const std::string& replace) {
    if (search.empty()) {
        return;
    }
    size_t pos = 0;
    while ((pos = subject.find(search, pos)) != std::string::npos) {
        subject.replace(pos, search.length(), replace);
        pos += replace.length();
    }
}
static std::string getMimeTypeForFile(const std::string& filename) {
std::string mimeType;
/* Try to get the mimeType from the file extension */
if (filename.find_last_of(".") != std::string::npos) {
mimeType = filename.substr(filename.find_last_of(".")+1);
if (extMimeTypes.find(mimeType) != extMimeTypes.end()) {
return extMimeTypes[mimeType];
}
}
/* Try to get the mimeType from the cache */
if (fileMimeTypes.find(filename) != fileMimeTypes.end()) {
return fileMimeTypes[filename];
}
#ifndef __WIN32
/* Try to get the mimeType with libmagic */
try {
std::string path = directoryPath + "/" + filename;
mimeType = std::string(magic_file(magic, path.c_str()));
//cout<<path<<mimeType;
if (mimeType.find(";") != std::string::npos) {
mimeType = mimeType.substr(0, mimeType.find(";"));
}
fileMimeTypes[filename] = mimeType;
return mimeType;
} catch (...) {
return "";
}
#else
return "";
#endif
}
/* Strips the fragment ("#...") from a URL.
 *
 * Returns everything before the first '#', or the URL unchanged when it has
 * no fragment. The previous version cut at `found-1`, which also dropped the
 * character before the '#', and returned the whole URL when '#' was the
 * first character.
 */
inline std::string removeLocalTag(const std::string &url) {
    const std::size_t found = url.find("#");
    if (found != std::string::npos) {
        return url.substr(0, found);
    }
    return url;
}
/* Rewrites `url`, found inside the article `aid`, as a path relative to that
   article once both live under the "/A/" prefix. */
inline std::string computeNewUrl(const std::string &aid, const std::string &url) {
    const std::string targetUrl = "/A/" + computeAbsolutePath(aid, url);
    const std::string originUrl = "/A/" + aid;
    return computeRelativePath(originUrl, targetUrl);
}
/* Sets up the builder: default chunk size, a ZIM creator, the environment
   variables ZIM_LZMA_LEVEL and XAPIAN_CJK_NGRAM, the term generator flags,
   the extension -> MIME type table, and (non-Windows) the libmagic handle. */
build::build(QObject *parent) : QObject(parent)
{
    zim_minChunkSize = 2048;
    zimCreator = new zim::writer::ZimCreator();

    /* Export the settings through whichever environment API is available */
#ifdef HAVE__PUTENV_S
    _putenv_s("ZIM_LZMA_LEVEL", "9e");
    _putenv_s("XAPIAN_CJK_NGRAM", "1");
#elif defined HAVE_SETENV
    setenv("ZIM_LZMA_LEVEL", "9e", 1);
    setenv("XAPIAN_CJK_NGRAM", "1", 1);
#else
    putenv(const_cast<char*>("ZIM_LZMA_LEVEL=9e"));
    putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
#endif

    /* CJK n-gram tokenisation, no stemming */
    termgen.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM);
    termgen.set_stemming_strategy(Xapian::TermGenerator::STEM_NONE);

    /* Init file extensions hash: { extension, MIME type } */
    static const char* const extensionTable[][2] = {
        {"HTML", "text/html"},
        {"html", "text/html"},
        {"HTM", "text/html"},
        {"htm", "text/html"},
        {"PHP", "text/html"},
        {"php", "text/html"},
        {"ASP", "text/html"},
        {"asp", "text/html"},
        {"ASPX", "text/html"},
        {"aspx", "text/html"},
        {"JSP", "text/html"},
        {"jsp", "text/html"},
        {"PNG", "image/png"},
        {"png", "image/png"},
        {"TIFF", "image/tiff"},
        {"tiff", "image/tiff"},
        {"TIF", "image/tiff"},
        {"tif", "image/tiff"},
        {"JPEG", "image/jpeg"},
        {"jpeg", "image/jpeg"},
        {"JPG", "image/jpeg"},
        {"jpg", "image/jpeg"},
        {"GIF", "image/gif"},
        {"gif", "image/gif"},
        {"SVG", "image/svg+xml"},
        {"svg", "image/svg+xml"},
        {"TXT", "text/plain"},
        {"txt", "text/plain"},
        {"XML", "text/xml"},
        {"xml", "text/xml"},
        {"EPUB", "application/epub+zip"},
        {"epub", "application/epub+zip"},
        {"PDF", "application/pdf"},
        {"pdf", "application/pdf"},
        {"OGG", "application/ogg"},
        {"ogg", "application/ogg"},
        {"JS", "application/javascript"},
        {"js", "application/javascript"},
        {"JSON", "application/json"},
        {"json", "application/json"},
        {"CSS", "text/css"},
        {"css", "text/css"},
        {"otf", "application/vnd.ms-opentype"},
        {"OTF", "application/vnd.ms-opentype"},
        {"eot", "application/vnd.ms-fontobject"},
        {"EOT", "application/vnd.ms-fontobject"},
        {"ttf", "application/font-ttf"},
        {"TTF", "application/font-ttf"},
        {"woff", "application/font-woff"},
        {"WOFF", "application/font-woff"},
        {"vtt", "text/vtt"},
        {"VTT", "text/vtt"}
    };
    const unsigned int extensionCount = sizeof(extensionTable) / sizeof(extensionTable[0]);
    for (unsigned int row = 0; row < extensionCount; ++row) {
        extMimeTypes[extensionTable[row][0]] = extensionTable[row][1];
    }

#ifndef __WIN32
    magic = magic_open(MAGIC_MIME);
    magic_load(magic, NULL);
#endif
}
/* Builds one offline package.
 *
 * input:     directory whose files are packed.
 * output:    directory that receives the package (Xapian index, "data" ZIM
 *            file, and the "type", "name", "count" and "<title>.pkg" files).
 * mode:      package type; also used as the Description metadata.
 * publisher, title, welcome: copied into the file-scope metadata strings.
 *
 * Progress and the final result are reported through the input_status,
 * change_progress and change_statu_word signals; done() is always emitted.
 *
 * Changes from the previous version:
 *  - Xapian::Error does not derive from std::exception, so it is caught
 *    separately; before, a Xapian failure escaped this slot and skipped the
 *    cleanup and the done() signal.
 *  - Opening the database now happens inside the try block for the same reason.
 *  - The ZimCreator allocated by the constructor is reused instead of being
 *    overwritten (leak), and the pointer is reset after deletion, including
 *    on the failure path.
 */
void build::on_build_start(QString input, QString output, QString mode, QString publisher, QString title, QString welcome)
{
    articleS = new ArticleSource(input, output, mode);

    /* Prepare metadata */
    metadataQueue = std::queue<std::string>();
    //metadataQueue.push("Language");
    metadataQueue.push("Publisher");
    metadataQueue.push("Creator");
    metadataQueue.push("Title");
    metadataQueue.push("Description");
    metadataQueue.push("Date");
    //metadataQueue.push("Favicon");
    metadataQueue.push("Counter");

    /* Reset the per-build state */
    counters.clear();
    data = NULL;
    dataSize = 0;
    count_index = 0;
    description = mode.toStdString();//description == type == mode
    ::title = title.toStdString();
    ::welcome = welcome.toStdString();
    ::publisher = publisher.toStdString();

    /* code_name is the output path without its leading directories */
    QRegExp code_name_reg("^.*[/\\\\]");
    code_name_reg.setMinimal(false);
    QString _code_name = output;
    code_name = _code_name.remove(code_name_reg).toStdString();

    try
    {
        Xapian::WritableDatabase db2(output.toLocal8Bit().toStdString(), Xapian::DB_CREATE_OR_OVERWRITE);
        db = db2;

        /* remove() edits `output` in place: from here on it has no trailing separator */
        QFile type_file(QString(output.remove(QRegExp("[/\\\\]$"))+"/type"));
        type_file.open(QFile::WriteOnly);
        QDataStream type_stream(&type_file);
        type_stream << mode;
        type_file.close();

        QFile name_file(QString(output.remove(QRegExp("[/\\\\]$"))+"/name"));
        name_file.open(QFile::WriteOnly);
        QDataStream name_stream(&name_file);
        name_stream << title;
        name_file.close();

        /* Forward the article source's signals as our own */
        QObject::connect(articleS,SIGNAL(input_status(QString)),this,SIGNAL(input_status(QString)));
        QObject::connect(articleS,SIGNAL(change_progress(int)),this,SIGNAL(change_progress(int)));
        QObject::connect(articleS,SIGNAL(change_statu_word(QString)),this,SIGNAL(change_statu_word(QString)));
        QObject::connect(articleS,SIGNAL(done()),this,SIGNAL(done()));

        /* Write the ZIM file; indexing happens as a side effect of getData() */
        if (zimCreator == NULL)
        {
            zimCreator = new zim::writer::ZimCreator();
        }
        zimCreator->setMinChunkSize(zim_minChunkSize);
        dir.mkdir(output);
        zimCreator->create(QString(output+"/"+"data").toStdString(), *articleS);
        delete zimCreator;
        zimCreator = NULL;

        Q_EMIT input_status(tr("Xapian 索引总数:")+QString::number(count_index));
        QFile count_file(output+"/count");
        count_file.open(QFile::WriteOnly);
        QDataStream count_stream(&count_file);
        count_stream << count_index;
        count_file.close();

        /* Compact the index into a single file and drop the working files */
        db.commit();
        Q_EMIT input_status(tr("正在压缩索引文件."));
        db.compact(output.toStdString()+"/data.idx",Xapian::DBCOMPACT_SINGLE_FILE);
        db.close();
        Q_EMIT input_status(tr("索引文件压缩完成."));
        QFile::remove(output+"/docdata.glass");
        QFile::remove(output+"/postlist.glass");
        QFile::remove(output+"/termlist.glass");
        QFile::remove(output+"/flintlock");
        QFile::remove(output+"/iamglass");

        /* Marker file named after the package title */
        QFile ver_file(output+"/"+title+".pkg");
        ver_file.open(QFile::WriteOnly);
        ver_file.write("1");
        ver_file.close();

        Q_EMIT input_status(tr("离线包打包成功."));
        Q_EMIT change_statu_word(tr("离线包打包成功."));
    }
    catch (const Xapian::Error& e)
    {
        const QString message = QString::fromStdString(e.get_msg());
        std::cerr << e.get_msg() << std::endl;
        Q_EMIT input_status(message);
        Q_EMIT input_status(tr("离线包打包失败."));
        Q_EMIT change_statu_word(tr("离线包打包失败.")+message);
    }
    catch (const std::exception& e)
    {
        std::cerr << e.what() << std::endl;
        Q_EMIT input_status(QString(e.what()));
        Q_EMIT input_status(tr("离线包打包失败."));
        Q_EMIT change_statu_word(tr("离线包打包失败.")+QString(e.what()));
    }

    /* No-op after a successful build; releases the creator after a failure */
    delete zimCreator;
    zimCreator = NULL;

    Q_EMIT done();

    if (articleS != NULL)
    {
        if (articleS->dir_iterator != NULL)
        {
            delete articleS->dir_iterator;
        }
        delete articleS;
        articleS = NULL;
    }
}
/* Prepares the enumeration of every file below `input2`.
 *
 * input2:  input directory; walked recursively, following symlinks.
 * output2: output directory, stored in `output`.
 * mode:    not used by this constructor.
 *
 * Also resets the shared blob buffer and stores the input directory, without
 * a trailing separator, in the file-scope `directoryPath`.
 *
 * `progress` is now given a starting value: getData() compares against it
 * before any code visible in this file assigns it.
 */
ArticleSource::ArticleSource(QString input2, QString output2, QString mode)
{
    article = NULL;
    data = NULL;
    dataSize = 0;
    filters<<QString("*");
    //input = input2;
    output = output2;
    idir_str = input2;
    a = idir_str.size();
    current_dir = idir_str;
    count = 0;
    data_count = 0;
    progress = 0;
    get_data_stat = false;
    get_next_stat = false;
    dir_iterator = new QDirIterator(input2,filters,QDir::Files|QDir::NoDotDot,QDirIterator::Subdirectories|QDirIterator::FollowSymlinks);
    /* remove() edits the local copy in place, so it must come after the iterator is built */
    directoryPath = input2.remove(QRegExp("[/\\\\]$")).toStdString();
}
/* Returns the welcome page that build::on_build_start() stored in `welcome`. */
std::string ArticleSource::getMainPage()
{
return welcome;
}
/* Hands out the next article to pack, or NULL when there are none left.
   Pending metadata entries come first, then every regular file found by the
   directory iterator. The previously returned article is deleted on each
   call. Non-redirect articles are counted per MIME type, and progress
   messages are emitted along the way. */
const zim::writer::Article* ArticleSource::getNextArticle()
{
    /* Announce the counting phase once */
    if (!get_next_stat)
    {
        get_next_stat = true;
        Q_EMIT change_statu_word(tr("正在统计文件."));
    }

    /* The article handed out last time is no longer needed */
    if (article != NULL)
    {
        delete article;
    }
    article = NULL;

    if (!metadataQueue.empty())
    {
        std::string metadataName = metadataQueue.front();
        metadataQueue.pop();
        article = new MetadataArticle(metadataName);
    }
    else
    {
        /* Advance to the next regular file, if any */
        while (article == NULL && dir_iterator->hasNext())
        {
            dir_iterator->next();
            file_info = dir_iterator->fileInfo();
            if (!file_info.isFile())
            {
                continue;
            }
            absolute_file_path = file_info.absoluteFilePath();
            filePath = file_info.absolutePath();
            if (current_dir != filePath)
            {
                current_dir = filePath;
                Q_EMIT input_status(tr("正在统计文件: 进入目录:")+current_dir);
            }
            article = new Article(absolute_file_path.toUtf8().toStdString());
        }
    }

    /* Nothing left: report the total and switch to the next phase */
    if (article == NULL)
    {
        Q_EMIT input_status(tr("统计完成,共:")+QString::number(count)+tr("个文件"));
        Q_EMIT change_statu_word(tr("正在处理文件列表."));
        return article;
    }

    /* Count mimetypes */
    if (!article->isRedirect())
    {
        const std::string mimeType = article->getMimeType();
        counters[mimeType]++;
    }

    ++count;
    if (count%1000 == 1 && count > 1000)
    {
        Q_EMIT input_status(tr("已统计:")+QString::number(count));
    }
    return article;
}
/* Produces the content blob for the article identified by `aid`.
 *
 *  - "/M/<name>" identifiers yield the matching metadata value.
 *  - HTML files are indexed, and quoted root-relative links ("/x") are
 *    rewritten to "/A/x".
 *  - CSS files have their url(...) values rewritten relative to the article;
 *    font files are inlined as base64 data URLs.
 *  - Plain-text files are indexed and returned unchanged; all other files are
 *    returned unchanged.
 *
 * The returned blob points at the file-scope `data` buffer, which is freed
 * and replaced by the next call.
 *
 * Changes from the previous version:
 *  - the buffer is allocated with new[] and is now freed with delete[];
 *  - for unmodified files the blob size is the number of bytes actually
 *    read, not the size reported by QFileInfo, which could exceed the buffer
 *    being copied from (for instance when the file could not be opened);
 *  - the CSS scan stops at an unterminated "url(" instead of indexing past
 *    the end of the string, and no longer stops early when "url(" is at
 *    offset 0;
 *  - the progress computation is skipped while `count` is zero.
 */
zim::Blob ArticleSource::getData(const std::string& aid)
{
    QString aid_url = input + QString::fromUtf8(QByteArray::fromStdString(directoryPath + "/" + aid));

    /* Release the blob returned by the previous call */
    if (data != NULL) {
        delete[] data;
        data = NULL;
    }

    if(!get_data_stat)
    {
        get_data_stat = true;
        Q_EMIT change_statu_word(tr("正在读取并索引文件 "));
    }

    ++data_count;
    if(count != 0 && progress != data_count*100/count)
    {
        progress = data_count*100/count;
        Q_EMIT change_progress(progress);
    }

    std::string b;
    if (aid.substr(0, 3) == "/M/") {
        /* Metadata entry */
        std::string value;
        if ( aid == "/M/Language") {
            value = language;
        } else if (aid == "/M/Creator") {
            value = creator;
        } else if (aid == "/M/Publisher") {
            value = publisher;
        } else if (aid == "/M/Title") {
            value = title;
        } else if (aid == "/M/Description") {
            value = description;
        } else if ( aid == "/M/Date") {
            /* Local date as YYYY-MM-DD */
            time_t t = time(0);
            struct tm * now = localtime( & t );
            std::stringstream stream;
            stream << (now->tm_year + 1900) << '-'
                   << std::setw(2) << std::setfill('0') << (now->tm_mon + 1) << '-'
                   << std::setw(2) << std::setfill('0') << now->tm_mday;
            value = stream.str();
        } else if ( aid == "/M/Counter") {
            /* "mime=count;" for every MIME type seen */
            std::stringstream stream;
            for (std::map<std::string, unsigned int>::iterator it = counters.begin(); it != counters.end(); ++it) {
                stream << it->first << "=" << it->second << ";";
            }
            value = stream.str();
        }
        dataSize = value.length();
        data = new char[dataSize];
        memcpy(data, value.c_str(), dataSize);
    }
    else
    {
        /* Regular file: read it completely */
        srcfile.setFileName(aid_url);
        file_info.setFile(aid_url);
        if(srcfile.open(QFile::ReadOnly))
        {
            s = srcfile.readAll();
            srcfile.close();
        }
        else
        {
            Q_EMIT input_status(tr("\n\n\n!!!读取文件: 无法打开文件:")+aid_url+"\n\n\n");
            s.clear();
        }

        filePath = file_info.absolutePath();
        if (current_dir != filePath)
        {
            current_dir = filePath;
            Q_EMIT input_status(tr("正在读取文件: 进入目录:")+current_dir);
        }

        b = s.toStdString();
        /* Size of what was actually read; the HTML/CSS branches overwrite it */
        dataSize = b.length();

        const std::string ownMimeType = getMimeTypeForFile(aid);
        if (ownMimeType.find("text/html") == 0) {
            std::string html = b;
            htmlparse.reset();
            htmlparse.parse_html(b,"utf-8",true);
            add_to_index(htmlparse.dump,htmlparse.title,aid);

            /* Rewrite links (src|href|...) attributes */
            GumboOutput* parsed = gumbo_parse(html.c_str());
            GumboNode* root = parsed->root;
            std::map<std::string, bool> links;
            getLinks(root, links);
            std::map<std::string, bool>::iterator it;
            /* If a link appears several times in the HTML, it occurs only
               once in the links variable */
            for(it = links.begin(); it != links.end(); ++it) {
                if (it->first[0] == '/') {
                    replaceStringInPlace(html, "\"" + it->first + "\"", "\"/A" + it->first + "\"");
                }
            }
            gumbo_destroy_output(&kGumboDefaultOptions, parsed);

            dataSize = html.length();
            data = new char[dataSize];
            memcpy(data, html.c_str(), dataSize);
        } else if (ownMimeType.find("text/css") == 0) {
            std::string css = b;

            /* Rewrite url() values in the CSS */
            size_t startPos = 0;
            size_t endPos = 0;
            std::string url;
            while ((startPos = css.find("url(", endPos)) != std::string::npos) {
                /* URL delimiters */
                endPos = css.find(")", startPos);
                if (endPos == std::string::npos) {
                    /* Unterminated url( : nothing more can be rewritten */
                    break;
                }
                startPos = startPos + (css[startPos+4] == '\'' || css[startPos+4] == '"' ? 5 : 4);
                endPos = endPos - (css[endPos-1] == '\'' || css[endPos-1] == '"' ? 1 : 0);
                if (endPos < startPos) {
                    /* Malformed value such as url(') : skip this occurrence */
                    endPos = startPos;
                    continue;
                }
                url = css.substr(startPos, endPos - startPos);
                std::string startDelimiter = css.substr(startPos-1, 1);
                std::string endDelimiter = css.substr(endPos, 1);

                if (url.substr(0, 5) != "data:") {
                    /* Deal with URL with arguments (using '? ') */
                    std::string path = url;
                    size_t markPos = url.find("?");
                    if (markPos != std::string::npos) {
                        path = url.substr(0, markPos);
                    }

                    /* Embedded fonts need to be inlined because Kiwix is
                       otherwise not able to load them because of the
                       same-origin security */
                    std::string mimeType = getMimeTypeForFile(path);
                    if (mimeType == "application/font-ttf" ||
                        mimeType == "application/font-woff" ||
                        mimeType == "application/vnd.ms-opentype" ||
                        mimeType == "application/vnd.ms-fontobject") {
                        try {
                            /* The first remove() strips the file name from aid_url, leaving its directory */
                            std::string fontContent = getFileContent(aid_url.remove(QRegExp("[^/\\\\]*$")).toLocal8Bit().toStdString() + path);
                            replaceStringInPlaceOnce(css,
                                                     startDelimiter + url + endDelimiter,
                                                     startDelimiter + "data:" + mimeType + ";base64," +
                                                     base64_encode(reinterpret_cast<const unsigned char*>(fontContent.c_str()), fontContent.length()) +
                                                     endDelimiter
                                                     );
                        } catch (...) {
                            /* Best effort: keep the original url() if the font cannot be inlined */
                        }
                    } else {
                        /* Deal with URL with arguments (using '? ') */
                        if (markPos != std::string::npos) {
                            endDelimiter = url.substr(markPos, 1);
                        }
                        replaceStringInPlaceOnce(css,
                                                 startDelimiter + url + endDelimiter,
                                                 startDelimiter + computeNewUrl(aid, path) + endDelimiter);
                    }
                }
            }

            dataSize = css.length();
            data = new char[dataSize];
            memcpy(data, css.c_str(), dataSize);
        }
        else {
            if (ownMimeType.find("text/plain") == 0)
                add_to_index(b,"",aid);
            data = new char[dataSize];
            str1 = b;
            memcpy(data, str1.c_str(), dataSize);
        }
    }

    if(data_count == count)
    {
        progress = 100;
        Q_EMIT change_progress(progress);
        Q_EMIT change_statu_word(tr("正在写入文件."));
    }
    return zim::Blob(data, dataSize);
}
/* Adds one document to the Xapian index.
   The indexed text is the title repeated seven times (to weight it), then
   the body text, then the result of the English stemmer applied to the body.
   Documents that produce no terms are skipped. Each stored document carries
   `aid` as its data and code_name in value slot 1. The database is committed
   each time count_index passes a multiple of 1000 (from 1001 on). */
void ArticleSource::add_to_index(const std::string &text, const string &title, const std::string &aid)
{
    Xapian::Document doc;
    termgen.set_document(doc);

    /* title x7 for weighting, followed by the text and its stemmed form */
    std::string weightedText;
    for (int repeat = 0; repeat < 7; ++repeat)
    {
        weightedText += title;
        weightedText += " ";
    }
    weightedText += text;
    weightedText += " ";
    weightedText += enstem(text);
    termgen.index_text_without_positions(weightedText,1);

    if (doc.termlist_count() == 0)
    {
        return;
    }

    ++count_index;
    doc.add_value(1,code_name);
    doc.set_data(aid);
    db.add_document(doc);

    const bool reportNow = (count_index%1000 == 1 && count_index > 1000);
    if (reportNow)
    {
        Q_EMIT input_status(tr("Xapian 已索引:")+QString::number(count_index));
        db.commit();
    }
}
/* Describes the file at `path` (which must lie below directoryPath).
 *
 * The identifier and URL are the path relative to directoryPath, and the MIME
 * type comes from getMimeTypeForFile(). For HTML files the constructor also
 *  - takes the title from <head><title>, falling back to the file name
 *    without extension, with '_' replaced by ' ';
 *  - detects a <meta http-equiv="refresh"> redirect. If the target does not
 *    exist on disk, the redirect is dropped and the article is marked invalid.
 *
 * `invalid` is now set to false up front; in the code visible in this file
 * it was only ever assigned true.
 */
Article::Article(const string& path)
{
    invalid = false;

    /* aid */
    aid = path.substr(directoryPath.size()+1);

    /* url */
    url = aid;

    /* mime-type */
    mimeType = getMimeTypeForFile(aid);

    /* HTML specific code */
    if (mimeType.find("text/html") == std::string::npos) {
        return;
    }

    std::string html = getFileContent(path);
    GumboOutput* output = gumbo_parse(html.c_str());
    GumboNode* root = output->root;

    /* Search the content of the <title> tag in the HTML */
    if (root->type == GUMBO_NODE_ELEMENT && root->v.element.children.length >= 2) {
        const GumboVector* root_children = &root->v.element.children;
        GumboNode* head = NULL;
        for (unsigned int i = 0; i < root_children->length; ++i) {
            GumboNode* child = static_cast<GumboNode*>(root_children->data[i]);
            if (child->type == GUMBO_NODE_ELEMENT &&
                child->v.element.tag == GUMBO_TAG_HEAD) {
                head = child;
                break;
            }
        }

        if (head != NULL) {
            GumboVector* head_children = &head->v.element.children;
            for (unsigned int i = 0; i < head_children->length; ++i) {
                GumboNode* child = static_cast<GumboNode*>(head_children->data[i]);
                if (child->type == GUMBO_NODE_ELEMENT &&
                    child->v.element.tag == GUMBO_TAG_TITLE) {
                    /* Only a <title> holding a single text node is used */
                    if (child->v.element.children.length == 1) {
                        GumboNode* title_text = static_cast<GumboNode*>(child->v.element.children.data[0]);
                        if (title_text->type == GUMBO_NODE_TEXT) {
                            title = title_text->v.text.text;
                        }
                    }
                }
            }

            /* Detect if this is a redirection */
            std::string targetUrl;
            try {
                targetUrl = extractRedirectUrlFromHtml(head_children);
            } catch (std::string &error) {
                std::cerr << error << std::endl;
            }
            if (!targetUrl.empty()) {
                redirectAid = computeAbsolutePath(aid, decodeUrl(targetUrl));
                if (!fileExists(directoryPath + "/" + redirectAid)) {
                    redirectAid.clear();
                    invalid = true;
                }
            }
        }

        /* If no title, then compute one from the filename */
        if (title.empty()) {
            std::size_t found = path.rfind("/");
            if (found != std::string::npos) {
                title = path.substr(found+1);
                found = title.rfind(".");
                if (found != std::string::npos) {
                    title = title.substr(0, found);
                }
            } else {
                title = path;
            }
            std::replace(title.begin(), title.end(), '_', ' ');
        }
    }

    gumbo_destroy_output(&kGumboDefaultOptions, output);
}
/* Returns the article identifier (`aid`). */
std::string Article::getAid() const
{
return aid;
}
/* Always reports namespace 'A'.
   NOTE(review): MetadataArticle sets `ns` to 'M', but this accessor does not
   consult `ns` — confirm that placing metadata under 'A' is intended. */
char Article::getNamespace() const
{
return 'A';
}
/* Returns the article URL (`url`). */
std::string Article::getUrl() const
{
return url;
}
/* Returns the article title (`title`). */
std::string Article::getTitle() const
{
return title;
}
/* An article is a redirect exactly when it has a non-empty redirect target. */
bool Article::isRedirect() const
{
return !redirectAid.empty();
}
/* Returns the `invalid` flag; in this file it is set to true only when a
   detected redirect target does not exist on disk. */
bool Article::isInvalid() const
{
return invalid;
}
/* Returns the MIME type recorded for the article (`mimeType`). */
std::string Article::getMimeType() const
{
return mimeType;
}
/* Returns the identifier of the redirect target; empty when the article is
   not a redirect. */
std::string Article::getRedirectAid() const
{
return redirectAid;
}
/* Every article is reported as compressible, whatever its MIME type. */
bool Article::shouldCompress() const {
return true;
}
C++
1
https://gitee.com/zjzdy/Offline_Small_Search_pkg_build.git
git@gitee.com:zjzdy/Offline_Small_Search_pkg_build.git
zjzdy
Offline_Small_Search_pkg_build
Offline_Small_Search_pkg_build
master

搜索帮助