1 Star 1 Fork 0

hotmocha / spider

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
spider.h 5.42 KB
一键复制 编辑 原始数据 按行查看 历史
hotmocha 提交于 2015-04-02 22:29 . spider init
/*
* MODULE NAME :
* PROGRAM NAME : spider.h
* AUTHOR : HOTMOCHA
* CREATE DATE : 2015-01-13 18:45:20
* PROGRAM DESC :
*
* HISTORY :
*
*/
#ifndef _H_SPIDER_H_
#define _H_SPIDER_H_
/* Debug switch, unconditionally defined as 0 by this header.
 * NOTE(review): because it is not wrapped in #ifndef, building with -DDEBUG
 * will trigger a macro redefinition diagnostic here - confirm this is intended. */
#define DEBUG 0
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <curl/curl.h>
#include <gumbo.h>
#include <string.h>
#include "ilog.h"
#include "simplebuf.h"
#include "list.h"
#include "type.h"
/*
 * Task type codes.
 * NOTE(review): presumably the values accepted by SetTaskType() and stored in
 * SpiderEnv.task_type, with START/END acting as range sentinels - confirm
 * against the implementation.
 */
#define TASKTYPE_START 0
#define TASKTYPE_ONETIME 1
#define TASKTYPE_MUTITIMES 2
/* Correctly spelled alias; the misspelled name above is kept so existing
 * callers keep compiling. */
#define TASKTYPE_MULTITIMES TASKTYPE_MUTITIMES
#define TASKTYPE_END 3
/* Shorthand names for unsigned types. These are macros (plain textual
 * substitution), not typedefs. */
#define unint unsigned int
#define unlong unsigned long
/*
 * Negative error codes of the spider module.
 * The replacement lists are parenthesized so that the leading minus sign can
 * never interact with tokens surrounding a macro use.
 * NOTE(review): meanings are taken from the names only (internal / argument /
 * memory / file error) - confirm against the implementation.
 */
#define SPIDER_INTERNAL_ERR (-10001)
#define SPIDER_ARG_ERR (-10002)
#define SPIDER_MEM_ERR (-10003)
#define SPIDER_FILE_ERR (-10004)
/*
 * Reset masks: one bit per part of the environment, so several parts can be
 * OR-ed together into a single flag word.
 * NOTE(review): presumably passed as the `flag` argument of ReInitSpiderEnv()
 * - confirm against the implementation.
 */
#define REINITMASK_URL (1 << 0)       /* bit 0 */
#define REINITMASK_HEADER (1 << 1)    /* bit 1 */
#define REINITMASK_COOKIE (1 << 2)    /* bit 2 */
#define REINITMASK_POST (1 << 3)      /* bit 3 */
#define REINITMASK_INPUTLIST (1 << 4) /* bit 4 */
#define REINITMASK_DATALIST (1 << 5)  /* bit 5 */
#define REINITMASK_RSPHEADER (1 << 6) /* bit 6 */
#define REINITMASK_RSPBODY (1 << 7)   /* bit 7 */
/* On/off values.
 * NOTE(review): presumably the `e` argument of SetAutoPrepareInputForChild()
 * and SetAutoPrepareInputForChildParentHookPoint() - confirm. */
#define AUTOENABLE 1
#define AUTODISABLE 0
/* Forward declaration so the callback types below can take a pointer to the
 * environment before the struct itself is defined. */
struct SpiderEnv;

/* Callback that receives only the environment and returns an int. */
typedef int (*ProcFunc)(struct SpiderEnv *);

/*
 * Callback that receives the environment, a char pointer and an unsigned
 * count. The count is spelled `unsigned int` instead of the non-standard
 * `uint`, which is only available through platform-specific headers and is
 * not declared under strict ISO C.
 * NOTE(review): presumably (data, length) of the chunk being received - confirm.
 */
typedef int (*RecvingProcFunc)(struct SpiderEnv *, char *, unsigned int);

/* Callback that receives the environment plus an opaque pointer. */
typedef int (*RecvingParentHookPoint)(struct SpiderEnv *, void *);

/* Callback that receives two environments.
 * NOTE(review): presumably (parent, child) - confirm the argument order. */
typedef int (*RecvedParentHookPoint)(struct SpiderEnv *, struct SpiderEnv *);

/* Callback with the same shape as ProcFunc, used for the finish hook. */
typedef int (*FinishFunc)(struct SpiderEnv *);
/*
 * State of a single spider task: the libcurl handle, the request and response
 * buffers, the callbacks attached to the task and its bookkeeping counters.
 * Field order and names are part of the layout seen by every user of this
 * header, so they must not be reordered or renamed.
 */
struct SpiderEnv
{
int debug;
char name[101];
/* NOTE(review): presumably one of the TASKTYPE_* values - confirm. */
int task_type;
ListHead *nextTasks; /* holds SpiderEnv structures */
CURL *curl;
struct curl_slist *headerlist;
/* Elements of the request */
Buf *url;
Buf *header;
Buf *cookie;
Buf *postdata;
/* Buffers for received data */
Buf *recvheader;
Buf *recvbody;
ListHead *inputmapstringlist; /* holds MapEntry items, stored as key-value pairs */
ListHead *datamaplist; /* holds MapEntry items */
Buf *other;
/* Functions that process the response header and body */
ProcFunc parseRespHeader;
ProcFunc parseRespBody;
/* Hook functions called while header data is being received */
RecvingProcFunc recvingHeaderHook;
RecvingParentHookPoint recvingHeaderParentHookPoint;
/* Hook functions called while body data is being received */
RecvingProcFunc recvingBodyHook;
RecvingParentHookPoint recvingBodyParentHookPoint;
/* prepareInputForChild is called after a page has been received to set up the
parameters of the child tasks; use it when every child task needs the same data.
If each child task needs its own customized data, use
prepareInputForChildParentHookPoint instead.
Note: these must be called explicitly from task code; the framework does not
call them automatically. To have the framework run them automatically, set the
autoPrepareInputForChild or autoPrepareInputForChildParentHookPoint field.
*/
int autoPrepareInputForChild;
ProcFunc prepareInputForChild;
int autoPrepareInputForChildParentHookPoint;
RecvedParentHookPoint prepareInputForChildParentHookPoint;
/* Last initialization function run before DoSpider is called on a child task */
ProcFunc prepareDoSpiderParentHookPoint;
/* Prepares the next request when a task has to process multiple pages */
ProcFunc prepareForNextRand;
/* Called after the whole task has finished (child tasks included) */
FinishFunc finishFunc;
int maxtrytimes;
int status;
int failedtimes;
int successtimes;
int ishttps;
char certfile[256];
};
/*
 * Public functions of the spider module. Their definitions are not part of
 * this header; behaviour notes below are limited to what the declarations
 * and the original comments state.
 */

/*
 * InitSpiderEnv is declared exactly once, with its full prototype. The header
 * previously also carried `InitSpiderEnv()` with an empty parameter list,
 * which disables argument checking in pre-C23 compilers and conflicts with
 * this prototype in C23, where `()` means "no parameters".
 */
struct SpiderEnv* InitSpiderEnv(char *url, int maxtrytimes);
void SetAutoPrepareInputForChild(struct SpiderEnv *env, int e);
void SetVerbose(struct SpiderEnv *env, int e);
void SetAutoPrepareInputForChildParentHookPoint(struct SpiderEnv *env, int e);
int RequestHeaderProcFunc(struct SpiderEnv *env);
int RequestCookieProcFunc(struct SpiderEnv *env);
int RequestPostdataProcFunc(struct SpiderEnv *env);
int RequestUrlProcFunc(struct SpiderEnv *env);
void SetParseRespHeaderFunc(struct SpiderEnv *env, ProcFunc func);
void SetParseRespBodyFunc(struct SpiderEnv *env, ProcFunc func);
void SetRecvingHeaderHook(struct SpiderEnv *env, RecvingProcFunc func);
/* NOTE(review): the name looks like a slip for "SetRecvingBodyHook" (it sits
 * next to SetRecvingHeaderHook and takes the same callback type) - confirm
 * against the implementation before renaming. */
void SetRecvingHeaderBody(struct SpiderEnv *env, RecvingProcFunc func);
void SetRecvingHeaderParentHookPoint(struct SpiderEnv *env, RecvingParentHookPoint func);
void SetRecvingBodyParentHookPoint(struct SpiderEnv *env, RecvingParentHookPoint func);
void SetParentHookPointCalledAuto(struct SpiderEnv *env, ProcFunc func);
int SetTaskType(struct SpiderEnv *env, int type);
void SetPrepareForNextRandFunc(struct SpiderEnv *env, ProcFunc func);
void SetAfterAllPageProcFunc(struct SpiderEnv *env, ProcFunc func);
void SetFinishFunc(struct SpiderEnv *env, FinishFunc func);
int SetPostdata(struct SpiderEnv *env, char *postdata);
int SetCertFile(struct SpiderEnv *env, char *filename);
int SetHeader(struct SpiderEnv *env, char *header);
int SetUrl(struct SpiderEnv *env, char *url);
void SetPrepareInputForChild(struct SpiderEnv *env, ProcFunc func);
int SetCookie(struct SpiderEnv *env, char *cookie);
void SetTaskName(struct SpiderEnv *env, char *name);
int SetCookieFromFile(struct SpiderEnv *env, char *file);
int PrintRespHeader(struct SpiderEnv *env);
int PrintRespBody(struct SpiderEnv *env);
int AppendSpiderTask(struct SpiderEnv *env, struct SpiderEnv *envAdded);
int InsertSpiderTask(struct SpiderEnv *env, struct SpiderEnv *envAdded);
void FreeSpiderEnv(struct SpiderEnv *env);
int ReInitSpiderEnv(struct SpiderEnv *env, unsigned int flag);
/* These two functions must be called by the task itself; the framework does
 * not call them automatically. */
int CallPrepareInputForChildPrentHookFunc(struct SpiderEnv *env);
int CallPrepareInputForChild(struct SpiderEnv *env);
#endif
1
https://gitee.com/hotmocha/spider.git
git@gitee.com:hotmocha/spider.git
hotmocha
spider
spider
master

搜索帮助

53164aa7 5694891 3bd8fe86 5694891