diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 78297e54428674ef5d76140a823ad2c02fd71163..0000000000000000000000000000000000000000 --- a/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -#* -*$ -*.BAK -*.Z -*.bak -*.class -*.elc -*.ln -*.log -*.o -*.obj -*.olb -*.old -*.orig -*.pyc -*.pyo -*.rej -*/.git/* -*~ -,* -.#* -.DS_Store -.del-* -.deployables -.make.state -.nse_depinfo -.svn -CVS.adm -RCS -RCSLOG -SCCS -_$* -_svn -.classpath -.project -.settings/* -target/* \ No newline at end of file diff --git a/README.md b/README.md index 2dc99858c0026caab90cedd76340fd1999465b5a..29b7bc21593ad86d476c78744409423e4e2e7b37 100644 --- a/README.md +++ b/README.md @@ -1,456 +1,374 @@ -Spiderman - Java开源Web数据抽取工具 -======================================== - Spiderman 是一个Java开源Web数据抽取工具。它能够收集指定的Web页面并从这些页面中提取有用的数据。 - Spiderman主要是运用了像XPath,正则表达式等这些技术来实数据抽取。 - -它包含了两部分(二者缺一不可): ------------------------------ - * spiderman-core 内核 - * spiderman-plugin 插件 - -主要特点 ----------------------- - * 微内核+插件式架构、灵活、可扩展性强 - * 无需编写程序代码即可完成数据抽取 - * 多线程保证性能 - -怎么使用? ----------- -* 首先,确定好你的目标网站以及目标网页(即某一类你想要获取数据的网页,例如网易新闻的新闻页面) -* 然后,打开目标页面,分析页面的HTML结构,得到你想要数据的XPath,具体XPath怎么获取请看下文。 -* 最后,在一个xml配置文件里填写好参数,运行Spiderman吧! - -近期更新 ----- -1. <parser 的表达式支持发起HTTP请求获取内容了: - <parser exp="$Fetcher.get('http://www.baidu.com')" - -2. <target节点添加 <before节点配置,该配置与<model一样可以用来解析网页内容,主要的区别是该节点会在<model节点解析之前进行工作,其解析后的结果将会作为model的上下文$before.xxx来使用 - -3. 重构下载器,支持多种下载器实现,允许在xml里面配置自己实现的下载器实现类,官方默认提供了三种,分别是默认的基于HttpClient的下载器、基于WebUnit的下载器、基于Selenium WebDriver的实现 - <site downloader="org.eweb4j.spiderman.plugin.util.WebDriverDownloader" - 或者 - <site downloader="xxx.YourDownloader"> - -4. 与第三点一样,重构了模型解析器,使得现在支持多种不同的实现类,且允许开发者在xml上指定自己实现的解析器,目前官方提供了两种解析器,分别是DefaultModelParser,WebDriverModelParser - <before parser="xxx.xxx.xxx.YourModelParser" - 或者 - <model parser="xxx.YourModelParser" -5. 其他一些零碎的更新、BUG修复等。 - -XPath获取技巧? 
--------------- -* 首先,下载xpathonclick插件,[猛击这里](https://chrome.google.com/webstore/search/xpathonclick) -* 安装完毕之后,打开Chrome浏览器,可以看到右上角有个“X Path” 图标。 -* 在浏览器打开你的目标网页,然后点击右上角的那个图片,然后点击网标上你想要获取XPath的地方,例如某个标题 -* 这时候按住F12打开JS控制台,拖到底部,可以看到一串XPath内容 -* 记住,这个内容不是绝对OK的,你可能还需要做些修改,因此,你最好还是去学习下XPath语法 -* 学习XPath语法的地方:[猛击这里](http://www.w3school.com.cn/xpath/index.asp) - -Spiderman Sample | 案例 -======================= - -* 首先保证你的机器至少可以运行Java程序、也可以执行Maven命令 -* 案例程序[spiderman-sample] mvn test -* Spiderman程序将会运行N秒钟,然后到保存抓取数据的文件夹查看对应网站的数据 -* 这里有篇文章介绍示例:[http://my.oschina.net/laiweiwei/blog/100866] - -这是使用Spiderman的代码: - - public class TestSpider { - - private final Object mutex = new Object(); - - @Test - public void test() throws Exception { - String err = EWeb4JConfig.start(); - if (err != null) - throw new Exception(err); - - SpiderListener listener = new SpiderListenerAdaptor(){ - public void afterScheduleCancel(){ - //调度结束回调 - } - /** - * 每次调度执行前回调此方法 - * @date 2013-4-1 下午03:33:11 - * @param theLastTimeScheduledAt 上一次调度时间 - */ - public void beforeEveryScheduleExecute(Date theLastTimeScheduledAt){ - System.err.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [LAST_SCHEDULE_AT] ~ "); - System.err.println("at -> " + CommonUtil.formatTime(theLastTimeScheduledAt)); - } - public void onFetch(Thread thread, Task task, FetchResult result) { - System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [FETCH] ~ "); - System.out.println("fetch result ->" + result + " from -> " + task.sourceUrl); - } - public void onNewUrls(Thread thread, Task task, Collection newUrls) { - System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [DIG] ~ "); - System.out.println(newUrls); - } - public void onDupRemoval(Thread currentThread, Task task, Collection validTasks) { - // for (Task t : validTasks){ - // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [DUPREMOVE] ~ "); - // System.out.println(t.url+" from->"+t.sourceUrl); - // } - } - public void 
onTaskSort(Thread currentThread, Task task, Collection afterSortTasks) { - // for (Task t : afterSortTasks){ - // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [SORT] ~ "); - // System.out.println(t.url+" from->"+t.sourceUrl); - // } - } - public void onNewTasks(Thread thread, Task task, Collection newTasks) { - // for (Task t : newTasks){ - // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [NEWTASK] ~ "); - // System.out.println(t.sort + ",,,," + t.url+" from->"+t.sourceUrl); - // } - } - public void onTargetPage(Thread thread, Task task, Page page) { - // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [TARGET] ~ "); - // System.out.println(page.getUrl()); - } - public void onInfo(Thread thread, Task task, String info) { - System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [INFO] ~ "); - System.out.println(info); - } - - public void onError(Thread thread, Task task, String err, Throwable e) { - System.err.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [ERROR] ~ "); - e.printStackTrace(); - } - - public void onParse(Thread thread, Task task, List> models) { - final String projectRoot = FileUtil.getTopClassPath(TestSpider.class); - final File dir = new File(projectRoot+"/Data/"+task.site.getName()+"/"+task.target.getName()); - try { - if (!dir.exists()) - dir.mkdirs(); - - for (int i = 0; i < models.size(); i++) { - Map map = models.get(i); - String fileName = dir + "/count_" + task.site.counter.getCount() + i; - StringBuilder sb = new StringBuilder(); - for (Iterator> it = map.entrySet().iterator(); it.hasNext();){ - Entry e = it.next(); - boolean isBlank = false; - - if (e.getValue() == null) - isBlank = true; - else if (e.getValue() instanceof String && ((String)e.getValue()).trim().length() == 0) - isBlank = true; - else if (e.getValue() instanceof List && ((ArrayList)e.getValue()).isEmpty()) - isBlank = true; - else if (e.getValue() instanceof List && 
!((ArrayList)e.getValue()).isEmpty()) { - if (((ArrayList)e.getValue()).size() == 1 && String.valueOf(((ArrayList)e.getValue()).get(0)).trim().length() == 0) - isBlank = true; - } - - if (isBlank){ - if (sb.length() > 0) - sb.append("_"); - sb.append(e.getKey()); - } - } - String content = CommonUtil.toJson(map); - if (sb.length() > 0) - fileName = fileName + "_no_"+sb.toString()+"_"; - - File file = new File(fileName+".json"); - FileUtil.writeFile(file, content); - System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [INFO] ~ "); - System.out.println(fileName + " create finished..."); - } - } catch (Exception e) { - e.printStackTrace(); - } - } - }; - - //启动爬虫 - Spiderman.me() - .init(listener)//初始化 - .startup()//启动 - .keepStrict("2h");//存活时间,过了存活时间后马上关闭 - - //启动爬虫 + 调度定时重启 - //Spiderman.me() - //.listen(listener)//设置监听器 - //.schedule("10s")//调度,爬虫运行10s - //.delay("2s")//每隔 10 + 2 秒后重启爬虫 - //.times(3)//调度 3 次 - //.startup()//启动 - //.blocking();//阻塞直到所有调度完成 - } - } - - -下面详细看看这个sample的配置文件: - -首先有一个初始化配置文件spiderman.properties,它就放在#{ClassPath}目录下 - - #网站配置文件放置目录 - website.xml.folder=#{ClassPath}/WebSites - #网站已访问url数据库存储目录 - website.visited.folder=#{ClassPath}/dbEnv - #http抓取失败重试次数 - http.fetch.retry=3 - #http连接超时,支持单位 s秒 m分 h时 d天,不写单位则表示s秒 - http.fetch.timeout=5s - -然后在#{ClassPath}/WebSites目录下有一份oschina.xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ----- +Spiderman - Java开源Web数据抽取工具 +======================================== + Spiderman 是一个Java开源Web数据抽取工具。它能够收集指定的Web页面并从这些页面中提取有用的数据。 + Spiderman主要是运用了像XPath,正则表达式等这些技术来实数据抽取。 + +它包含了两部分(二者缺一不可): +----------------------------- + * spiderman-core 内核 + * spiderman-plugin 插件 + +主要特点 +---------------------- + * 微内核+插件式架构、灵活、可扩展性强 + * 无需编写程序代码即可完成数据抽取 + * 多线程保证性能 + +怎么使用? 
+---------- +* 首先,确定好你的目标网站以及目标网页(即某一类你想要获取数据的网页,例如网易新闻的新闻页面) +* 然后,打开目标页面,分析页面的HTML结构,得到你想要数据的XPath,具体XPath怎么获取请看下文。 +* 最后,在一个xml配置文件里填写好参数,运行Spiderman吧! + +近期更新 +---- +1.重构spiderman内核加入容器Container概念,容器中可自定义组件XML节点结构,并开发自己的容器组件如site、db、file等;支持多数据源获取; + +2.该版本的升级需要在配置文件中的site节点外层加入container节点,如<container id="container1"><site></site></container> + +3.改进内核处理链式流程,新插件开发如ftp-plugin、db-plugin等不需要实现所有的扩展点; + +4.优化配置文件如果不配置插件,默认采用官方实现的web-plugin插件。 + +5.优化改进官方默认插件web-plugin; + +6.其他一些零碎的更新、BUG修复等。 + +XPath获取技巧? +-------------- +* 首先,下载xpathonclick插件,[猛击这里](https://chrome.google.com/webstore/search/xpathonclick) +* 安装完毕之后,打开Chrome浏览器,可以看到右上角有个“X Path” 图标。 +* 在浏览器打开你的目标网页,然后点击右上角的那个图片,然后点击网标上你想要获取XPath的地方,例如某个标题 +* 这时候按住F12打开JS控制台,拖到底部,可以看到一串XPath内容 +* 记住,这个内容不是绝对OK的,你可能还需要做些修改,因此,你最好还是去学习下XPath语法 +* 学习XPath语法的地方:[猛击这里](http://www.w3school.com.cn/xpath/index.asp) + +Spiderman Sample | 案例 +======================= + +* 首先保证你的机器至少可以运行Java程序、也可以执行Maven命令 +* 案例程序[spiderman-sample] mvn test +* Spiderman程序将会运行N秒钟,然后到保存抓取数据的文件夹查看对应网站的数据 +* 这里有篇文章介绍示例:[http://my.oschina.net/laiweiwei/blog/100866] + +这是使用Spiderman的代码: + + public class TestSpider { + + private final Object mutex = new Object(); + + @Test + public void test() throws Exception { + String err = EWeb4JConfig.start(); + if (err != null) + throw new Exception(err); + + //实例化Spiderman + final Spiderman spiderman = Spiderman.me(); + //爬虫监听适配器 + SpiderListener listener = new SpiderListenerAdaptor(){ + @Override + public void onDigUrls(Thread thread, Task task, String fieldName,Collection urls) { + System.out.println("[DIG-URL] ~ "+urls); + } + @Override + public void onInfo(Thread thread, FetchRequest request, String info) { + System.out.println(CommonUtil.getNowTime("HH:mm:ss")+"[INFO] ~ "+info); + } + @Override + public void onTargetPage(Thread thread,FetchRequest request, Page page) { + System.out.println("[TARGET] ~ "+page.getUrl()); + } + @Override + public void onParse(Thread thread,FetchRequest request, List> models) 
{ + System.out.println("on_Parse->" + models); + } + }; + + //启动爬虫|初始化| + //调度,爬虫运行10s + spiderman.init(listener).startup()/*.keep("10s")*/;//启动 + //File file = new File("E:\\jukeyuan\\spiderman-sample\\target\\test-classes\\sites\\tianya site of site_sample.xml"); + //spiderman.listen(listener).init(file).startup(file); + /*spiderman.init(listener) + .schedule("10s") + .startup()//启动 + .times(3);//调度 3 次*/ + Thread.currentThread().join(); + } + } + + +下面详细看看这个sample的配置文件: + +首先有一个初始化配置文件spiderman.properties,它就放在#{ClassPath}目录下 + + #网站配置文件放置目录 + website.xml.folder=#{ClassPath}/WebSites + #网站已访问url数据库存储目录 + website.visited.folder=#{ClassPath}/dbEnv + #http抓取失败重试次数 + http.fetch.retry=3 + #http连接超时,支持单位 s秒 m分 h时 d天,不写单位则表示s秒 + http.fetch.timeout=5s + #加载容器模块组件配置 + modules=site,db,file + +然后在#{ClassPath}/WebSites目录下有一份oschina.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +---- \ No newline at end of file diff --git a/pom.xml b/pom.xml deleted file mode 100644 index 24f285fb789f2435aae1db6be994846ce4362a53..0000000000000000000000000000000000000000 --- a/pom.xml +++ /dev/null @@ -1,101 +0,0 @@ - - 4.0.0 - org.eweb4j - spiderman-parent - 0.1.0-SNAPSHOT - pom - - spiderman-core - spiderman-plugin - spiderman-sample - spiderman-web - spiderman-webapp - - - - - weiwei - 赖伟威 - l.weiwei@163.com - http://laiweiweihi.iteye.com - 8 - - - chenyoca - 陈永佳 - chenyoca@gmail.com - 8 - - - - Github Issue - https://gitcafe.com/laiweiwei/Spiderman/tickets - - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - - - - git@gitcafe.com:laiweiwei/Spiderman.git - git@gitcafe.com:laiweiwei/Spiderman.git - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.7 - - - - - - - nexus-releases - Nexus Releases Repository. 
- https://oss.sonatype.org/service/local/staging/deploy/maven2 - - - - nexus-snapshots - Nexus Snapshots Repository. - https://oss.sonatype.org/content/repositories/snapshots - - - - - nexus-releases - Nexus Releases Repository. - https://oss.sonatype.org/service/local/staging/deploy/maven2 - - - - nexus-snapshots - Nexus Snapshots Repository. - https://oss.sonatype.org/content/repositories/snapshots - - - - - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - - - org.apache.maven.plugins - maven-release-plugin - 2.0-beta-7 - - - - \ No newline at end of file diff --git a/spiderman-core/.classpath b/spiderman-core/.classpath new file mode 100644 index 0000000000000000000000000000000000000000..4fedf63237a3e151ffefda3ef10f9978ee604c61 --- /dev/null +++ b/spiderman-core/.classpath @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/spiderman-core/.gitignore b/spiderman-core/.gitignore deleted file mode 100644 index 78297e54428674ef5d76140a823ad2c02fd71163..0000000000000000000000000000000000000000 --- a/spiderman-core/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -#* -*$ -*.BAK -*.Z -*.bak -*.class -*.elc -*.ln -*.log -*.o -*.obj -*.olb -*.old -*.orig -*.pyc -*.pyo -*.rej -*/.git/* -*~ -,* -.#* -.DS_Store -.del-* -.deployables -.make.state -.nse_depinfo -.svn -CVS.adm -RCS -RCSLOG -SCCS -_$* -_svn -.classpath -.project -.settings/* -target/* \ No newline at end of file diff --git a/spiderman-core/.project b/spiderman-core/.project new file mode 100644 index 0000000000000000000000000000000000000000..27673ab6fbce50d9ac026e504f46a1b2cbcbe36f --- /dev/null +++ b/spiderman-core/.project @@ -0,0 +1,36 @@ + + + spiderman-core + + + + + + org.eclipse.wst.common.project.facet.core.builder + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.wst.validation.validationbuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jem.workbench.JavaEMFNature + org.eclipse.wst.common.modulecore.ModuleCoreNature + 
org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + org.eclipse.wst.common.project.facet.core.nature + + diff --git a/spiderman-core/.settings/org.eclipse.core.resources.prefs b/spiderman-core/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000000000000000000000000000000000000..99f26c0203a7844de00dbfc56e6a35d8ed3c022c --- /dev/null +++ b/spiderman-core/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding/=UTF-8 diff --git a/spiderman-core/.settings/org.eclipse.jdt.core.prefs b/spiderman-core/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000000000000000000000000000000000000..5ce4518899426199a8ab051fb032691fe545dcee --- /dev/null +++ b/spiderman-core/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,13 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.source=1.6 diff --git a/spiderman-core/.settings/org.eclipse.m2e.core.prefs b/spiderman-core/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000000000000000000000000000000000000..f897a7f1cb2389f85fe6381425d29f0a9866fb65 --- /dev/null +++ b/spiderman-core/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git 
a/spiderman-core/.settings/org.eclipse.wst.common.component b/spiderman-core/.settings/org.eclipse.wst.common.component new file mode 100644 index 0000000000000000000000000000000000000000..4c1c88b304dab8f26c0abe7bce3f3cf0260ac004 --- /dev/null +++ b/spiderman-core/.settings/org.eclipse.wst.common.component @@ -0,0 +1,6 @@ + + + + + + diff --git a/spiderman-core/.settings/org.eclipse.wst.common.project.facet.core.xml b/spiderman-core/.settings/org.eclipse.wst.common.project.facet.core.xml new file mode 100644 index 0000000000000000000000000000000000000000..5c9bd7532abbb254f449a025abe3d70d8c089bab --- /dev/null +++ b/spiderman-core/.settings/org.eclipse.wst.common.project.facet.core.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/spiderman-core/.settings/org.eclipse.wst.validation.prefs b/spiderman-core/.settings/org.eclipse.wst.validation.prefs new file mode 100644 index 0000000000000000000000000000000000000000..04cad8cb752a9761c4e5167d0301d3a27674430f --- /dev/null +++ b/spiderman-core/.settings/org.eclipse.wst.validation.prefs @@ -0,0 +1,2 @@ +disabled=06target +eclipse.preferences.version=1 diff --git a/spiderman-core/pom.xml b/spiderman-core/pom.xml index 6a950b6a843b2b73e8669098e752647a34425c46..f5de8d805948719c0d025bf3b6969be5687d0070 100644 --- a/spiderman-core/pom.xml +++ b/spiderman-core/pom.xml @@ -1,19 +1,17 @@ - - 4.0.0 - - spiderman-parent + + 4.0.0 + 0.1.0-SNAPSHOT + org.eweb4j + spiderman-core + + + org.eweb4j - 0.1.0-SNAPSHOT - - org.eweb4j - spiderman-core - spiderman core module - - - - org.eweb4j - eweb4j-all - 1.10-final - - - \ No newline at end of file + eweb4j-all + 1.10-final + + + + + diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Component.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Component.java new file mode 100644 index 0000000000000000000000000000000000000000..ff9d416066222268f79d7b2e7b706aa10be959fa --- /dev/null +++ 
b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Component.java @@ -0,0 +1,221 @@ +package org.eweb4j.spiderman.container; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.ExecutorService; + +import org.eweb4j.spiderman.fetcher.PageFetcher; +import org.eweb4j.spiderman.plugin.BeginPoint; +import org.eweb4j.spiderman.plugin.DigPoint; +import org.eweb4j.spiderman.plugin.DupRemovalPoint; +import org.eweb4j.spiderman.plugin.EndPoint; +import org.eweb4j.spiderman.plugin.ExtensionPoint; +import org.eweb4j.spiderman.plugin.ExtensionPoints; +import org.eweb4j.spiderman.plugin.FetchPoint; +import org.eweb4j.spiderman.plugin.ParsePoint; +import org.eweb4j.spiderman.plugin.PluginManager; +import org.eweb4j.spiderman.plugin.Point; +import org.eweb4j.spiderman.plugin.PojoPoint; +import org.eweb4j.spiderman.plugin.TargetPoint; +import org.eweb4j.spiderman.plugin.TaskPollPoint; +import org.eweb4j.spiderman.plugin.TaskPushPoint; +import org.eweb4j.spiderman.plugin.TaskSortPoint; +import org.eweb4j.spiderman.spider.Counter; +import org.eweb4j.spiderman.spider.SpiderListener; +import org.eweb4j.spiderman.task.TaskDbServer; +import org.eweb4j.spiderman.task.TaskQueue; +import org.eweb4j.spiderman.xml.Plugin; +import org.eweb4j.spiderman.xml.Plugins; +import org.eweb4j.util.xml.AttrTag; +import org.eweb4j.util.xml.Skip; + +public abstract class Component{ + + public Container container;//所属容器 + + public SpiderListener listener;//监听器; + //------------------------------------------ + @AttrTag + private String name;//目标站点名称 + @Skip + public TaskDbServer removaldb = null;//每个组件都有属于自己的一个任务去重DB服务 + @Skip + public ExecutorService pool;//每个组件都有属于自己的一个线程池 + @Skip + public Boolean isStop = false;//每个组件都有属于自己的一个停止信号,用来标识该组件的状态是否停止完全 + @Skip + public TaskQueue queue;//每个组件都有属于自己的一个任务队列容器 + @Skip + public PageFetcher fetcher;//每个组件都有属于自己的一个抓取器 + @Skip + public Counter counter;//针对本数据源已完成的任务数量 + + private Plugins 
plugins;//插件 + //------------------------------------------ + //--------------扩展点----------------------- + @Skip + public Collection taskPollPointImpls; + @Skip + public Collection beginPointImpls; + @Skip + public Collection fetchPointImpls; + @Skip + public Collection digPointImpls; + @Skip + public Collection dupRemovalPointImpls; + @Skip + public Collection taskSortPointImpls; + @Skip + public Collection taskPushPointImpls; + @Skip + public Collection targetPointImpls; + @Skip + public Collection parsePointImpls; + @Skip + public Collection endPointImpls; + @Skip + public Collection pojoPointImpls; + + //------------------------------------------- + + public abstract Component startup(); + public abstract void destroy(SpiderListener listener, boolean isShutdownNow); + public abstract Component init(Container container,SpiderListener listener)throws Exception; + public abstract void initPool(); + + public void initPlugins() throws Exception{ + //---------------------插件初始化开始---------------------------- + listener.onInfo(Thread.currentThread(), null, "plugins loading begin..."); + + if(this.getPlugins() == null){ + //加载默认插件及其扩展点... 
+ Plugins deFaultPlugins = new Plugins(); + List deFaultPluginArray = new ArrayList(); + deFaultPluginArray.add(PluginManager.createPlugin()); + deFaultPlugins.setPlugin(deFaultPluginArray); + this.setPlugins(deFaultPlugins); + } + + Collection plugins = this.getPlugins().getPlugin(); + //加载网站插件配置 + try { + PluginManager pluginMgr = new PluginManager(); + pluginMgr.loadPluginConf(plugins, listener); + + //加载TaskPoll扩展点实现类 + ExtensionPoint taskPollPoint = pluginMgr.getExtensionPoint(ExtensionPoints.task_poll); + if (taskPollPoint != null) { + this.taskPollPointImpls = taskPollPoint.getExtensions(); + firstInitPoint(this.taskPollPointImpls, this, listener); + } + + //加载Begin扩展点实现类 + ExtensionPoint beginPoint = pluginMgr.getExtensionPoint(ExtensionPoints.begin); + if (beginPoint != null){ + this.beginPointImpls = beginPoint.getExtensions(); + firstInitPoint(this.beginPointImpls, this, listener); + } + + //加载Fetch扩展点实现类 + ExtensionPoint fetchPoint = pluginMgr.getExtensionPoint(ExtensionPoints.fetch); + if (fetchPoint != null){ + this.fetchPointImpls = fetchPoint.getExtensions(); + firstInitPoint(this.fetchPointImpls, this, listener); + } + + //加载Dig扩展点实现类 + ExtensionPoint digPoint = pluginMgr.getExtensionPoint(ExtensionPoints.dig); + if (digPoint != null){ + this.digPointImpls = digPoint.getExtensions(); + firstInitPoint(this.digPointImpls, this, listener); + } + + //加载DupRemoval扩展点实现类 + ExtensionPoint dupRemovalPoint = pluginMgr.getExtensionPoint(ExtensionPoints.dup_removal); + if (dupRemovalPoint != null){ + this.dupRemovalPointImpls = dupRemovalPoint.getExtensions(); + firstInitPoint(this.dupRemovalPointImpls, this, listener); + } + //加载TaskSort扩展点实现类 + ExtensionPoint taskSortPoint = pluginMgr.getExtensionPoint(ExtensionPoints.task_sort); + if (taskSortPoint != null){ + this.taskSortPointImpls = taskSortPoint.getExtensions(); + firstInitPoint(this.taskSortPointImpls, this, listener); + } + + //加载TaskPush扩展点实现类 + ExtensionPoint taskPushPoint = 
pluginMgr.getExtensionPoint(ExtensionPoints.task_push); + if (taskPushPoint != null){ + this.taskPushPointImpls = taskPushPoint.getExtensions(); + firstInitPoint(this.taskPushPointImpls, this, listener); + } + + //加载Target扩展点实现类 + ExtensionPoint targetPoint = pluginMgr.getExtensionPoint(ExtensionPoints.target); + if (targetPoint != null){ + this.targetPointImpls = targetPoint.getExtensions(); + firstInitPoint(this.targetPointImpls, this, listener); + } + + //加载Parse扩展点实现类 + ExtensionPoint parsePoint = pluginMgr.getExtensionPoint(ExtensionPoints.parse); + if (parsePoint != null){ + this.parsePointImpls = parsePoint.getExtensions(); + firstInitPoint(this.parsePointImpls, this, listener); + } + + //加载Pojo扩展点实现类 + ExtensionPoint pojoPoint = pluginMgr.getExtensionPoint(ExtensionPoints.pojo); + if (pojoPoint != null){ + this.pojoPointImpls = pojoPoint.getExtensions(); + firstInitPoint(this.pojoPointImpls, this, listener); + } + + //加载End扩展点实现类 + ExtensionPoint endPoint = pluginMgr.getExtensionPoint(ExtensionPoints.end); + if (endPoint != null){ + this.endPointImpls = endPoint.getExtensions(); + firstInitPoint(this.endPointImpls, this, listener); + } + //---------------------------插件初始化完毕---------------------------------- + } catch(Exception e){ + throw new Exception("Site["+this.getName()+"] loading plugins fail", e); + } + + //初始化网站的队列容器 + this.queue = new TaskQueue(); + this.queue.init(); + //初始化网站目标Model计数器 + this.counter = new Counter(); + + } + + public void firstInitPoint(Collection points, Component component, SpiderListener listener){ + for (Point point : points){ + point.init(component, listener); + } + } + + public Plugins getPlugins() { + return plugins; + } + + public void setPlugins(Plugins plugins) { + this.plugins = plugins; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + public Container getContainer() { + return container; + } + public void setContainer(Container container) { + 
this.container = container; + } +} diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Components.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Components.java new file mode 100644 index 0000000000000000000000000000000000000000..f257d6e70947108b2dfb8f633c2b7b339e589451 --- /dev/null +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Components.java @@ -0,0 +1,42 @@ +/** + * + */ +package org.eweb4j.spiderman.container; + +import java.util.Arrays; +import java.util.Collection; + +/** + * @author WChao + * + */ +public class Components { + + public final static String site = "site"; + public final static String db = "db"; + public final static String file = "file"; + + public static String getComponentClassName(String point){ + if (site.equals(point)) + return "org.eweb4j.spiderman.xml.site.Site"; + if (db.equals(point)) + return "org.eweb4j.spiderman.xml.db.Db"; + if (file.equals(point)) + return "org.eweb4j.spiderman.xml.file.File"; + return null; + } + public static boolean contains(String name){ + return site.equals(name) || db.equals(name) || file.equals(name) ; + } + + public static String string(){ + return "[" + site + ", "+ db + ", " + file +"]" ; + } + public static Collection toArray(String type) + { + return Arrays.asList(type); + } + public static Collection toArray(){ + return Arrays.asList(site, db, file); + } +} diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Container.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Container.java new file mode 100644 index 0000000000000000000000000000000000000000..a2e342a7ad5955ae0ccb2ec9985dbcb95016b49b --- /dev/null +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/Container.java @@ -0,0 +1,199 @@ +/** + * + */ +package org.eweb4j.spiderman.container; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.concurrent.ExecutorService; +import 
java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import org.eweb4j.spiderman.spider.Settings; +import org.eweb4j.spiderman.spider.SpiderListener; +import org.eweb4j.spiderman.spider.SpiderListenerAdaptor; +import org.eweb4j.util.xml.AttrTag; +import org.eweb4j.util.xml.Skip; +import org.eweb4j.util.xml.XMLReader; + +/** + * @author yangc + * @param + * + */ +public class Container { + @AttrTag + private String id;//容器Id; + @AttrTag + private String name;//容器名称; + @AttrTag + private String enable = "1";//是否开启本容器 + @Skip + public Boolean isStop = false;//每个容器都有属于自己的一个停止信号,用来标识该容器的状态是否停止完全 + + private XMLReader reader;//所属配置文件; + + private ExecutorService pool = null;//线程池; + + //容器包含组件 + private Collection components = new ArrayList(); + + private SpiderListener listener = null; + + + + public Container(){} + public Container(String id) + { + this.id = id; + } + + public Container init(SpiderListener listener) throws Exception{ + this.listener = listener; + if (this.listener == null) + this.listener = new SpiderListenerAdaptor(); + Collection modules = null; + if(Settings.modules() == null){//默认只加载站点组件;提高性能; + modules = Components.toArray(Components.site); + }else{ + modules = Arrays.asList(Settings.modules()); + } + for(String module : modules){ + Component component = getModuleComponent(module); + if(component != null) + { + component = component.init(this, listener); + if(component != null) + components.add(component); + } + } + //初始化容器线程池; + initPool(); + return this; + } + + public void initPool(){ + if (pool == null){ + int size = components.size(); + if (size == 0) + throw new RuntimeException("there is no component to load..."); + pool = new ThreadPoolExecutor(size, size, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue()); + listener.onInfo(Thread.currentThread(), null, "init container["+this.id+"] thread pool size->"+size+" success "); + } + } + + public Container startup() { 
+ for(Component component : components) + { + pool.execute(new Container._Executor(component)); + } + this.isStop = false; + return this; + } + + private class _Executor implements Runnable{ + private Component component = null; + + public _Executor(Component component){ + this.component = component; + } + + public void run() { + component.startup(); + } + } + + public void destroy(SpiderListener listener, boolean isShutdownNow,Object... args) + { + for(Component component : components) + { + component.destroy(listener, isShutdownNow); + listener.onInfo(Thread.currentThread(), null, component.getClass().getSimpleName()+"[" + component.getName() + "] of the Container["+this.id+"] destroy... "); + listener.onAfterShutdown(component,args); + } + if (isShutdownNow) + this.pool.shutdownNow(); + else + this.pool.shutdown(); + if(this.pool != null) + this.pool = null; + components.clear(); + this.isStop = true; + } + + public Container addComponent(Component component) + { + this.components.add(component); + return this; + } + + public Container removeComponent(Component component) + { + this.components.remove(component); + return this; + } + + public Component getComponent(String componentName) + { + for(Component component : components) + { + String simpleName = component.getClass().getSimpleName().toLowerCase(); + if(componentName.toLowerCase().equals(simpleName)){ + return component; + } + } + return null; + } + @SuppressWarnings("unchecked") + private Component getModuleComponent(final String componentName){ + if (!Components.contains(componentName)) + return null; + String value = Components.getComponentClassName(componentName); + Component t = null; + try { + Class cls = (Class) Thread.currentThread().getContextClassLoader().loadClass(value); + t = cls.newInstance(); + } catch (ClassNotFoundException e) { + throw new RuntimeException("Component class -> " + value + " of Component["+componentName+"] not found !", e); + } catch (InstantiationException e) { + throw 
new RuntimeException("Component class -> " + value + " of Component["+componentName+"] instaniation fail !", e); + } catch (IllegalAccessException e) { + throw new RuntimeException("Component class -> " + value + " of Component["+componentName+"] illegal access !", e); + } + + return t; + } + + public String getEnable() { + return enable; + } + public void setEnable(String enable) { + this.enable = enable; + } + public String getId() { + return id; + } + public void setId(String id) { + this.id = id; + } + public Collection getComponents() { + return components; + } + public void setComponents(Collection components) { + this.components = components; + } + public XMLReader getReader() { + return reader; + } + public void setReader(XMLReader reader) { + this.reader = reader; + } + public String getName() { + return name; + } + public void setName(String name) { + this.name = name; + } +} diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/container/ContainerManager.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/ContainerManager.java new file mode 100644 index 0000000000000000000000000000000000000000..564430b2082e6b8917b0cca5a7187bd88fdf725a --- /dev/null +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/container/ContainerManager.java @@ -0,0 +1,48 @@ +package org.eweb4j.spiderman.container; + +import java.util.ArrayList; +import java.util.Collection; + +public class ContainerManager { + + private static Collection containers = null; + + private static ContainerManager instance = null; + + public static ContainerManager me() + { + if(instance == null) + { + instance = new ContainerManager(); + containers = new ArrayList(); + } + return instance; + } + + public void add(Container container)throws Exception + { + if (container.getId() == null || container.getId().trim().length() == 0) + throw new Exception("container id required"); + if(get(container.getId().trim()) != null) + { + throw new Exception("container id 
["+container.getId().trim()+"] can not be repeated!"); + } + containers.add(container); + } + + public Container get(String id) + { + for(Container container : containers) + { + if(id.trim().equals(container.getId())) + { + return container; + } + } + return null; + } + + public Collection getContainers() { + return containers; + } +} diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchRequest.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchRequest.java index 8bd05227951dadcaf5ea28da67fa6d4d8e6ec6b9..667af277ac65c5b50e9685b1ff06b0b18ba20282 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchRequest.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchRequest.java @@ -6,21 +6,30 @@ import java.util.List; import java.util.Map; import org.eweb4j.mvc.Http; +import org.eweb4j.spiderman.task.Task; /** * TODO * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-3-7 下午05:28:08 */ public class FetchRequest { - private String url; + public String url; private String httpMethod = Http.Method.GET; private Map> params = new HashMap>(); private Map> files = new HashMap>(); private Map> headers = new HashMap>(); private Map> cookies = new HashMap>(); - + public Task task; + public FetchRequest(){} + public FetchRequest(Task task) + { + this.task = task; + this.url = task.url; + this.httpMethod = task.httpMethod; + } public String getUrl() { return this.url; } @@ -58,6 +67,13 @@ public class FetchRequest { public void setFiles(Map> files) { this.files = files; } + + public Task getTask() { + return task; + } + public void setTask(Task task) { + this.task = task; + } @Override public String toString() { return "FetchRequest [url=" + this.url + ", httpMethod=" diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchResult.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchResult.java index 
7c528d0ee0fdba85291a0517bcf001e9d9d8cfbd..e77176313e93915c3198544f564ceb1b3e394671 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchResult.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/FetchResult.java @@ -1,9 +1,13 @@ package org.eweb4j.spiderman.fetcher; +import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.eweb4j.spiderman.task.Task; +import org.eweb4j.spiderman.xml.Target; + public class FetchResult { @@ -12,8 +16,17 @@ public class FetchResult { private Map> headers = new HashMap>(); private String fetchedUrl = null; private String movedToUrl = null; - private Page page = null; - + private Page page = null;//抓取结果信息; + private Collection newUrls = null;//dig挖掘到新的资源Url + private Collection validTasks;//挖掘到的有效的任务集合; + private Target target = null;//是否有目标匹配当前Url; + private List> models = null;//已确认好的目标对象解析成为Map对象; + private List pojos;//将解析好的Map数据映射为POJO + public FetchResult(){}; + public FetchResult(FetchRequest request) + { + this.req = request; + } public FetchRequest getReq() { return this.req; } @@ -51,6 +64,41 @@ public class FetchResult { public void setHeaders(Map> headers) { this.headers = headers; } + + public Collection getNewUrls() { + return newUrls; + } + public void setNewUrls(Collection newUrls) { + this.newUrls = newUrls; + } + + public Collection getValidTasks() { + return validTasks; + } + public void setValidTasks(Collection validTasks) { + this.validTasks = validTasks; + } + + public Target getTarget() { + return target; + } + public void setTarget(Target target) { + this.target = target; + } + + public List> getModels() { + return models; + } + public void setModels(List> models) { + this.models = models; + } + + public List getPojos() { + return pojos; + } + public void setPojos(List pojos) { + this.pojos = pojos; + } @Override public String toString() { return "FetchResult [statusCode=" + this.statusCode + ", fetchedUrl=" diff --git 
a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/Page.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/Page.java index aebf9f0dab9c1dcc3e80629799fa0c349beb61e6..08b83194c711dfed1b074f6684514e12b2359b74 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/Page.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/Page.java @@ -4,8 +4,8 @@ package org.eweb4j.spiderman.fetcher; public class Page { private String url; - private String content; -// private byte[] contentData; + private Object content; + private byte[] contentData; private String contentType; private String encoding; private String charset; @@ -17,12 +17,15 @@ public class Page { public void setUrl(String url) { this.url = url; } - public String getContent() { - return this.content; + + public Object getContent() { + return content; } - public void setContent(String content) { + + public void setContent(Object content) { this.content = content; } + public String getContentType() { return this.contentType; } @@ -42,12 +45,11 @@ public class Page { this.charset = charset; } -// public byte[] getContentData() { -// return this.contentData; -// } -// -// public void setContentData(byte[] contentData) { -// this.contentData = contentData; -// } - + public byte[] getContentData() { + return this.contentData; + } + + public void setContentData(byte[] contentData) { + this.contentData = contentData; + } } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/PageFetcher.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/PageFetcher.java index d306cf9a75e5f19fb1f3eca3720641911e68cdd4..ddf14330402b5cd2d425af083077d0c94f82bd5c 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/PageFetcher.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/PageFetcher.java @@ -1,31 +1,32 @@ package org.eweb4j.spiderman.fetcher; import org.eweb4j.mvc.Http; -import org.eweb4j.spiderman.xml.Site; - +import 
org.eweb4j.spiderman.xml.site.Site; /** * TODO * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-1-7 下午06:41:33 */ public abstract class PageFetcher { - public abstract void init(SpiderConfig config, Site site) throws Exception; + public int fetchSize = 1000; + public abstract void init(SpiderConfig config,Site site) throws Exception; public abstract FetchResult fetch(FetchRequest req) throws Exception ; public abstract void close() throws Exception; public abstract Object getClient(); - public String get(String url) { + public Object get(String url) { return this.fetch(Http.Method.GET, url); } - public String post(String url) { + public Object post(String url) { return this.fetch(Http.Method.POST, url); } - public String fetch(String method, String url) { + public Object fetch(String method, String url) { FetchRequest req = new FetchRequest(); try { req.setUrl(url); diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/SpiderConfig.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/SpiderConfig.java index 854e740898ae6dbcc3f2d8c226230288ca44a3d0..0df9e70b79cfc3d7e597e209196c11706d5ff091 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/SpiderConfig.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/fetcher/SpiderConfig.java @@ -51,7 +51,7 @@ public class SpiderConfig { * Politeness delay in milliseconds (delay between sending two requests to * the same host). */ - private int politenessDelay = 200; + private int politenessDelay = 60/*200*/; /** * Should we also crawl https pages? 
diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/BeginPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/BeginPoint.java index 282cec2d67f8cc4eced5d01d824bdbf64b961edf..9c49447444a96ef87a1e93fabae5a763425d22d2 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/BeginPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/BeginPoint.java @@ -1,14 +1,16 @@ package org.eweb4j.spiderman.plugin; -import org.eweb4j.spiderman.task.Task; +import org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; /** * 扩展点:爬虫开始时 * @author weiwei + * @author wchao * */ public interface BeginPoint extends Point{ - Task confirmTask(Task task) throws Exception; + FetchResult preProcess(FetchRequest request,FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DigPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DigPoint.java index 9f636b18d1f2ead8bee29b85592c248035c89a95..86cf274a50deb7cc444ad478f237d7a07f502831 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DigPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DigPoint.java @@ -2,14 +2,14 @@ package org.eweb4j.spiderman.plugin; import java.util.Collection; +import org.eweb4j.spiderman.fetcher.FetchRequest; import org.eweb4j.spiderman.fetcher.FetchResult; -import org.eweb4j.spiderman.task.Task; public interface DigPoint extends Point{ // void context(FetchResult result, Task task) throws Exception; - Collection digNewUrls(FetchResult result, Task task, Collection urls) throws Exception; + Collection digNewUrls(FetchRequest request,FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DoneException.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DoneException.java index 
363bdf51e39b7aa75e86dc943c1f71aee20fbf61..b7a80fdd3588a9c85b2975d1b474b6969a0343c4 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DoneException.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DoneException.java @@ -2,6 +2,7 @@ package org.eweb4j.spiderman.plugin; /** * TODO * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-1-15 下午02:14:16 */ public class DoneException extends RuntimeException { diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DupRemovalPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DupRemovalPoint.java index 047fa346952e3cd677fcac489e9540bfd01bdb89..9e0a3d82884ddfbaa0616ad06073352a963e7843 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DupRemovalPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/DupRemovalPoint.java @@ -2,6 +2,8 @@ package org.eweb4j.spiderman.plugin; import java.util.Collection; +import org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; import org.eweb4j.spiderman.task.Task; @@ -9,5 +11,5 @@ public interface DupRemovalPoint extends Point{ // void context(Task task, Collection newUrls); - Collection removeDuplicateTask(Task task, Collection newUrls, Collection tasks); + Collection removeDuplicateTask(FetchRequest request,FetchResult result); } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/EndPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/EndPoint.java index b5ae34380e13affc4f0f81702356ded12914ea98..2052284ab3268ecef238d7f4e44b13d6e5f29c72 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/EndPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/EndPoint.java @@ -1,15 +1,14 @@ package org.eweb4j.spiderman.plugin; -import java.util.List; -import java.util.Map; -import org.eweb4j.spiderman.task.Task; +import 
org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; public interface EndPoint extends Point{ // void context(Task task, List> models) throws Exception; - List> complete(Task task, List> models) throws Exception; + FetchResult complete(FetchRequest request,FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ExtensionPoints.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ExtensionPoints.java index f4ea81464fb8e35653bac08be34ec3480d68dc57..04e8f6ca03459d7573072fbc5c9db388ddeec551 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ExtensionPoints.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ExtensionPoints.java @@ -6,6 +6,7 @@ import java.util.Collection; /** * 扩展点 * @author weiwei + * @author wchao * */ public class ExtensionPoints { @@ -24,25 +25,27 @@ public class ExtensionPoints { public static String getPointImplClassName(String point){ if (task_poll.equals(point)) - return "spiderman.plugin.impl.TaskPollPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.TaskPollPointImpl"; if (begin.equals(point)) - return "spiderman.plugin.impl.BeginPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.BeginPointImpl"; if (fetch.equals(point)) - return "spiderman.plugin.impl.FetchPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.FetchPointImpl"; if (dig.equals(point)) - return "spiderman.plugin.impl.DigPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.DigPointImpl"; if (dup_removal.equals(point)) - return "spiderman.plugin.impl.DupRemovalPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.DupRemovalPointImpl"; if (task_sort.equals(point)) - return "spiderman.plugin.impl.TaskSortPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.TaskSortPointImpl"; if (task_push.equals(point)) - return "spiderman.plugin.impl.TaskPushPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.TaskPushPointImpl"; if 
(target.equals(point)) - return "spiderman.plugin.impl.TargetPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.TargetPointImpl"; if (parse.equals(point)) - return "spiderman.plugin.impl.ParsePointImpl"; + return "org.eweb4j.spiderman.plugin.impl.ParsePointImpl"; + if (pojo.equals(point)) + return "org.eweb4j.spiderman.plugin.impl.PojoPointImpl"; if (end.equals(point)) - return "spiderman.plugin.impl.EndPointImpl"; + return "org.eweb4j.spiderman.plugin.impl.EndPointImpl"; return null; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/FetchPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/FetchPoint.java index 21ffb2c487be70e0abee8262c9c49cb7a41826d7..6f61ad763f4abcec6000aad525b06023430795e3 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/FetchPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/FetchPoint.java @@ -1,13 +1,13 @@ package org.eweb4j.spiderman.plugin; +import org.eweb4j.spiderman.fetcher.FetchRequest; import org.eweb4j.spiderman.fetcher.FetchResult; -import org.eweb4j.spiderman.task.Task; public interface FetchPoint extends Point{ // void context(Task task) throws Exception; - FetchResult fetch(Task task, FetchResult result) throws Exception; + FetchResult fetch(FetchRequest request, FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ParsePoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ParsePoint.java index 625c96d49621b50d4e194cf332659eefa375ea69..3bf114a1cc22399b3f941e4100ac2dcc20fe029c 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ParsePoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/ParsePoint.java @@ -1,17 +1,14 @@ package org.eweb4j.spiderman.plugin; -import java.util.List; -import java.util.Map; -import org.eweb4j.spiderman.fetcher.Page; -import org.eweb4j.spiderman.task.Task; -import org.eweb4j.spiderman.xml.Target; +import 
org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; public interface ParsePoint extends Point{ // void context(Task task, Target target, Page page) throws Exception; - List> parse(Task task, Target target, Page page, List> models) throws Exception; + FetchResult parse(FetchRequest request,FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PluginManager.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PluginManager.java index d3647e015b3dafa270c17892883d9df8477e74bb..93632e06244362db7844f04e5c1a9ffdef3b35d2 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PluginManager.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PluginManager.java @@ -19,6 +19,7 @@ import org.eweb4j.spiderman.xml.Plugin; /** * 插件管理 * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-1-15 下午03:00:57 */ public class PluginManager { diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/Point.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/Point.java index 926f3f491f761ecee10742b7d61368205fb3cdc7..c82bbfa5688f3844025ffb919435e7798eea821b 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/Point.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/Point.java @@ -1,11 +1,11 @@ package org.eweb4j.spiderman.plugin; +import org.eweb4j.spiderman.container.Component; import org.eweb4j.spiderman.spider.SpiderListener; -import org.eweb4j.spiderman.xml.Site; public interface Point { - public void init(Site site, SpiderListener listener); + public void init(Component component, SpiderListener listener); public void destroy(); diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PojoPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PojoPoint.java index 
50dd653ee57ae07454eab1d61aac41b62e84abc7..096b993da1d0676aa0c016f460b4194f28a153db 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PojoPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/PojoPoint.java @@ -1,17 +1,16 @@ package org.eweb4j.spiderman.plugin; -import java.util.List; -import java.util.Map; - -import org.eweb4j.spiderman.task.Task; +import org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; /** * TODO * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-1-2 下午07:01:00 */ public interface PojoPoint extends Point{ - List mapping(Task task, Class mappingClass, List> models, List pojo); + FetchResult mapping(FetchRequest request,FetchResult result,Class mappingClass); } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TargetPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TargetPoint.java index 74319f0755626f96731e3496fa8acd14b4f07f76..640514d03fe962f5c7657f4a964f631826fe2dcb 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TargetPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TargetPoint.java @@ -1,12 +1,11 @@ package org.eweb4j.spiderman.plugin; -import org.eweb4j.spiderman.task.Task; -import org.eweb4j.spiderman.xml.Target; - +import org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; public interface TargetPoint extends Point{ // void context(Task task) throws Exception; - Target confirmTarget(Task task, Target target) throws Exception; + FetchResult confirmTarget(FetchRequest request,FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPollPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPollPoint.java index d232c6f28491fbdddbe4dc2a838acf8d9fb6daf0..c390d688fc1fba04dcd736b4c5c21bfbff876184 100644 --- 
a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPollPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPollPoint.java @@ -2,7 +2,6 @@ package org.eweb4j.spiderman.plugin; import org.eweb4j.spiderman.task.Task; - public interface TaskPollPoint extends Point{ Task pollTask() throws Exception; diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPushPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPushPoint.java index f120c17ed5d474489e7486538c21e6a708203cc8..3cff0114baf333200de6054a95e1078d17f53274 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPushPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskPushPoint.java @@ -1,12 +1,11 @@ package org.eweb4j.spiderman.plugin; -import java.util.Collection; - -import org.eweb4j.spiderman.task.Task; +import org.eweb4j.spiderman.fetcher.FetchRequest; +import org.eweb4j.spiderman.fetcher.FetchResult; public interface TaskPushPoint extends Point{ - public Collection pushTask(Collection tasks) throws Exception; + public FetchResult pushTask(FetchRequest request,FetchResult result) throws Exception; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskSortPoint.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskSortPoint.java index f711213a7dc1e9e7cfbdc0534ea98818a86c7489..631039f4b8a9cdff1d365a02812009d60f1240b7 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskSortPoint.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/plugin/TaskSortPoint.java @@ -1,12 +1,10 @@ package org.eweb4j.spiderman.plugin; -import java.util.Collection; - -import org.eweb4j.spiderman.task.Task; +import org.eweb4j.spiderman.fetcher.FetchResult; public interface TaskSortPoint extends Point{ - Collection sortTasks(Collection tasks) throws Exception; + FetchResult sortTasks(FetchResult result) throws Exception; } diff --git 
a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Settings.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Settings.java index f5cc2784ce5dea85717f0cbd308ab9c0422aa731..2439d8400b0d2f3d68d8512475d08ad4ff7fc4d4 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Settings.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Settings.java @@ -25,4 +25,13 @@ public class Settings { public static long http_fetch_timeout(){ return CommonUtil.toSeconds(settings.get("http.fetch.timeout")).longValue(); } + + public static String[] modules(){ + String modules = settings.get("modules"); + if(modules == null || "".equals(modules)) + { + return null; + } + return modules.split(","); + } } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spider.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spider.java index 2467f54e9dbdd84f4d311bf525a23a2c540976c8..3bdc251d6772ac0ba2091d5ff52822bea1894833 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spider.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spider.java @@ -2,12 +2,13 @@ package org.eweb4j.spiderman.spider; import java.util.ArrayList; import java.util.Collection; +import java.util.ConcurrentModificationException; import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Set; +import org.eweb4j.spiderman.fetcher.FetchRequest; import org.eweb4j.spiderman.fetcher.FetchResult; import org.eweb4j.spiderman.fetcher.Page; import org.eweb4j.spiderman.plugin.BeginPoint; @@ -26,261 +27,254 @@ import org.eweb4j.spiderman.url.SourceUrlChecker; import org.eweb4j.spiderman.xml.Field; import org.eweb4j.spiderman.xml.Rule; import org.eweb4j.spiderman.xml.Rules; -import org.eweb4j.spiderman.xml.Target; import org.eweb4j.util.CommonUtil; /** * 网络蜘蛛 * @author weiwei + * @author wchao * */ public class Spider implements Runnable{ - public Task task; + public 
FetchRequest request; + public FetchResult result; public SpiderListener listener; public void init(Task task, SpiderListener listener) { - this.task = task; + this.request = new FetchRequest(task); + this.result = new FetchResult(this.request); this.listener = listener; } public void run() { try { //扩展点:begin 蜘蛛开始 - Collection beginPoints = task.site.beginPointImpls; + Collection beginPoints = request.task.site.beginPointImpls; if (beginPoints != null && !beginPoints.isEmpty()){ for (Iterator it = beginPoints.iterator(); it.hasNext(); ){ BeginPoint point = it.next(); - task = point.confirmTask(task); + result = point.preProcess(request,result); } } - if (task == null) return ; - if (task.site.isStop) + if (result == null || request.task.site.isStop) return ; //扩展点:fetch 获取HTTP内容 - FetchResult result = null; - Collection fetchPoints = task.site.fetchPointImpls; + Collection fetchPoints = request.task.site.fetchPointImpls; if (fetchPoints != null && !fetchPoints.isEmpty()){ for (Iterator it = fetchPoints.iterator(); it.hasNext(); ){ FetchPoint point = it.next(); - result = point.fetch(task, result); + result = point.fetch(request, result); } } - listener.onFetch(Thread.currentThread(), task, result); + listener.onFetch(Thread.currentThread(), request, result); - if (result == null) + if (result == null || request.task.site.isStop) return ; - if (task.site.isStop) - return ; //扩展点:dig new url 发觉新URL - Collection newUrls = null; - Collection digPoints = task.site.digPointImpls; + Collection digPoints = request.task.site.digPointImpls; if (digPoints != null && !digPoints.isEmpty()){ for (Iterator it = digPoints.iterator(); it.hasNext(); ){ DigPoint point = it.next(); - newUrls = point.digNewUrls(result, task, newUrls); + point.digNewUrls(request,result); } } - if (task.site.isStop) + if (request.task.site.isStop) return ; - handleNewUrls(newUrls); + handleNewUrls(request,result); - if (task.site.isStop) + if (request.task.site.isStop) return ; Page page = 
result.getPage(); - if (page == null) { - return ; - } - if (task.site.isStop) + if (page == null || request.task.site.isStop) return ; //扩展点:target 确认是否有目标配置匹配当前URL - Target target = null; - Collection targetPoints = task.site.targetPointImpls; + Collection targetPoints = request.task.site.targetPointImpls; if (targetPoints != null && !targetPoints.isEmpty()){ for (Iterator it = targetPoints.iterator(); it.hasNext(); ){ TargetPoint point = it.next(); - target = point.confirmTarget(task, target); + point.confirmTarget(request, result); } } - if (target == null) { + if (result.getTarget() == null) { return ; } - task.target = target; - this.listener.onTargetPage(Thread.currentThread(), task, page); + this.listener.onTargetPage(Thread.currentThread(), request, page); - if (task.site.isStop) + if (request.task.site.isStop) return ; //检查sourceUrl - Rules rules = task.site.getTargets().getSourceRules(); - Rule sourceRule = SourceUrlChecker.checkSourceUrl(rules, task.sourceUrl); + Rules rules = request.task.site.getTargets().getSourceRules(); + Rule sourceRule = SourceUrlChecker.checkSourceUrl(rules, request.task.sourceUrl); if (sourceRule == null) { - listener.onInfo(Thread.currentThread(), task, "target url->"+task.url+"'s source url->"+task.sourceUrl+" is not match the SourceRules"); + listener.onInfo(Thread.currentThread(), request, "target url->"+request.task.url+"'s source url->"+request.task.sourceUrl+" is not match the SourceRules"); return ; } //扩展点:parse 把已确认好的目标页面解析成为Map对象 - List> models = null; - Collection parsePoints = task.site.parsePointImpls; + Collection parsePoints = request.task.site.parsePointImpls; if (parsePoints != null && !parsePoints.isEmpty()){ for (Iterator it = parsePoints.iterator(); it.hasNext(); ){ ParsePoint point = it.next(); - models = point.parse(task, target, page, models); + point.parse(request,result); } } - if (models == null) { + if (result.getModels() == null) { return ; } - for (Iterator> _it = models.iterator(); _it.hasNext(); 
){ + for (Iterator> _it = result.getModels().iterator(); _it.hasNext(); ){ Map model = _it.next(); - for (Iterator it = target.getModel().getField().iterator(); it.hasNext(); ){ - Field f = it.next(); - //去掉那些被定义成 参数 的field - if ("1".equals(f.getIsParam()) || "true".equals(f.getIsParam())) - model.remove(f.getName()); + if(result.getTarget().getModel()!=null){ + for (Iterator it = result.getTarget().getModel().getField().iterator(); it.hasNext(); ){ + Field f = it.next(); + //去掉那些被定义成 参数 的field + if ("1".equals(f.getIsParam()) || "true".equals(f.getIsParam())) + model.remove(f.getName()); + } } - model.put("source_url", task.sourceUrl); - model.put("task_url", task.url); + model.put("source_url", request.task.sourceUrl); + model.put("task_url", request.task.url); } // 统计任务完成数+1 - this.task.site.counter.plus(); - listener.onParse(Thread.currentThread(), task, models); - - if (task.digNewUrls != null && !task.digNewUrls.isEmpty()) { - Set urls = new HashSet(task.digNewUrls.size()); - for (String s : task.digNewUrls){ + this.request.task.site.counter.plus(); + listener.onParse(Thread.currentThread(), request,result.getModels()); + //parse解析时,挖掘到的新的资源url; + if (request.task.digNewUrls != null && !request.task.digNewUrls.isEmpty()) { + Set urls = new HashSet(request.task.digNewUrls.size()); + for (String s : request.task.digNewUrls){ if (s == null || s.trim().length() == 0) continue; - urls.add(s); } - if (!urls.isEmpty()) { - handleNewUrls(urls); - task.digNewUrls.clear(); - task.digNewUrls = null; + result.setNewUrls(urls);//设置新挖掘的url + handleNewUrls(request,result); + request.task.digNewUrls.clear(); + request.task.digNewUrls = null; } } - listener.onInfo(Thread.currentThread(), task, "site -> " + task.site.getName() + " task parse finished count ->" + task.site.counter.getCount()); + listener.onInfo(Thread.currentThread(), request, "site -> " + request.task.site.getName() + " task parse finished count ->" + request.task.site.counter.getCount()); - if 
(task.site.isStop) + if (request.task.site.isStop) return ; //扩展点:pojo 将Map数据映射为POJO - String modelCls = target.getModel().getClazz(); + String modelCls = result.getTarget().getModel()==null?null:result.getTarget().getModel().getClazz(); Class cls = null; if (modelCls != null) cls = Thread.currentThread().getContextClassLoader().loadClass(modelCls); - List pojos = null; - Collection pojoPoints = task.site.pojoPointImpls; + Collection pojoPoints = request.task.site.pojoPointImpls; if (pojoPoints != null && !pojoPoints.isEmpty()){ for (Iterator it = pojoPoints.iterator(); it.hasNext(); ){ PojoPoint point = it.next(); - pojos = point.mapping(task, cls, models, pojos); + point.mapping(request, result,cls); } } - if (pojos != null) - listener.onPojo(Thread.currentThread(), task, pojos); + if (result.getPojos()!= null) + listener.onPojo(Thread.currentThread(),request,result.getPojos()); - if (task.site.isStop) + if (request.task.site.isStop) return ; //扩展点:end 蜘蛛完成工作,该收尾了 - Collection endPoints = task.site.endPointImpls; + Collection endPoints =request.task.site.endPointImpls; if (endPoints != null && !endPoints.isEmpty()){ for (Iterator it = endPoints.iterator(); it.hasNext(); ){ EndPoint point = it.next(); - models = point.complete(task, models); + point.complete(request, result); } } - } catch (DoneException e){ + }catch(ConcurrentModificationException e) + { + //this.listener.onError(Thread.currentThread(),request.task,e.getMessage(), e); + }catch (DoneException e){ if (this.listener != null) - this.listener.onInfo(Thread.currentThread(), task, "Spiderman has shutdown already..."); + this.listener.onInfo(Thread.currentThread(), request, "Spiderman has shutdown already..."); } catch(Throwable e){ if (this.listener != null) - this.listener.onError(Thread.currentThread(), task, CommonUtil.getExceptionString(e), e); + this.listener.onError(Thread.currentThread(), request.task, CommonUtil.getExceptionString(e), e); } } - private void handleNewUrls(Collection newUrls) 
throws Exception { - if (newUrls != null && !newUrls.isEmpty()) - this.listener.onNewUrls(Thread.currentThread(), task, newUrls); + private void handleNewUrls(FetchRequest request,FetchResult result) throws Exception { + if (result.getNewUrls() != null && !result.getNewUrls().isEmpty()) + this.listener.onNewUrls(Thread.currentThread(), request, result.getNewUrls()); else - newUrls = new ArrayList(); + result.setNewUrls(new ArrayList()); - if (task.site.isStop) + if (request.task.site.isStop) return ; //扩展点:dup_removal URL去重,然后变成Task - Collection validTasks = null; - Collection dupRemovalPoints = task.site.dupRemovalPointImpls; + Collection dupRemovalPoints = request.task.site.dupRemovalPointImpls; if (dupRemovalPoints != null && !dupRemovalPoints.isEmpty()){ for (Iterator it = dupRemovalPoints.iterator(); it.hasNext(); ){ DupRemovalPoint point = it.next(); - validTasks = point.removeDuplicateTask(task, newUrls, validTasks); + point.removeDuplicateTask(request,result); } } - if (newUrls != null && !newUrls.isEmpty()) - this.listener.onDupRemoval(Thread.currentThread(), task, validTasks); + if (result.getNewUrls() != null && !result.getNewUrls().isEmpty()) + this.listener.onDupRemoval(Thread.currentThread(), request, result.getValidTasks()); - if (validTasks == null) - validTasks = new ArrayList(); + if (result.getValidTasks() == null) + result.setValidTasks(new ArrayList()); - if (task.site.isStop) + if (request.task.site.isStop) return ; //扩展点:task_sort 给任务排序 - Collection taskSortPoints = task.site.taskSortPointImpls; + Collection taskSortPoints = request.task.site.taskSortPointImpls; if (taskSortPoints != null && !taskSortPoints.isEmpty()){ for (Iterator it = taskSortPoints.iterator(); it.hasNext(); ){ TaskSortPoint point = it.next(); - validTasks = point.sortTasks(validTasks); + point.sortTasks(result); } } - this.listener.onTaskSort(Thread.currentThread(), task, validTasks); + this.listener.onTaskSort(Thread.currentThread(), request, result.getValidTasks()); - 
if (validTasks == null) - validTasks = new ArrayList(); + if (result.getValidTasks() == null) + result.setValidTasks(new ArrayList()); - if (task.site.isStop) + if (request.task.site.isStop) return ; //扩展点:task_push 将任务放入队列 - validTasks = pushTask(validTasks); - if (validTasks != null && !validTasks.isEmpty()) - this.listener.onNewTasks(Thread.currentThread(), task, validTasks); + result = pushTask(request,result); + if (result.getValidTasks() != null && !result.getValidTasks().isEmpty()) + this.listener.onNewTasks(Thread.currentThread(), request, result.getValidTasks()); } - public Collection pushTask(Collection validTasks) throws Exception { - Collection taskPushPoints = task.site.taskPushPointImpls; + public FetchResult pushTask(FetchRequest request,FetchResult result) throws Exception { + Collection taskPushPoints = request.task.site.taskPushPointImpls; if (taskPushPoints != null && !taskPushPoints.isEmpty()){ for (Iterator it = taskPushPoints.iterator(); it.hasNext(); ){ TaskPushPoint point = it.next(); - validTasks = point.pushTask(validTasks); + point.pushTask(request,result); } } - return validTasks; + return result; } } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListener.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListener.java index c0ba42a48fd7e35874e793c52acbe7d091e6c39b..c2b8715be5f2626e16a4d3d321fc6678c09d9f0d 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListener.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListener.java @@ -5,44 +5,43 @@ import java.util.Date; import java.util.List; import java.util.Map; +import org.eweb4j.spiderman.container.Component; +import org.eweb4j.spiderman.fetcher.FetchRequest; import org.eweb4j.spiderman.fetcher.FetchResult; import org.eweb4j.spiderman.fetcher.Page; import org.eweb4j.spiderman.task.Task; -import org.eweb4j.spiderman.xml.Site; - - public interface SpiderListener { - void onDigUrls(Thread 
thread, Task task, String fieldName, Collection urls); + void onDigUrls(Thread thread, Task task, String fieldName, Collection urls); - void onNewUrls(Thread thread, Task task, Collection newUrls); + void onNewUrls(Thread thread, FetchRequest request, Collection newUrls); - void onFetch(Thread thread, Task task, FetchResult result); + void onFetch(Thread thread, FetchRequest request, FetchResult result); - void onDupRemoval(Thread currentThread, Task task, Collection validTasks); + void onDupRemoval(Thread currentThread, FetchRequest request, Collection validTasks); - void onTaskSort(Thread currentThread, Task task, Collection afterSortTasks); + void onTaskSort(Thread currentThread, FetchRequest request, Collection afterSortTasks); - void onNewTasks(Thread thread, Task task, Collection newTasks); + void onNewTasks(Thread thread, FetchRequest request, Collection newTasks); - void onTargetPage(Thread thread, Task task, Page page); + void onTargetPage(Thread thread, FetchRequest request, Page page); - void onParseField(Thread thread, Task task, Object selector, String field, Object value); + void onParseField(Thread thread, FetchRequest request, Object selector, String field, Object value); - void onParseOne(Thread thread, Task task, int size, int index, Map model); + void onParseOne(Thread thread, FetchRequest request, int size, int index, Map model); - void onParse(Thread thread, Task task, List> models); + void onParse(Thread thread, FetchRequest request, List> models); - void onPojo(Thread thread, Task task, List pojos); + void onPojo(Thread thread, FetchRequest request, List pojos); - void onInfo(Thread thread, Task task, String info); + void onInfo(Thread thread, FetchRequest request, String info); - void onStartup(Site site); + void onStartup(Component component); void onError(Thread thread, Task task, String err, Throwable e); - void onInitError(Site site, String err, Throwable e); + void onInitError(Component componen, String err, Throwable e); /** * 
调度结束后回调此方法 @@ -75,11 +74,11 @@ public interface SpiderListener { * Spiderman.shutdown()被调用之前回调此方法 * @date 2013-6-3 下午05:00:43 */ - void onBeforeShutdown(Site site, Object... args); + void onBeforeShutdown(Component component, Object... args); /** * Spiderman.shutdown()被调用之后回调此方法 * @date 2013-6-3 下午05:01:02 */ - void onAfterShutdown(Site site, Object... args); + void onAfterShutdown(Component componen, Object... args); } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListenerAdaptor.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListenerAdaptor.java index 37f0b00f92623285177c530d7505741cf964b149..a14997b8584e74aa59469505f4ec00ffed1dbe45 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListenerAdaptor.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/SpiderListenerAdaptor.java @@ -5,36 +5,38 @@ import java.util.Date; import java.util.List; import java.util.Map; +import org.eweb4j.spiderman.container.Component; +import org.eweb4j.spiderman.fetcher.FetchRequest; import org.eweb4j.spiderman.fetcher.FetchResult; import org.eweb4j.spiderman.fetcher.Page; import org.eweb4j.spiderman.task.Task; -import org.eweb4j.spiderman.xml.Site; /** * 爬虫监听适配器 * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-1-7 上午11:39:57 */ public class SpiderListenerAdaptor implements SpiderListener{ - public void onDigUrls(Thread thread, Task task, String fieldName, Collection urls) {} - public void onFetch(Thread thread, Task task, FetchResult result) {} - public void onNewUrls(Thread thread, Task task, Collection newUrls) {} - public void onDupRemoval(Thread currentThread, Task task, Collection validTasks) {} - public void onTaskSort(Thread currentThread, Task task, Collection afterSortTasks) {} - public void onNewTasks(Thread thread, Task task, Collection newTasks) {} - public void onTargetPage(Thread thread, Task task, Page page) {} - public void onParse(Thread 
thread, Task task, List> models) {} - public void onPojo(Thread thread, Task task, List pojos) {} - public void onInfo(Thread thread, Task task, String info) {} - public void onStartup(Site site) {} + public void onDigUrls(Thread thread, Task task, String fieldName, Collection urls) {} + public void onFetch(Thread thread, FetchRequest request, FetchResult result) {} + public void onNewUrls(Thread thread, FetchRequest request, Collection newUrls) {} + public void onDupRemoval(Thread currentThread, FetchRequest request, Collection validTasks) {} + public void onTaskSort(Thread currentThread, FetchRequest request, Collection afterSortTasks) {} + public void onNewTasks(Thread thread, FetchRequest request, Collection newTasks) {} + public void onTargetPage(Thread thread, FetchRequest request, Page page) {} + public void onParse(Thread thread, FetchRequest request, List> models) {} + public void onPojo(Thread thread, FetchRequest request, List pojos) {} + public void onInfo(Thread thread, FetchRequest request, String info) {} + public void onStartup(Component component) {} public void onError(Thread thread, Task task, String err, Throwable e) {e.printStackTrace();} - public void onInitError(Site site, String err, Throwable e){e.printStackTrace();} + public void onInitError(Component component, String err, Throwable e){e.printStackTrace();} public void onAfterScheduleCancel() {} public void onBeforeEveryScheduleExecute(Date theLastTimeScheduledAt){} public void onBeforeShutdown(Object... args) {} public void onAfterShutdown(Object... args) {} - public void onBeforeShutdown(Site site, Object... args) {} - public void onAfterShutdown(Site site, Object... args) {} - public void onParseField(Thread thread, Task task, Object selector, String field, Object value) {} - public void onParseOne(Thread thread, Task task, int size, int index, Map model) {} + public void onBeforeShutdown(Component component, Object... 
args) {} + public void onAfterShutdown(Component component, Object... args) {} + public void onParseField(Thread thread, FetchRequest request, Object selector, String field, Object value) {} + public void onParseOne(Thread thread, FetchRequest request, int size, int index, Map model) {} } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spiderman.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spiderman.java index 05d597169af13c379821c02c1ea830201731af25..f6f9159960f5575ceec186d3bab2c94d653969c2 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spiderman.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/spider/Spiderman.java @@ -1,61 +1,39 @@ +/** + * + */ package org.eweb4j.spiderman.spider; import java.io.File; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; import java.util.Date; -import java.util.Iterator; import java.util.List; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.RejectedExecutionHandler; -import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import org.eweb4j.spiderman.container.Container; +import org.eweb4j.spiderman.container.ContainerManager; import org.eweb4j.spiderman.infra.SpiderIOC; import org.eweb4j.spiderman.infra.SpiderIOCs; -import org.eweb4j.spiderman.plugin.BeginPoint; -import org.eweb4j.spiderman.plugin.DigPoint; -import org.eweb4j.spiderman.plugin.DoneException; -import org.eweb4j.spiderman.plugin.DupRemovalPoint; -import org.eweb4j.spiderman.plugin.EndPoint; -import org.eweb4j.spiderman.plugin.ExtensionPoint; -import org.eweb4j.spiderman.plugin.ExtensionPoints; -import org.eweb4j.spiderman.plugin.FetchPoint; -import org.eweb4j.spiderman.plugin.ParsePoint; -import org.eweb4j.spiderman.plugin.PluginManager; -import 
org.eweb4j.spiderman.plugin.Point; -import org.eweb4j.spiderman.plugin.PojoPoint; -import org.eweb4j.spiderman.plugin.TargetPoint; -import org.eweb4j.spiderman.plugin.TaskPollPoint; -import org.eweb4j.spiderman.plugin.TaskPushPoint; -import org.eweb4j.spiderman.plugin.TaskSortPoint; -import org.eweb4j.spiderman.task.Task; -import org.eweb4j.spiderman.task.TaskQueue; -import org.eweb4j.spiderman.xml.Plugin; -import org.eweb4j.spiderman.xml.Plugins; -import org.eweb4j.spiderman.xml.Seed; -import org.eweb4j.spiderman.xml.Seeds; -import org.eweb4j.spiderman.xml.Site; -import org.eweb4j.spiderman.xml.Target; import org.eweb4j.util.CommonUtil; import org.eweb4j.util.xml.BeanXMLUtil; import org.eweb4j.util.xml.XMLReader; import org.eweb4j.util.xml.XMLWriter; - -public class Spiderman { - +/** + * @author yangc + * + */ +public class Spiderman{ + public final SpiderIOC ioc = SpiderIOCs.create(); + private ContainerManager cm = null; public Boolean isShutdownNow = false; - private ExecutorService pool = null; - private Collection sites = null; private SpiderListener listener = null; - + private ExecutorService pool = null; private boolean isSchedule = false; private Timer timer = new Timer(); private String scheduleTime = "1h"; @@ -67,7 +45,6 @@ public class Spiderman { public final static Spiderman me() { return new Spiderman(); } - /** * @date 2013-1-17 下午01:43:52 * @param listener @@ -81,22 +58,18 @@ public class Spiderman { if (this.listener == null) this.listener = new SpiderListenerAdaptor(); isShutdownNow = false; - sites = null; - pool = null; try { if (file == null) loadConfigFiles(); else loadConfigFile(file); - initSites(); - initPool(); + initContainers(); + initPool(); } catch (Throwable e){ - e.printStackTrace(); this.listener.onError(Thread.currentThread(), null, e.toString(), e); } return this; } - public Spiderman init(){ File file = null; return this.init(file); @@ -136,7 +109,7 @@ public class Spiderman { _this.isSchedule = false; } else { - 
//阻塞,判断之前所有的网站是否都已经停止完全 + //阻塞,判断之前所有的容器是否都已经停止完全 //加个超时 long start = System.currentTimeMillis(); long timeout = 1*60*1000; @@ -144,10 +117,10 @@ public class Spiderman { try { if ((System.currentTimeMillis() - start) > timeout){ _this.listener.onError(Thread.currentThread(), null, "timeout of restart blocking check...", new Exception()); - for (Site site : _this.sites) { - if (!site.isStop){ + for (Container container : _this.cm.getContainers()) { + if (!container.isStop){ try { - site.destroy(_this.listener, _this.isShutdownNow); + container.destroy(_this.listener, _this.isShutdownNow); } catch (Throwable e){ e.printStackTrace(); _this.listener.onError(Thread.currentThread(), null, e.toString(), e); @@ -156,18 +129,17 @@ public class Spiderman { } break; } - if (_this.sites == null || _this.sites.isEmpty()) + if (_this.cm.getContainers() == null || _this.cm.getContainers().isEmpty()) break; Thread.sleep(1*1000); boolean canBreak = true; - for (Site site : _this.sites) { - if (!site.isStop){ + for (Container container : _this.cm.getContainers()) { + if (!container.isStop){ canBreak = false; - _this.listener.onInfo(Thread.currentThread(), null, "can not restart spiderman cause there has running-tasks of this site -> "+site.getName()+"..."); + _this.listener.onInfo(Thread.currentThread(), null, "can not restart Spiderman cause there has running-components of this container -> "+container.getId()+"..."); } } - if (canBreak) break; } catch (Exception e) { @@ -177,7 +149,7 @@ public class Spiderman { } try { - //只有所有的网站资源都已被释放[特殊情况timeout]完全才重启Spiderman + //只有所有的容器资源都已被释放[特殊情况timeout]完全才重启Spiderman _this.scheduleTimes++; String strTimes = _this.scheduleTimes + ""; if (_this.maxScheduleTimes > 0) @@ -201,19 +173,15 @@ public class Spiderman { return this; } - return _startup(); } private Spiderman _startup(){ - for (Site site : sites){ - pool.execute(new Spiderman._Executor(site)); - listener.onInfo(Thread.currentThread(), null, "spider tasks of site[" + site.getName() 
+ "] start... "); - listener.onStartup(site); + for (Container container : cm.getContainers()){ + container.startup(); } return this; } - public void shutdown(){ shutdown(false); } @@ -228,10 +196,9 @@ public class Spiderman { //此处添加一个监听回调 listener.onBeforeShutdown(); } - if (sites != null) { - for (Site site : sites){ - site.destroy(listener, false); - listener.onInfo(Thread.currentThread(), null, "Site[" + site.getName() + "] destroy... "); + if (cm.getContainers() != null) { + for (Container container : cm.getContainers()){ + container.destroy(listener, false); } } if (pool != null) { @@ -264,11 +231,9 @@ public class Spiderman { listener.onError(Thread.currentThread(), null, e.toString(), e); } } - if (sites != null) { - for (Site site : sites){ - site.destroy(listener, true); - listener.onInfo(Thread.currentThread(), null, "Site[" + site.getName() + "] destroy... "); - listener.onAfterShutdown(site, args); + if (cm.getContainers() != null) { + for (Container container : cm.getContainers()){ + container.destroy(listener, true,args); } } @@ -360,30 +325,26 @@ public class Spiderman { } private void loadConfigFiles() throws Exception{ - File siteFolder = new File(Settings.website_xml_folder()); - if (!siteFolder.exists()) - throw new Exception("can not found WebSites folder -> " + siteFolder.getAbsolutePath()); + File containerFolder = new File(Settings.website_xml_folder()); + if (!containerFolder.exists()) + throw new Exception("can not found Cointainers folder -> " + containerFolder.getAbsolutePath()); - if (!siteFolder.isDirectory()) - throw new Exception("WebSites -> " + siteFolder.getAbsolutePath() + " must be folder !"); + if (!containerFolder.isDirectory()) + throw new Exception("WebSites -> " + containerFolder.getAbsolutePath() + " must be folder !"); - File[] files = siteFolder.listFiles(); + File[] files = containerFolder.listFiles(); if (files == null || files.length == 0){ - //generate a site.xml file - File file = new 
File(siteFolder.getAbsoluteFile()+File.separator+"_site_sample_.xml"); - Site site = new Site(); - - Plugins plugins = new Plugins(); - plugins.getPlugin().add(PluginManager.createPlugin()); - site.setPlugins(plugins); - - XMLWriter writer = BeanXMLUtil.getBeanXMLWriter(file, site); - writer.setBeanName("site"); - writer.setClass("site", Site.class); + //generate a container.xml file + File file = new File(containerFolder.getAbsoluteFile()+File.separator+"_container_sample_.xml"); + Container container = new Container("default"); + XMLWriter writer = BeanXMLUtil.getBeanXMLWriter(file, container); + writer.setBeanName("container"); + writer.setClass("container",Container.class); writer.write(); } - sites = new ArrayList(files.length); + cm = ContainerManager.me(); + for (File file : files){ if (!file.exists()) continue; @@ -395,6 +356,7 @@ public class Spiderman { } } + @SuppressWarnings("unused") public void loadConfigFile(File file) throws Exception { if (!file.exists()) return; @@ -403,271 +365,36 @@ public class Spiderman { if (!file.getName().endsWith(".xml")) return; XMLReader reader = BeanXMLUtil.getBeanXMLReader(file); - reader.setBeanName("site"); - reader.setClass("site", Site.class); - Site site = reader.readOne(); - if (site == null) - throw new Exception("site xml file error -> " + file.getAbsolutePath()); + reader.setBeanName("container"); + reader.setClass("container", Container.class); + + Container container = reader.readOne(); + reader.setRootElementName("container"); + container.setReader(reader); + if (container == null) + throw new Exception("container xml file error -> " + file.getAbsolutePath()); - if (!"1".equals(site.getEnable())) { -// String err = file.getAbsolutePath() + "'s site.enable != 1" ; -// this.listener.onInitError(site, err, new Exception(err)); + if (!"1".equals(container.getEnable())) { return; -// throw new Exception(err); } - sites = new ArrayList(); - sites.add(site); - } - - private void initSites() throws Exception{ - for 
(Site site : sites){ - if (site.getName() == null || site.getName().trim().length() == 0) - throw new Exception("site name required"); - if (site.getUrl() == null || site.getUrl().trim().length() == 0) - throw new Exception("site url required"); - if (site.getTargets() == null || site.getTargets().getTarget().isEmpty()) - throw new Exception("site target required"); - - List targets = site.getTargets().getTarget(); - if (targets == null || targets.isEmpty()) - throw new Exception("can not get any url target of site -> " + site.getName()); - - //---------------------插件初始化开始---------------------------- - listener.onInfo(Thread.currentThread(), null, "plugins loading begin..."); - Collection plugins = site.getPlugins().getPlugin(); - //加载网站插件配置 - try { - PluginManager pluginMgr = new PluginManager(); - pluginMgr.loadPluginConf(plugins, listener); - - //加载TaskPoll扩展点实现类 - ExtensionPoint taskPollPoint = pluginMgr.getExtensionPoint(ExtensionPoints.task_poll); - if (taskPollPoint != null) { - site.taskPollPointImpls = taskPollPoint.getExtensions(); - firstInitPoint(site.taskPollPointImpls, site, listener); - } - - //加载Begin扩展点实现类 - ExtensionPoint beginPoint = pluginMgr.getExtensionPoint(ExtensionPoints.begin); - if (beginPoint != null){ - site.beginPointImpls = beginPoint.getExtensions(); - firstInitPoint(site.beginPointImpls, site, listener); - } - - //加载Fetch扩展点实现类 - ExtensionPoint fetchPoint = pluginMgr.getExtensionPoint(ExtensionPoints.fetch); - if (fetchPoint != null){ - site.fetchPointImpls = fetchPoint.getExtensions(); - firstInitPoint(site.fetchPointImpls, site, listener); - } - - //加载Dig扩展点实现类 - ExtensionPoint digPoint = pluginMgr.getExtensionPoint(ExtensionPoints.dig); - if (digPoint != null){ - site.digPointImpls = digPoint.getExtensions(); - firstInitPoint(site.digPointImpls, site, listener); - } - - //加载DupRemoval扩展点实现类 - ExtensionPoint dupRemovalPoint = pluginMgr.getExtensionPoint(ExtensionPoints.dup_removal); - if (dupRemovalPoint != null){ - 
site.dupRemovalPointImpls = dupRemovalPoint.getExtensions(); - firstInitPoint(site.dupRemovalPointImpls, site, listener); - } - //加载TaskSort扩展点实现类 - ExtensionPoint taskSortPoint = pluginMgr.getExtensionPoint(ExtensionPoints.task_sort); - if (taskSortPoint != null){ - site.taskSortPointImpls = taskSortPoint.getExtensions(); - firstInitPoint(site.taskSortPointImpls, site, listener); - } - - //加载TaskPush扩展点实现类 - ExtensionPoint taskPushPoint = pluginMgr.getExtensionPoint(ExtensionPoints.task_push); - if (taskPushPoint != null){ - site.taskPushPointImpls = taskPushPoint.getExtensions(); - firstInitPoint(site.taskPushPointImpls, site, listener); - } - - //加载Target扩展点实现类 - ExtensionPoint targetPoint = pluginMgr.getExtensionPoint(ExtensionPoints.target); - if (targetPoint != null){ - site.targetPointImpls = targetPoint.getExtensions(); - firstInitPoint(site.targetPointImpls, site, listener); - } - - //加载Parse扩展点实现类 - ExtensionPoint parsePoint = pluginMgr.getExtensionPoint(ExtensionPoints.parse); - if (parsePoint != null){ - site.parsePointImpls = parsePoint.getExtensions(); - firstInitPoint(site.parsePointImpls, site, listener); - } - - //加载Pojo扩展点实现类 - ExtensionPoint pojoPoint = pluginMgr.getExtensionPoint(ExtensionPoints.pojo); - if (pojoPoint != null){ - site.pojoPointImpls = pojoPoint.getExtensions(); - firstInitPoint(site.pojoPointImpls, site, listener); - } - - //加载End扩展点实现类 - ExtensionPoint endPoint = pluginMgr.getExtensionPoint(ExtensionPoints.end); - if (endPoint != null){ - site.endPointImpls = endPoint.getExtensions(); - firstInitPoint(site.endPointImpls, site, listener); - } - //---------------------------插件初始化完毕---------------------------------- - } catch(Exception e){ - throw new Exception("Site["+site.getName()+"] loading plugins fail", e); - } - - //初始化网站的队列容器 - site.queue = new TaskQueue(); - site.queue.init(); - //初始化网站目标Model计数器 - site.counter = new Counter(); - } + if(cm == null) + cm = ContainerManager.me(); + cm.add(container); } - private void 
firstInitPoint(Collection points, Site site, SpiderListener listener){ - for (Point point : points){ - point.init(site, listener); + private void initContainers() throws Exception{ + for (Container container : cm.getContainers()){ + container.init(listener); } } private void initPool(){ if (pool == null){ - int size = sites.size(); + int size = cm.getContainers().size(); if (size == 0) - throw new RuntimeException("there is no website to fetch..."); - pool = new ThreadPoolExecutor(size, size, - 60L, TimeUnit.SECONDS, - new LinkedBlockingQueue()); - + throw new RuntimeException("there is no container to load..."); + pool = new ThreadPoolExecutor(size, size, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue()); listener.onInfo(Thread.currentThread(), null, "init thread pool size->"+size+" success "); } } - - private class _Executor implements Runnable{ - private Site site = null; - - public _Executor(Site site){ - this.site = site; - String strSize = site.getThread(); - int size = Integer.parseInt(strSize); - listener.onInfo(Thread.currentThread(), null, "site thread size -> " + size); - RejectedExecutionHandler rejectedHandler = new RejectedExecutionHandler() { - public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) { - //拿到被弹出来的爬虫引用 - Spider spider = (Spider)r; - try { - //将该爬虫的任务 task 放回队列 - spider.pushTask(Arrays.asList(spider.task)); - String info = "repush the task->"+spider.task+" to the Queue."; - spider.listener.onError(Thread.currentThread(), spider.task, info, new Exception(info)); - } catch (Exception e) { - String err = "could not repush the task to the Queue. 
cause -> " + e.toString(); - spider.listener.onError(Thread.currentThread(), spider.task, err, e); - } - } - }; - - if (size > 0) - this.site.pool = new ThreadPoolExecutor(size, size, - 60L, TimeUnit.SECONDS, - new LinkedBlockingQueue(), - rejectedHandler); - else - this.site.pool = new ThreadPoolExecutor(0, Integer.MAX_VALUE, - 60L, TimeUnit.SECONDS, - new SynchronousQueue(), - rejectedHandler); - } - - public void run() { - if (site.isStop) - return ; - - // 获取种子url - Seeds seeds = site.getSeeds(); - Collection seedTasks = new ArrayList(); - if (seeds == null || seeds.getSeed() == null || seeds.getSeed().isEmpty()) { - seedTasks.add(new Task(this.site.getUrl(), this.site.getHttpMethod(), null, this.site, 10)); - }else{ - for (Iterator it = seeds.getSeed().iterator(); it.hasNext(); ){ - Seed s = it.next(); - seedTasks.add(new Task(s.getUrl(), s.getHttpMethod(), null, this.site, 10)); - } - } - - // 运行种子任务 - for (Iterator it = seedTasks.iterator(); it.hasNext(); ) { - Task seedTask = it.next(); - Spider seedSpider = new Spider(); - seedSpider.init(seedTask, listener); -// this.site.pool.execute(seedSpider); - seedSpider.run(); - } - -// final float times = CommonUtil.toSeconds(this.site.getSchedule()) * 1000; -// long start = System.currentTimeMillis(); - while(true){ - if (site.isStop) - break; - - try { - //扩展点:TaskPoll - Task task = null; - Collection taskPollPoints = site.taskPollPointImpls; - if (taskPollPoints != null && !taskPollPoints.isEmpty()){ - for (Iterator it = taskPollPoints.iterator(); it.hasNext(); ){ - TaskPollPoint point = it.next(); - task = point.pollTask(); - } - } - - if (task == null){ - long wait = CommonUtil.toSeconds(site.getWaitQueue()).longValue(); -// listener.onInfo(Thread.currentThread(), null, "queue empty wait for -> " + wait + " seconds"); - if (wait > 0) { - try { - Thread.sleep(wait * 1000); - } catch (Exception e){ - - } - } - continue; - } - - Spider spider = new Spider(); - spider.init(task, listener); - - 
this.site.pool.execute(spider); - }catch (DoneException e) { - listener.onInfo(Thread.currentThread(), null, e.toString()); - return ; - } catch (Exception e) { - listener.onError(Thread.currentThread(), null, e.toString(), e); - }finally{ - if (site.isStop) - break; - if (site.pool == null) - break; - -// long cost = System.currentTimeMillis() - start; -// if (cost >= times){ -//// 运行种子任务 -// for (Iterator it = seedTasks.iterator(); it.hasNext(); ) { -// Task seedTask = it.next(); -// Spider seedSpider = new Spider(); -// seedSpider.init(seedTask, listener); -// seedSpider.run(); -// } -// listener.onInfo(Thread.currentThread(), null, " shcedule FeedSpider of Site->"+site.getName()+" per "+times+", now cost time ->"+cost); -// start = System.currentTimeMillis();//重新计时 -// } - } - } - } - } - } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/task/Task.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/task/Task.java index 04b814242ebd2dfb5f9c75e11372e9acc9551187..f8d213b78afbdfd2affcbfd90ecafa303a44d5ec 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/task/Task.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/task/Task.java @@ -4,10 +4,8 @@ import java.util.ArrayList; import java.util.List; import org.eweb4j.spiderman.fetcher.Page; -import org.eweb4j.spiderman.xml.Site; import org.eweb4j.spiderman.xml.Target; - - +import org.eweb4j.spiderman.xml.site.Site; public class Task { @@ -19,7 +17,15 @@ public class Task { this.sort = sort; this.httpMethod = httpMethod; } - + public Task(String url, String httpMethod, String sourceUrl, Object type,Site site, int sort) { + super(); + this.url = url; + this.sourceUrl = sourceUrl; + this.site = site; + this.sort = sort; + this.httpMethod = httpMethod; + this.type = type; + } public Task(Site site) { this.site = site; } @@ -32,9 +38,62 @@ public class Task { public String sourceUrl;//task.url的来源 public List digNewUrls = new ArrayList(); public String httpMethod; -// public List
headers = new ArrayList
(); -// public List cookies = new ArrayList(); + public Object type;//任务类型; + public Site getSite() { + return site; + } + public void setSite(Site site) { + this.site = site; + } + public Target getTarget() { + return target; + } + public void setTarget(Target target) { + this.target = target; + } + public Page getPage() { + return page; + } + public void setPage(Page page) { + this.page = page; + } + public double getSort() { + return sort; + } + public void setSort(double sort) { + this.sort = sort; + } + public String getUrl() { + return url; + } + public void setUrl(String url) { + this.url = url; + } + public String getSourceUrl() { + return sourceUrl; + } + public void setSourceUrl(String sourceUrl) { + this.sourceUrl = sourceUrl; + } + public List getDigNewUrls() { + return digNewUrls; + } + public void setDigNewUrls(List digNewUrls) { + this.digNewUrls = digNewUrls; + } + public String getHttpMethod() { + return httpMethod; + } + public void setHttpMethod(String httpMethod) { + this.httpMethod = httpMethod; + } + public Object getType() { + return type; + } + public void setType(Object type) { + this.type = type; + } public String toString() { return "Task [site=" + site.getName() + ", sort=" + sort + ", url=" + url + ", sourceUrl=" + sourceUrl + "]"; } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/task/TaskQueue.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/task/TaskQueue.java index bfe80bff34342c4609f97481e216265134658d96..5331402d293c47c358a7ab5ef8f582f2f8791eef 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/task/TaskQueue.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/task/TaskQueue.java @@ -10,6 +10,7 @@ import org.eweb4j.spiderman.xml.Rules; /** * 任务队列,阻塞+优先级排序 * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-1-15 上午10:53:24 */ public class TaskQueue { @@ -51,10 +52,12 @@ public class TaskQueue { //检查是否匹配xml配置的url规则 Rules rules = task.site.getQueueRules(); - 
Rule queueRule = UrlRuleChecker.check(task.url, rules.getRule(), rules.getPolicy()); - if (queueRule == null) - return false; - + if(rules != null) + { + Rule queueRule = UrlRuleChecker.check(task.url, rules.getRule(), rules.getPolicy()); + if (queueRule == null) + return false; + } return queue.add(task); } @@ -62,4 +65,9 @@ public class TaskQueue { this.queue.clear(); isStop = true; } + + public int size() + { + return this.queue.size(); + } } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/url/SourceUrlChecker.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/url/SourceUrlChecker.java index ab1d439a9d09f5b3611570708838c271e073cdae..1fa4ffdbe4c7bccca7b68062195c065d83757b23 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/url/SourceUrlChecker.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/url/SourceUrlChecker.java @@ -9,6 +9,7 @@ import org.eweb4j.spiderman.xml.Rules; /** * TODO * @author weiwei l.weiwei@163.com + * @author wchao wchaojava@163.com * @date 2013-2-28 下午08:34:54 */ public class SourceUrlChecker { diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Field.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Field.java index 218de4a3be14fb3c75fd3865fea6185e0a64a703..0dde63399c43b6895f023ed336ec2c92274e32cc 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Field.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Field.java @@ -2,6 +2,7 @@ package org.eweb4j.spiderman.xml; import org.eweb4j.util.xml.AttrTag; + public class Field { @AttrTag @@ -36,9 +37,16 @@ public class Field { */ @AttrTag private String isAlsoParseInNextPage; - + /** + * 是否去掉前后的空格字符 + */ + @AttrTag + private String isTrim; + /** + * 是否作为数字类型引用; + */ @AttrTag - private String isTrim;//是否去掉前后的空格字符 + private String isNumber; private Parsers parsers; @@ -114,4 +122,11 @@ public class Field { this.isForDigNewUrl = isForDigNewUrl; } + public String getIsNumber() { + return isNumber; + } + + 
public void setIsNumber(String isNumber) { + this.isNumber = isNumber; + } } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Model.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Model.java index 97ed9595cf4e1b6b3d9c2445a3bed56f8d4ae7b2..225c7ae0ba6c7537ea015091858ee5ce572343aa 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Model.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Model.java @@ -45,6 +45,9 @@ public class Model { @AttrTag private String delay; + @AttrTag + private String maxPage;//针对分页做的最大页数; + private List field = new ArrayList(); public String getClazz() { @@ -159,5 +162,21 @@ public class Model { public void setDelay(String delay) { this.delay = delay; } + + public String getcType() { + return cType; + } + + public void setcType(String cType) { + this.cType = cType; + } + + public String getMaxPage() { + return maxPage; + } + + public void setMaxPage(String maxPage) { + this.maxPage = maxPage; + } } diff --git a/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Options.java b/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Options.java index 8d355c5106fa71b742f1a72a8cda4c4ad311176c..ed4295ffad5e984e6c0a206b7502b5331b7193ec 100644 --- a/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Options.java +++ b/spiderman-core/src/main/java/org/eweb4j/spiderman/xml/Options.java @@ -3,13 +3,18 @@ package org.eweb4j.spiderman.xml; import java.util.ArrayList; import java.util.List; +import org.eweb4j.util.xml.AttrTag; + /** * 其他额外的数据 * @author weiwei l.weiwei@163.com * @date 2013-6-9 上午10:30:26 */ public class Options { - + + @AttrTag + private String name; + private List