diff --git a/.travis.yml b/.travis.yml
index a9f233f37f99ae2dcd5aa2cfefe18738158dd470..9e6f78d38cb18e09fab941605d511519d6fea323 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,3 +1,3 @@
language: java
jdk:
- - oraclejdk7
+ - openjdk7
diff --git a/README-zh.md b/README-zh.md
index e8f07355168882959769d0dd375807cbade48a9b..cd1b090c73dc42fa6f676cb5fdddb70df04487b2 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -1,4 +1,4 @@
-![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg)
+![logo](http://webmagic.io/images/logo.jpeg)
[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic)
@@ -38,12 +38,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
us.codecraft
webmagic-core
- 0.6.1
+ 0.7.3
us.codecraft
webmagic-extension
- 0.6.1
+ 0.7.3
```
@@ -161,7 +161,7 @@ public class OschinaBlog {
webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。
-webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://my.oschina.net/oscfox/blog/194507)
+webmagic的使用可以参考:[oschina openapi 应用:博客搬家](https://git.oschina.net/yashin/MoveBlog)
### 协议
@@ -178,7 +178,7 @@ QQ:
### QQ群:
-373225642
+373225642(已满) 542327088
### 相关项目:
diff --git a/README.md b/README.md
index 8785844332fe22edaae2ee77c52d3e90bf2d7045..73cb48833bf10506414b63a31d24efff00626c46 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg)
+![logo](http://webmagic.io/images/logo.jpeg)
[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
@@ -23,12 +23,12 @@ Add dependencies to your pom.xml:
us.codecraft
webmagic-core
- 0.6.1
+ 0.7.3
us.codecraft
webmagic-extension
- 0.6.1
+ 0.7.3
```
@@ -142,7 +142,7 @@ To write webmagic, I refered to the projects below :
[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988)
-QQ Group: 373225642
+QQ Group: 373225642 542327088
### Related Project
diff --git a/assets/data.plist b/assets/data.plist
deleted file mode 100644
index 5c8fa3adf557dfc18b96e88c61ebcd26243a3382..0000000000000000000000000000000000000000
--- a/assets/data.plist
+++ /dev/null
@@ -1,1525 +0,0 @@
-
-
-
-
- ApplicationVersion
-
- com.omnigroup.OmniGrafflePro
- 139.16.0.171715
-
- CreationDate
- 2014-03-12 08:47:15 +0000
- Creator
- 黄 亿华
- GraphDocumentVersion
- 8
- GuidesLocked
- NO
- GuidesVisible
- YES
- ImageCounter
- 2
- ImageLinkBack
-
-
-
- ImageList
-
- image1.pdf
-
- LinksVisible
- NO
- MagnetsVisible
- NO
- MasterSheets
-
- ModificationDate
- 2014-03-12 12:19:49 +0000
- Modifier
- 黄 亿华
- NotesVisible
- NO
- OriginVisible
- NO
- PageBreaks
- YES
- PrintInfo
-
- NSBottomMargin
-
- float
- 41
-
- NSHorizonalPagination
-
- coded
- BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
-
- NSLeftMargin
-
- float
- 18
-
- NSPaperSize
-
- size
- {595, 842}
-
- NSPrintReverseOrientation
-
- int
- 0
-
- NSRightMargin
-
- float
- 18
-
- NSTopMargin
-
- float
- 18
-
-
- ReadOnly
- NO
- Sheets
-
-
- ActiveLayerIndex
- 0
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {559, 783}}
- Class
- SolidGraphic
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- DisplayScale
- 1 0/72 in = 1.0000 in
- GraphicsList
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 47
- Points
-
- {280.41377887789179, 462.5}
- {280.41377887789179, 115.5}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 25
-
-
-
- Bounds
- {{146.91379269521701, 588}, {66, 22}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 46
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Manage}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{146.41379269521701, 139}, {37, 22}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 45
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 URL}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 16
-
- ID
- 44
- Points
-
- {372.15749563673154, 519.63519787392613}
- {299.36323356646488, 641.57068447906465}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- TailArrow
- FilledArrow
-
-
- Tail
-
- ID
- 19
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 16
-
- ID
- 43
- Points
-
- {278.95058213917224, 553.49998251301065}
- {278.21479589361269, 641.50001748698935}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- TailArrow
- FilledArrow
-
-
- Tail
-
- ID
- 18
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 16
-
- ID
- 42
- Points
-
- {183.67008975370248, 519.63519787392613}
- {256.46435182396914, 641.57068447906465}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- TailArrow
- FilledArrow
-
-
- Tail
-
- ID
- 17
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 38
- Points
-
- {444.39191518528105, 474.53820418060883}
- {295.34133058251143, 115.4617958193911}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 28
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 37
- Points
-
- {436.53677827701807, 306.58797902807424}
- {305.10330711341595, 115.41202097192573}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 26
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 34
- Points
-
- {112.13640445647307, 462.54223298101982}
- {264.62814058011099, 115.45782532539049}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 22
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 33
- Points
-
- {119.85283790129353, 306.5917371670962}
- {255.0255287391405, 115.40826283290379}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 20
-
-
-
- Bounds
- {{422.41379269521701, 411}, {60, 22}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 32
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 \'85\'85\'85.}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{247.91379269521701, 411}, {60, 22}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 31
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 \'85\'85\'85.}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{65.913792695217012, 411}, {60, 22}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 30
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 \'85\'85\'85.}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{392.41379269521701, 475}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 28
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{392.41379269521701, 307}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 26
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{216.41379269521701, 463}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 25
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{216.41379269521701, 307}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 23
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{35.413792695217012, 463}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 22
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{35.413792695217012, 307}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 20
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{202.41379269521701, 642}, {151, 71}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 16
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Admin}
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 15
- Points
-
- {428.35251803234684, 231.64153003420739}
- {315.28169719234131, 115.35846996579261}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 8
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- Head
-
- ID
- 3
-
- ID
- 13
- Points
-
- {128.26734609568192, 231.6464465995351}
- {244.56023929475211, 115.35355340046489}
-
- Style
-
- stroke
-
- HeadArrow
- FilledArrow
- Legacy
-
- TailArrow
- 0
-
-
- Tail
-
- ID
- 7
-
-
-
- Bounds
- {{392.41379269521701, 232}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 8
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{35.413792695217012, 232}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 7
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{216.41379269521701, 232}, {128, 57}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 5
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs36 \cf0 Spider}
-
-
-
- Bounds
- {{200.41379269521701, 44}, {160, 71}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 3
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Scheduler}
-
-
-
- Bounds
- {{15.413792695217012, 204}, {168, 349}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 17
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Worker}
-
-
-
- Bounds
- {{372.41379269521701, 204}, {168, 349}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 19
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Worker}
-
-
-
- Bounds
- {{200.41379269521701, 204}, {160, 349}}
- Class
- ShapedGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 18
-
- ID
- 18
- Shape
- Rectangle
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Worker}
-
-
-
- GridInfo
-
- HPages
- 1
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- Orientation
- 2
- PrintOnePage
-
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 1
- UniqueID
- 1
- VPages
- 1
-
-
- ActiveLayerIndex
- 0
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {559, 783}}
- Class
- SolidGraphic
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- DisplayScale
- 1 0/72 in = 1.0000 in
- GraphicsList
-
-
- Bounds
- {{278, 395}, {172, 104}}
- Class
- ShapedGraphic
- ID
- 52
- ImageID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Http API}
-
-
-
- Bounds
- {{113, 395}, {172, 104}}
- Class
- ShapedGraphic
- ID
- 51
- ImageID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 WebMagic}
-
-
-
- Bounds
- {{278, 499}, {172, 104}}
- Class
- ShapedGraphic
- ID
- 50
- ImageID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Redis}
-
-
-
- Bounds
- {{113, 499}, {172, 104}}
- Class
- ShapedGraphic
- ID
- 49
- ImageID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs48 \cf0 Mysql}
-
-
-
- GridInfo
-
- HPages
- 1
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- Orientation
- 2
- PrintOnePage
-
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 2
- UniqueID
- 2
- VPages
- 1
-
-
- SmartAlignmentGuidesActive
- YES
- SmartDistanceGuidesActive
- YES
- UseEntirePage
-
- WindowInfo
-
- CurrentSheet
- 0
- ExpandedCanvases
-
- Frame
- {{373, 90}, {693, 788}}
- ListView
-
- OutlineWidth
- 142
- RightSidebar
-
- ShowRuler
-
- Sidebar
-
- SidebarWidth
- 120
- VisibleRegion
- {{-106, -56}, {769.65514710343643, 895.17238435507204}}
- Zoom
- 0.72500002384185791
- ZoomValues
-
-
- 版面 1
- 0.72500002384185791
- 1.4500000476837158
-
-
- 版面 2
- 1
- 0.5
-
-
-
-
-
diff --git a/assets/image1.pdf b/assets/image1.pdf
deleted file mode 100644
index 79fff308c863194379c2b05fa26aecbaeca4a0f4..0000000000000000000000000000000000000000
Binary files a/assets/image1.pdf and /dev/null differ
diff --git a/assets/logo-simple.jpg b/assets/logo-simple.jpg
deleted file mode 100644
index 366aa6276185d8b1c946aae4c3e453fdc377e1b9..0000000000000000000000000000000000000000
Binary files a/assets/logo-simple.jpg and /dev/null differ
diff --git a/assets/logo.graffle b/assets/logo.graffle
deleted file mode 100644
index 84bbe20b50ccfb49748687b6245a825c9b9ce682..0000000000000000000000000000000000000000
--- a/assets/logo.graffle
+++ /dev/null
@@ -1,351 +0,0 @@
-
-
-
-
- ActiveLayerIndex
- 0
- ApplicationVersion
-
- com.omnigroup.OmniGrafflePro
- 139.16.0.171715
-
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {48, 48}}
- Class
- SolidGraphic
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- CanvasSize
- {48, 48}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- CreationDate
- 2013-11-10 06:17:01 +0000
- Creator
- 黄 亿华
- DisplayScale
- 1 pt = 1 pt
- GraphDocumentVersion
- 8
- GraphicsList
-
-
- Bounds
- {{7.5, 24}, {23, 15}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- ID
- 45
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Cochin;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs16 \cf0 Magi
-\fs24 c}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{18, 13}, {19.359630584716797, 18}}
- Class
- ShapedGraphic
- FitText
- Vertical
- Flow
- Resize
- FontInfo
-
- Color
-
- w
- 0
-
- Font
- STHeitiSC-Light
- Size
- 6
-
- ID
- 39
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fmodern\fcharset0 Courier-Oblique;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\i\fs14 \cf0 eb}
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 13
-
- ID
- 31
- Points
-
- {6, 11}
- {15, 27}
- {14, 8}
- {21, 26}
- {22, 6}
- {22, 6}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- LineType
- 1
- TailArrow
- 0
-
-
-
-
- GridInfo
-
- GridSpacing
- 1
- ShowsGrid
- YES
- SnapsToGrid
- YES
-
- GuidesLocked
- NO
- GuidesVisible
- YES
- HPages
- 1
- ImageCounter
- 2
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- LinksVisible
- NO
- MagnetsVisible
- NO
- MasterSheets
-
- ModificationDate
- 2013-11-10 06:51:47 +0000
- Modifier
- 黄 亿华
- NotesVisible
- NO
- Orientation
- 2
- OriginVisible
- NO
- PageBreaks
- YES
- PrintInfo
-
- NSBottomMargin
-
- float
- 41
-
- NSHorizonalPagination
-
- coded
- BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
-
- NSLeftMargin
-
- float
- 18
-
- NSPaperSize
-
- size
- {594.99997329711914, 842}
-
- NSPrintReverseOrientation
-
- int
- 0
-
- NSRightMargin
-
- float
- 18
-
- NSTopMargin
-
- float
- 18
-
-
- PrintOnePage
-
- ReadOnly
- NO
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 1
- SmartAlignmentGuidesActive
- NO
- SmartDistanceGuidesActive
- NO
- UniqueID
- 1
- UseEntirePage
-
- VPages
- 1
- WindowInfo
-
- CurrentSheet
- 0
- ExpandedCanvases
-
- Frame
- {{491, 381}, {498, 477}}
- ListView
-
- OutlineWidth
- 142
- RightSidebar
-
- Sidebar
-
- SidebarWidth
- 116
- VisibleRegion
- {{0.125, 0.125}, {47.75, 47.875}}
- Zoom
- 8
- ZoomValues
-
-
- 版面 1
- 8
- 1
-
-
-
-
-
diff --git a/assets/logo.jpg b/assets/logo.jpg
deleted file mode 100644
index 356e25df0185c7461037b9dc15dc9d4a8566f476..0000000000000000000000000000000000000000
Binary files a/assets/logo.jpg and /dev/null differ
diff --git a/assets/logo2.graffle/data.plist b/assets/logo2.graffle/data.plist
deleted file mode 100644
index 54d64a42f36e601bc342f1916f6224715d4c6bc8..0000000000000000000000000000000000000000
--- a/assets/logo2.graffle/data.plist
+++ /dev/null
@@ -1,552 +0,0 @@
-
-
-
-
- ActiveLayerIndex
- 0
- ApplicationVersion
-
- com.omnigroup.OmniGrafflePro
- 139.16.0.171715
-
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {1117.9999465942383, 783}}
- Class
- SolidGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 37
-
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- CreationDate
- 2013-11-10 06:51:58 +0000
- Creator
- 黄 亿华
- DisplayScale
- 1 0/72 in = 1 0/72 in
- GraphDocumentVersion
- 8
- GraphicsList
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 13
-
- Head
-
- ID
- 60
- Position
- 0.40939974784851074
-
- ID
- 62
- Points
-
- {324, 109}
- {339.36559006029825, 179.11528294284673}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- LineType
- 1
- TailArrow
- 0
- Width
- 10
-
-
- Tail
-
- ID
- 59
- Info
- 4
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 13
-
- Head
-
- ID
- 60
- Position
- 0.73653632402420044
-
- ID
- 61
- Points
-
- {269, 146}
- {296, 194}
- {309, 266}
- {349, 265}
- {348.96211936963607, 215.03741157007715}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- LineType
- 1
- TailArrow
- 0
- Width
- 10
-
-
- Tail
-
- ID
- 59
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 13
-
- ID
- 60
- Points
-
- {371.89694213867188, 179}
- {356.89694213867188, 162}
- {335.89694213867188, 188}
- {351.89694213867188, 217}
- {371.89694213867188, 202}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- LineType
- 1
- TailArrow
- 0
- Width
- 10
-
-
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 13
-
- ID
- 59
- Points
-
- {269, 146}
- {295, 189}
- {300, 110}
- {310, 178}
- {324, 109}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- LineType
- 1
- TailArrow
- 0
- Width
- 10
-
-
-
-
- Bounds
- {{335.89695436197019, 119}, {41, 43}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- ID
- 47
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs72 \cf0 eb}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{164, 154}, {236.89692325714185, 98.181818181818088}}
- Class
- ShapedGraphic
- ID
- 45
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Cochin;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\fs96 \cf0 Magi}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Class
- LineGraphic
- FontInfo
-
- Font
- Helvetica
- Size
- 13
-
- ID
- 31
- Points
-
- {50.404270172119141, 72.000000000000256}
- {115.40427017211914, 154.00000000000028}
- {103.80320000069037, 26.090909090909292}
- {124.95447158813477, 97}
- {175.90226360069005, 143.90909090909116}
- {186.20212982926148, 13}
- {186.20212982926148, 13}
-
- Style
-
- stroke
-
- HeadArrow
- 0
- Legacy
-
- LineType
- 1
- TailArrow
- 0
- Width
- 10
-
-
-
-
- Bounds
- {{406.79786682128906, 136.09091186523438}, {165, 160}}
- Class
- ShapedGraphic
- ID
- 46
- ImageID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
-
- GridInfo
-
- GuidesLocked
- NO
- GuidesVisible
- YES
- HPages
- 2
- ImageCounter
- 2
- ImageLinkBack
-
-
-
- ImageList
-
- image1.tiff
-
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- LinksVisible
- NO
- MagnetsVisible
- NO
- MasterSheets
-
- ModificationDate
- 2013-11-10 07:00:00 +0000
- Modifier
- 黄 亿华
- NotesVisible
- NO
- Orientation
- 2
- OriginVisible
- NO
- PageBreaks
- YES
- PrintInfo
-
- NSBottomMargin
-
- float
- 41
-
- NSHorizonalPagination
-
- coded
- BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
-
- NSLeftMargin
-
- float
- 18
-
- NSPaperSize
-
- size
- {594.99997329711914, 842}
-
- NSPrintReverseOrientation
-
- int
- 0
-
- NSRightMargin
-
- float
- 18
-
- NSTopMargin
-
- float
- 18
-
-
- PrintOnePage
-
- ReadOnly
- NO
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 1
- SmartAlignmentGuidesActive
- YES
- SmartDistanceGuidesActive
- YES
- UniqueID
- 1
- UseEntirePage
-
- VPages
- 1
- WindowInfo
-
- CurrentSheet
- 0
- ExpandedCanvases
-
- Frame
- {{350, -208}, {693, 795}}
- ListView
-
- OutlineWidth
- 142
- RightSidebar
-
- ShowRuler
-
- Sidebar
-
- SidebarWidth
- 120
- VisibleRegion
- {{23, 0}, {558, 656}}
- Zoom
- 1
- ZoomValues
-
-
- 版面 1
- 1
- 1
-
-
-
-
-
diff --git a/assets/logo2.graffle/image1.tiff b/assets/logo2.graffle/image1.tiff
deleted file mode 100644
index 42bff86e55fec780c1b1eeec32b8c0e9f284ec2a..0000000000000000000000000000000000000000
Binary files a/assets/logo2.graffle/image1.tiff and /dev/null differ
diff --git a/assets/logo3.graffle/data.plist b/assets/logo3.graffle/data.plist
deleted file mode 100644
index 07fdd02cf9e43c8b9ac507df455dff1fa67be34b..0000000000000000000000000000000000000000
--- a/assets/logo3.graffle/data.plist
+++ /dev/null
@@ -1,840 +0,0 @@
-
-
-
-
- ApplicationVersion
-
- com.omnigroup.OmniGrafflePro
- 139.16.0.171715
-
- CreationDate
- 2013-11-10 07:01:04 +0000
- Creator
- 黄 亿华
- GraphDocumentVersion
- 8
- GuidesLocked
- NO
- GuidesVisible
- YES
- ImageCounter
- 6
- ImageLinkBack
-
-
-
-
-
-
- ImageList
-
- image5.tiff
- image4.tiff
- image2.tiff
- image1.tiff
-
- LinksVisible
- NO
- MagnetsVisible
- NO
- MasterSheets
-
- ModificationDate
- 2013-11-10 08:09:16 +0000
- Modifier
- 黄 亿华
- NotesVisible
- NO
- OriginVisible
- NO
- PageBreaks
- YES
- PrintInfo
-
- NSBottomMargin
-
- float
- 41
-
- NSHorizonalPagination
-
- coded
- BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
-
- NSLeftMargin
-
- float
- 18
-
- NSPaperSize
-
- size
- {594.99997329711914, 842}
-
- NSPrintReverseOrientation
-
- int
- 0
-
- NSRightMargin
-
- float
- 18
-
- NSTopMargin
-
- float
- 18
-
-
- ReadOnly
- NO
- Sheets
-
-
- ActiveLayerIndex
- 0
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {558.99997329711914, 783}}
- Class
- SolidGraphic
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- DisplayScale
- 1 0/72 in = 1 0/72 in
- GraphicsList
-
-
- Bounds
- {{390, 391.5}, {114, 90}}
- Class
- ShapedGraphic
- ID
- 7
- ImageID
- 2
- Shape
- Rectangle
- Style
-
- fill
-
- FillType
- 2
- GradientAngle
- 90
- GradientColor
-
- w
- 0.666667
-
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
-
- Bounds
- {{3, 265}, {181, 114}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- LucidaSans-DemiItalic
- Size
- 96
-
- ID
- 6
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaSans-Demi;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\i\b\fs192 \cf1 M }
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{168, 314}, {77, 58}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- LucidaSans-DemiItalic
- Size
- 48
-
- ID
- 5
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaSans-Demi;}
-{\colortbl;\red255\green255\blue255;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\i\b\fs96 \cf2 agi}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{356, 201}, {86, 86}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- LucidaBright-DemiItalic
- Size
- 72
-
- ID
- 4
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 LucidaBright-Demi;}
-{\colortbl;\red255\green255\blue255;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\i\b\fs144 \cf2 eb}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- Bounds
- {{43, 114}, {395, 400}}
- Class
- ShapedGraphic
- FitText
- Clip
- Flow
- Clip
- HFlip
- YES
- ID
- 3
- ImageID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
-
- Bounds
- {{-4, 114}, {535, 400}}
- Class
- ShapedGraphic
- ID
- 1
- Shape
- Rectangle
- Style
-
- fill
-
- Color
-
- b
- 0
- g
- 0
- r
- 0
-
-
- shadow
-
- Draws
- NO
-
-
-
-
- GridInfo
-
- HPages
- 1
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- Orientation
- 2
- PrintOnePage
-
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 1
- UniqueID
- 1
- VPages
- 1
-
-
- ActiveLayerIndex
- 0
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {558.99997329711914, 783}}
- Class
- SolidGraphic
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- DisplayScale
- 1 0/72 in = 1.0000 in
- GraphicsList
-
-
- Bounds
- {{232, 432}, {84, 93}}
- Class
- ShapedGraphic
- ID
- 10
- ImageID
- 4
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
-
- Bounds
- {{16, 421}, {500, 115}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica-Bold
- Size
- 96
-
- ID
- 8
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;\red0\green0\blue0;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\i\b\fs192 \cf2 Web agic}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- GridInfo
-
- HPages
- 1
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- Orientation
- 2
- PrintOnePage
-
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 2
- UniqueID
- 2
- VPages
- 1
-
-
- ActiveLayerIndex
- 0
- AutoAdjust
-
- BackgroundGraphic
-
- Bounds
- {{0, 0}, {1117.9999465942383, 783}}
- Class
- SolidGraphic
- ID
- 2
- Style
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
- BaseZoom
- 0
- CanvasOrigin
- {0, 0}
- ColumnAlign
- 1
- ColumnSpacing
- 36
- DisplayScale
- 1 0/72 in = 1.0000 in
- GraphicsList
-
-
- Bounds
- {{9, 277.5}, {114, 114}}
- Class
- ShapedGraphic
- ID
- 11
- ImageID
- 5
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- shadow
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
-
-
- Bounds
- {{100, 294}, {474, 115}}
- Class
- ShapedGraphic
- FitText
- YES
- Flow
- Resize
- FontInfo
-
- Font
- Helvetica-Bold
- Size
- 96
-
- ID
- 8
- Shape
- Rectangle
- Style
-
- fill
-
- Draws
- NO
-
- stroke
-
- Draws
- NO
-
-
- Text
-
- Pad
- 0
- Text
- {\rtf1\ansi\ansicpg936\cocoartf1187\cocoasubrtf400
-\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
-{\colortbl;\red255\green255\blue255;}
-\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-
-\f0\i\b\fs192 \cf0 WebMagic}
- VerticalPad
- 0
-
- Wrap
- NO
-
-
- GridInfo
-
- HPages
- 2
- KeepToScale
-
- Layers
-
-
- Lock
- NO
- Name
- 图层 1
- Print
- YES
- View
- YES
-
-
- LayoutInfo
-
- Animate
- NO
- circoMinDist
- 18
- circoSeparation
- 0.0
- layoutEngine
- dot
- neatoSeparation
- 0.0
- twopiSeparation
- 0.0
-
- Orientation
- 2
- PrintOnePage
-
- RowAlign
- 1
- RowSpacing
- 36
- SheetTitle
- 版面 3
- UniqueID
- 3
- VPages
- 1
-
-
- SmartAlignmentGuidesActive
- YES
- SmartDistanceGuidesActive
- YES
- UseEntirePage
-
- WindowInfo
-
- CurrentSheet
- 2
- ExpandedCanvases
-
- Frame
- {{174, 77}, {771, 795}}
- ListView
-
- OutlineWidth
- 142
- RightSidebar
-
- ShowRuler
-
- Sidebar
-
- SidebarWidth
- 120
- VisibleRegion
- {{0, 0}, {636, 656}}
- Zoom
- 1
- ZoomValues
-
-
- 版面 1
- 1
- 1
-
-
- 版面 2
- 1
- 1
-
-
- 版面 3
- 1
- 1
-
-
-
-
-
diff --git a/assets/logo3.graffle/image1.tiff b/assets/logo3.graffle/image1.tiff
deleted file mode 100644
index 7d50474729e30e0fa30209b2b66cf5d5ee5ce7dc..0000000000000000000000000000000000000000
Binary files a/assets/logo3.graffle/image1.tiff and /dev/null differ
diff --git a/assets/logo3.graffle/image2.tiff b/assets/logo3.graffle/image2.tiff
deleted file mode 100644
index 606ae8dfcfa0e2eb843bad49f2d7c36832a0c3d9..0000000000000000000000000000000000000000
Binary files a/assets/logo3.graffle/image2.tiff and /dev/null differ
diff --git a/assets/logo3.graffle/image4.tiff b/assets/logo3.graffle/image4.tiff
deleted file mode 100644
index 0f674bf9628bf498431c5872703df59c7e17a6cf..0000000000000000000000000000000000000000
Binary files a/assets/logo3.graffle/image4.tiff and /dev/null differ
diff --git a/assets/logo3.graffle/image5.tiff b/assets/logo3.graffle/image5.tiff
deleted file mode 100644
index 2de8dfc47ed1ea521a6bba846569d592bd6a0a62..0000000000000000000000000000000000000000
Binary files a/assets/logo3.graffle/image5.tiff and /dev/null differ
diff --git a/assets/logo3.png b/assets/logo3.png
deleted file mode 100644
index bf4d7511b697a4748326c8841c4dff07b72e92c3..0000000000000000000000000000000000000000
Binary files a/assets/logo3.png and /dev/null differ
diff --git a/assets/logo4.png b/assets/logo4.png
deleted file mode 100644
index ba2337f7f93058d39a82b650ab94bf23fa0fc1e7..0000000000000000000000000000000000000000
Binary files a/assets/logo4.png and /dev/null differ
diff --git a/assets/page-extract-rule.bmml b/assets/page-extract-rule.bmml
deleted file mode 100644
index fec8d3ec84357157eec3ddd226636020b0a66852..0000000000000000000000000000000000000000
--- a/assets/page-extract-rule.bmml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
- A%20Web%20Page%0Ahttp%3A//
-
-
-
-
\ No newline at end of file
diff --git a/assets/webmagic-create-spider.bmml b/assets/webmagic-create-spider.bmml
deleted file mode 100644
index 761704291613e3013950bfd57734ae1db2cd17c1..0000000000000000000000000000000000000000
--- a/assets/webmagic-create-spider.bmml
+++ /dev/null
@@ -1,440 +0,0 @@
-
-
-
-
- Create%20Spider%0Ahttp%3A//localhost%3A8080/spider/create
-
-
-
-
-
-
- true
- Custom%20PageProcessor%20
-
-
-
-
-
-
- true
- BlogSpider
-
-
-
-
- true
- SpiderTemplate
-
-
-
-
- true
- New%20Template
-
-
-
-
- true
- Title
-
-
-
-
- true
- //title/text%28%29
-
-
-
-
- true
- Content
-
-
-
-
- true
- //div%5B@class%3D%27BlogContent%27%5D/text%28%29
-
-
-
-
- true
- Date
-
-
-
-
- true
- //div%5B@class%3D%27BlogStat%27%5D/regex%28%27%5Cd+-%5Cd+-%5Cd+%5Cs+%5Cd+%3A%5Cd+%27%29
-
-
-
-
- true
- Tags
-
-
-
-
- true
- //div%5B@class%3D%27tags%27%5D/a/text%28%29
-
-
-
-
-
-
-
-
-
-
- Create%20Spider
-
-
-
-
- Name
-
-
-
-
- blog.oschina.net
-
-
-
-
- StartUrls
-
-
-
-
- http%3A//my.oschina.net/flashsword/blog/180623
-
-
-
-
- Other%20Source
-
-
-
-
- up
- Advanced%20Setting
-
-
-
-
-
-
-
-
- URL%20manangement
-
-
-
-
- Scheduler
-
-
-
-
- Host
-
-
-
-
- Redis
-
-
-
-
- 127.0.0.1
-
-
-
-
- 6379
-
-
-
-
-
-
- true
- New%20Scheduler
-
-
-
-
-
-
- Persistent
-
-
-
-
- Pipeline
-
-
-
-
- Path
-
-
-
-
- Local%20File
-
-
-
-
- /data/webmaigc/%7BspdierName%7D
-
-
-
-
-
-
- true
- New%20Pipeline
-
-
-
-
- Create
-
-
-
-
- Cancel
-
-
-
-
-
-
-
-
- Advanced%20Setting
-
-
-
-
-
-
- Headers
-
-
-
-
- true
- User%20Agent
-
-
-
-
- true
- Cookie
-
-
-
-
- true
- Mozilla/5.0%20%28compatible%3B%20MSIE%2010.0...
-
-
-
-
- true
- id
-
-
-
-
- Add
-
-
-
-
- true
- name
-
-
-
-
- true
- value
-
-
-
-
- Add
-
-
-
-
- Add
-
-
-
-
- true
- 123456
-
-
-
-
-
-
-
-
- true
- Proxy
-
-
-
-
- true
- 127.0.0.1
-
-
-
-
- true
- 8080
-
-
-
-
- true
- username
-
-
-
-
- true
- password
-
-
-
-
-
-
-
-
- true
- Charset
-
-
-
-
- true
- utf-8
-
-
-
-
- AutoDetect
-
-
-
-
-
-
-
-
- true
- Frenquecny
-
-
-
-
- true
- 3000
-
-
-
-
- true
- Sleep
-
-
-
-
- true
- milliseconds%20after%20download%20one%20page
-
-
-
-
-
-
-
-
-
-
- Error%20Handle
-
-
-
-
-
-
- true
- Retry
-
-
-
-
- true
- 3
-
-
-
-
- true
- Retry
-
-
-
-
- true
- times%20when%20downloading%20a%20page
-
-
-
-
- true
- If%20it%20still%20fails%20in%20downloading%2C%20re-insert%20it%20to%20url%20queue.%5Cr%5CrAfter%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20times%2C%20the%20url%20will%20be%20discarded.
-
-
-
-
- true
- 3
-
-
-
-
-
-
-
-
-
-
- Test
-
-
-
-
- import
-
-
-
-
- export
-
-
-
-
\ No newline at end of file
diff --git a/assets/webmagic-create-spider.png b/assets/webmagic-create-spider.png
deleted file mode 100644
index 8fe92c47018e22114df68581510f92152612631c..0000000000000000000000000000000000000000
Binary files a/assets/webmagic-create-spider.png and /dev/null differ
diff --git a/assets/webmagic-spider-manage.bmml b/assets/webmagic-spider-manage.bmml
deleted file mode 100644
index 1423b01f8fb8f31a76d471abf670a706f1d2e7ad..0000000000000000000000000000000000000000
--- a/assets/webmagic-spider-manage.bmml
+++ /dev/null
@@ -1,110 +0,0 @@
-
-
-
-
- Spider%20List%20Page%0Ahttp%3A//localhost%3A8080/spider/list
-
-
-
-
- Spider%20%2C%20Add%20Time%20%5Ev%2CPages%20Total%20%5Ev%2C%20Pages%20Downloaded%20%5Ev%2C%20Error%20%5Ev%2C%20%20Operation%0Agithub.com%2C%202014-3-1.12%3A20%3A10%2C1221%2C%20595%2C%204%2C%20Stop%20Edit%20Delete%0Aoschina.net%2C2014-2-12.16%3A10%3A20%2C120%2C%20%20120%2C%200%2C%20Start%20Edit%20Delete%0Aappstore.com%2C2014-2-10.9%3A20%3A10%2C100000%2C100000%2C%200%2CStart%20Edit%20Delete
-
-
-
-
- Works
-
-
-
-
-
-
- selected
- 10.1.2.1
-
-
-
-
- 10.1.2.2
-
-
-
-
- selected
- 10.1.2.3
-
-
-
-
- 10.1.2.4
-
-
-
-
- all
-
-
-
-
-
-
-
-
-
- Real%20Time
-
-
-
-
- Pages
-
-
-
-
- Time
-
-
-
-
-
-
- Keyword
-
-
-
-
- Search
-
-
-
-
- Spiders
-
-
-
-
- Charts
-
-
-
-
- 2014-2-1
-
-
-
-
- 2014-3-1
-
-
-
-
- Time%20from
-
-
-
-
- to
-
-
-
-
\ No newline at end of file
diff --git a/assets/webmagic-spider-manage.png b/assets/webmagic-spider-manage.png
deleted file mode 100644
index 8fbdb6a9e8bd9b49f6498bb5e52a1dc1b643fa9a..0000000000000000000000000000000000000000
Binary files a/assets/webmagic-spider-manage.png and /dev/null differ
diff --git a/assets/webmagic.psd b/assets/webmagic.psd
deleted file mode 100644
index 5f8fd3b72ef04b07fe7eefacd51a0cd55a0da7f5..0000000000000000000000000000000000000000
Binary files a/assets/webmagic.psd and /dev/null differ
diff --git a/pom.xml b/pom.xml
index 4279ec71c08e4a92f56c9f1f5eb10358a156e45a..2b2384fd827a7b2de547ae81e95df0ba901db2e4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
7
us.codecraft
- 0.7.0-SNAPSHOT
+ 0.7.3
4.0.0
pom
@@ -75,6 +75,11 @@
httpclient
4.5.2
+
+ org.apache.httpcomponents
+ httpcore
+ 4.4.4
+
com.google.guava
guava
@@ -83,7 +88,7 @@
com.jayway.jsonpath
json-path
- 0.8.1
+ 2.4.0
org.slf4j
@@ -108,7 +113,7 @@
com.github.dreamhead
moco-core
- 0.9.1
+ 0.11.0
test
@@ -146,7 +151,7 @@
org.jsoup
jsoup
- 1.8.3
+ 1.10.3
org.mockito
@@ -164,8 +169,7 @@
maven-surefire-plugin
2.18
- pertest
- -Xms1024m -Xmx1024m -Xss1m
+ 0
@@ -231,11 +235,20 @@
org.apache.maven.plugins
maven-javadoc-plugin
- 2.9.1
+ 2.10.4
UTF-8
+ WebMagic 0.7.3
+ en_US
+
+ aggregate
+
+ aggregate
+
+ site
+
attach-javadocs
@@ -289,7 +302,7 @@
org.apache.maven.plugins
maven-gpg-plugin
- 1.5
+ 1.6
verify
diff --git a/release-note.md b/release-note.md
deleted file mode 100755
index f44704efd075006a4fc3935fb6607b158f3815b4..0000000000000000000000000000000000000000
--- a/release-note.md
+++ /dev/null
@@ -1,91 +0,0 @@
-Release Notes
-----
-See latest versions in [https://github.com/code4craft/webmagic/releases](https://github.com/code4craft/webmagic/releases)
-
-*2012-9-4* `version:0.3.0`
-
-* Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup).
-
- [Xsoup](https://github.com/code4craft/xsoup) is an XPath selector based on Jsoup written by me. It has much better performance than HtmlCleaner.
-
- Time of processing a page is reduced from 7~9ms to 0.4ms.
-
- If Xsoup is not stable for your usage, just use `Spider.xsoupOff()` to turn off it and report an issue to me!
-
-* Add cycle retry times for Site.
-
- When cycle retry times is set, Spider will put the url which downloading failed back to scheduler, and retry after a cycle of queue.
-
-*2012-8-20* `version:0.2.1`
-
-ComboExtractor support for annotation.
-
-Request priority support (using `PriorityScheduler`).
-
-Complete some I18n work (comments and documents).
-
-More convenient extractor API:
-
-* Add attribute name select for CSSSelector.
-* Group of regex selector can be specified.
-* Add OrSelector.
-* Add Selectors, import static Selectors.* for fluent API such as:
-
- or(regex("(.*)"), xpath("//title"), $("title")).select(s);
-* Add JsonPathSelector for Json parse.
-
-*2012-8-9* `version:0.2.0`
-
-此次更新的主题是"方便"(之前的主题是"灵活")。
-
-增加了webmagic-extension模块。
-
-增加了注解方式支持,可以通过POJO+注解的方式编写一个爬虫,更符合Java开发习惯。以下是抓取一个博客的完整代码:
-
- @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
- public class OschinaBlog {
-
- @ExtractBy("//title")
- private String title;
-
- @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
- private String content;
-
- @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
- private List tags;
-
- public static void main(String[] args) {
- OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"),
- new ConsolePageModelPipeline(), OschinaBlog.class)
- .scheduler(new RedisScheduler("127.0.0.1")).thread(5).run();
- }
-
- }
-
-增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
-
-增加基于redis的分布式支持。
-
-增加XPath2.0语法支持(webmagic-saxon模块)。
-
-增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
-
-修复了不支持https的bug。
-
-补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
-
-*2012-7-25* `version:0.1.0`
-
-第一个稳定版本。
-
-修改了若干API,使得可扩展性更强,为每个任务分配一个ID,可以通过ID区分不同任务。
-
-重写了Pipeline接口,将抽取结果集包装到ResultItems对象,而不是通用一个Page对象,便于逻辑分离。
-
-增加下载的重试机制,支持gzip,支持自定义UA/cookie。
-
-增加多线程抓取功能,只需在初始化的时候指定线程数即可。
-
-增加jquery形式的CSS Selector API,可以通过`page.getHtml().$("div.body")`形式抽取元素。
-
-完善了文档,架构说明:[webmagic的设计机制及原理-如何开发一个Java爬虫](http://my.oschina.net/flashsword/blog/145796),Javadoc:[http://code4craft.github.io/webmagic/docs](http://code4craft.github.io/webmagic/docs)。
\ No newline at end of file
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 7ca5c7b077b9fe98a26bd80a535fd35d88babd41..e889cd491b6daa97c94e08b7238e540c7a69cd02 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.7.0-SNAPSHOT
+ 0.7.3
4.0.0
diff --git a/webmagic-core/pom.xml.versionsBackup b/webmagic-core/pom.xml.versionsBackup
deleted file mode 100644
index b530bab4198920a32bd87eb95282b3c76ddba7e9..0000000000000000000000000000000000000000
--- a/webmagic-core/pom.xml.versionsBackup
+++ /dev/null
@@ -1,86 +0,0 @@
-
-
-
- us.codecraft
- webmagic-parent
- 0.5.2
-
- 4.0.0
-
- webmagic-core
-
-
-
- org.apache.httpcomponents
- httpclient
-
-
-
- junit
- junit
-
-
-
- com.google.guava
- guava
-
-
-
- org.apache.commons
- commons-lang3
-
-
-
- us.codecraft
- xsoup
-
-
-
- com.github.dreamhead
- moco-core
-
-
-
- org.slf4j
- slf4j-api
-
-
-
- org.slf4j
- slf4j-log4j12
-
-
-
- commons-collections
- commons-collections
-
-
-
- org.assertj
- assertj-core
-
-
-
- org.jsoup
- jsoup
-
-
-
- org.apache.commons
- commons-io
-
-
-
- com.jayway.jsonpath
- json-path
- 0.8.1
-
-
-
- com.alibaba
- fastjson
-
-
-
-
-
\ No newline at end of file
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index f9495a4a358fc751600baad0b4359a1d585e4bda..c11df693c75e14ce659595dcdad9e2bd65d9b160 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -4,9 +4,11 @@ import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
+import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.Map;
@@ -41,15 +43,25 @@ public class Page {
private Map> headers;
- private int statusCode;
+ private int statusCode = HttpConstant.StatusCode.CODE_200;
- private boolean needCycleRetry;
+ private boolean downloadSuccess = true;
+
+ private byte[] bytes;
private List targetRequests = new ArrayList();
+ private String charset;
+
public Page() {
}
+ public static Page fail(){
+ Page page = new Page();
+ page.setDownloadSuccess(false);
+ return page;
+ }
+
public Page setSkip(boolean skip) {
resultItems.setSkip(skip);
return this;
@@ -73,7 +85,7 @@ public class Page {
*/
public Html getHtml() {
if (html == null) {
- html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
+ html = new Html(rawText, request.getUrl());
}
return html;
}
@@ -179,14 +191,6 @@ public class Page {
return request;
}
- public boolean isNeedCycleRetry() {
- return needCycleRetry;
- }
-
- public void setNeedCycleRetry(boolean needCycleRetry) {
- this.needCycleRetry = needCycleRetry;
- }
-
public void setRequest(Request request) {
this.request = request;
this.resultItems.setRequest(request);
@@ -221,17 +225,45 @@ public class Page {
this.headers = headers;
}
+ public boolean isDownloadSuccess() {
+ return downloadSuccess;
+ }
+
+ public void setDownloadSuccess(boolean downloadSuccess) {
+ this.downloadSuccess = downloadSuccess;
+ }
+
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ public void setBytes(byte[] bytes) {
+ this.bytes = bytes;
+ }
+
+ public String getCharset() {
+ return charset;
+ }
+
+ public void setCharset(String charset) {
+ this.charset = charset;
+ }
+
@Override
public String toString() {
return "Page{" +
"request=" + request +
", resultItems=" + resultItems +
+ ", html=" + html +
+ ", json=" + json +
", rawText='" + rawText + '\'' +
", url=" + url +
", headers=" + headers +
", statusCode=" + statusCode +
- ", needCycleRetry=" + needCycleRetry +
+ ", downloadSuccess=" + downloadSuccess +
", targetRequests=" + targetRequests +
+ ", charset='" + charset + '\'' +
+ ", bytes=" + Arrays.toString(bytes) +
'}';
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index 21cd72e6a8736416989e1d9ed3934e5b04fabe03..eefd91bb521fb15507856132a6897554da0f302a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -1,5 +1,6 @@
package us.codecraft.webmagic;
+import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable;
@@ -23,14 +24,19 @@ public class Request implements Serializable {
private String method;
+ private HttpRequestBody requestBody;
+
/**
* Store additional information in extras.
*/
private Map extras;
+
/**
- * POST/GET param set
- * */
- private Map params=new HashMap();
+ * Cookies for the current url; if not set, the Site's cookies are used.
+ */
+ private Map cookies = new HashMap();
+
+ private Map headers = new HashMap();
/**
* Priority of the request.
@@ -39,6 +45,14 @@ public class Request implements Serializable {
*/
private long priority;
+ /**
+ * When it is set to TRUE, the downloader will not try to parse response body to text.
+ *
+ */
+ private boolean binaryContent = false;
+
+ private String charset;
+
public Request() {
}
@@ -87,12 +101,14 @@ public class Request implements Serializable {
return extras;
}
- public void setExtras(Map extras) {
+ public Request setExtras(Map extras) {
this.extras = extras;
+ return this;
}
- public void setUrl(String url) {
+ public Request setUrl(String url) {
this.url = url;
+ return this;
}
/**
@@ -105,31 +121,16 @@ public class Request implements Serializable {
return method;
}
- public void setMethod(String method) {
+ public Request setMethod(String method) {
this.method = method;
+ return this;
}
- public Map getParams() {
- return params;
- }
- /**
- * set params for request
- *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
- * @param params params
- * */
- public void setParams(Map params) {
- this.params = params;
- }
- /**
- * set params for request
- *
- * DO NOT set this for request already has params, like 'https://github.com/search?q=webmagic'
- * @param key key
- * @param value value
- * */
- public void putParams(String key,String value) {
- params.put(key,value);
+ @Override
+ public int hashCode() {
+ int result = url != null ? url.hashCode() : 0;
+ result = 31 * result + (method != null ? method.hashCode() : 0);
+ return result;
}
@Override
@@ -140,16 +141,51 @@ public class Request implements Serializable {
Request request = (Request) o;
if (url != null ? !url.equals(request.url) : request.url != null) return false;
- if (method != null ? !method.equals(request.method) : request.method != null) return false;
- return params != null ? params.equals(request.params) : request.params == null;
+ return method != null ? method.equals(request.method) : request.method == null;
}
- @Override
- public int hashCode() {
- int result = url != null ? url.hashCode() : 0;
- result = 31 * result + (method != null ? method.hashCode() : 0);
- result = 31 * result + (params != null ? params.hashCode() : 0);
- return result;
+ public Request addCookie(String name, String value) {
+ cookies.put(name, value);
+ return this;
+ }
+
+ public Request addHeader(String name, String value) {
+ headers.put(name, value);
+ return this;
+ }
+
+ public Map getCookies() {
+ return cookies;
+ }
+
+ public Map getHeaders() {
+ return headers;
+ }
+
+ public HttpRequestBody getRequestBody() {
+ return requestBody;
+ }
+
+ public void setRequestBody(HttpRequestBody requestBody) {
+ this.requestBody = requestBody;
+ }
+
+ public boolean isBinaryContent() {
+ return binaryContent;
+ }
+
+ public Request setBinaryContent(boolean binaryContent) {
+ this.binaryContent = binaryContent;
+ return this;
+ }
+
+ public String getCharset() {
+ return charset;
+ }
+
+ public Request setCharset(String charset) {
+ this.charset = charset;
+ return this;
}
@Override
@@ -158,8 +194,10 @@ public class Request implements Serializable {
"url='" + url + '\'' +
", method='" + method + '\'' +
", extras=" + extras +
- ", params=" + params +
", priority=" + priority +
+ ", headers=" + headers +
+ ", cookies="+ cookies+
'}';
}
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 520902db600219cf7d8f39f47f2c7258d03a5a6d..b6963ca43c7e4774da6577d9c46c230703eb33d2 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -1,5 +1,7 @@
package us.codecraft.webmagic;
+import us.codecraft.webmagic.utils.HttpConstant;
+
import java.util.*;
/**
@@ -39,8 +41,10 @@ public class Site {
private boolean useGzip = true;
+ private boolean disableCookieManagement = false;
+
static {
- DEFAULT_STATUS_CODE_SET.add(200);
+ DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
}
/**
@@ -236,7 +240,7 @@ public class Site {
* Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
*
- * @param key key of http header, there are some keys constant in {@link HeaderConst}
+ * @param key key of the http header; some key constants are defined in {@link HttpConstant.Header}
* @param value value of header
* @return this
*/
@@ -307,6 +311,22 @@ public class Site {
return this;
}
+ public boolean isDisableCookieManagement() {
+ return disableCookieManagement;
+ }
+
+ /**
+ * By default the downloader is supposed to store response cookies.
+ * Set this flag to true to ignore all cookie fields and stay clean.
+ * Warning: setting cookies will still NOT work if disableCookieManagement is true.
+ * @param disableCookieManagement disableCookieManagement
+ * @return this
+ */
+ public Site setDisableCookieManagement(boolean disableCookieManagement) {
+ this.disableCookieManagement = disableCookieManagement;
+ return this;
+ }
+
public Task toTask() {
return new Task() {
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 5e785af0fef3b9e0e9da24686a0dc2fc1534e3f9..62c989f1d3479eea3ac636ba05acd0576fc21dad 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -1,6 +1,7 @@
package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
@@ -302,7 +303,7 @@ public class Spider implements Runnable, Task {
public void run() {
checkRunningStat();
initComponent();
- logger.info("Spider " + getUUID() + " started!");
+ logger.info("Spider {} started!",getUUID());
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
final Request request = scheduler.poll(this);
if (request == null) {
@@ -334,6 +335,7 @@ public class Spider implements Runnable, Task {
if (destroyWhenExit) {
close();
}
+ logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
}
protected void onError(Request request) {
@@ -398,34 +400,59 @@ public class Spider implements Runnable, Task {
}
}
- protected void processRequest(Request request) {
+ private void processRequest(Request request) {
Page page = downloader.download(request, this);
- if (page == null) {
- sleep(site.getSleepTime());
- onError(request);
- return;
- }
- // for cycle retry
- if (page.isNeedCycleRetry()) {
- extractAndAddRequests(page, true);
- sleep(site.getRetrySleepTime());
- return;
- }
- pageProcessor.process(page);
- extractAndAddRequests(page, spawnUrl);
- if (!page.getResultItems().isSkip()) {
- for (Pipeline pipeline : pipelines) {
- pipeline.process(page.getResultItems(), this);
+ if (page.isDownloadSuccess()){
+ onDownloadSuccess(request, page);
+ } else {
+ onDownloaderFail(request);
+ }
+ }
+
+ private void onDownloadSuccess(Request request, Page page) {
+ if (site.getAcceptStatCode().contains(page.getStatusCode())){
+ pageProcessor.process(page);
+ extractAndAddRequests(page, spawnUrl);
+ if (!page.getResultItems().isSkip()) {
+ for (Pipeline pipeline : pipelines) {
+ pipeline.process(page.getResultItems(), this);
+ }
}
+ } else {
+ logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
+ return;
+ }
+
+ private void onDownloaderFail(Request request) {
+ if (site.getCycleRetryTimes() == 0) {
+ sleep(site.getSleepTime());
+ } else {
+ // for cycle retry
+ doCycleRetry(request);
+ }
+ }
+
+ private void doCycleRetry(Request request) {
+ Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
+ if (cycleTriedTimesObject == null) {
+ addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
+ } else {
+ int cycleTriedTimes = (Integer) cycleTriedTimesObject;
+ cycleTriedTimes++;
+ if (cycleTriedTimes < site.getCycleRetryTimes()) {
+ addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
+ }
+ }
+ sleep(site.getRetrySleepTime());
}
protected void sleep(int time) {
try {
Thread.sleep(time);
} catch (InterruptedException e) {
- e.printStackTrace();
+ logger.error("Thread interrupted when sleep",e);
}
}
@@ -474,6 +501,7 @@ public class Spider implements Runnable, Task {
* Download urls synchronizing.
*
* @param urls urls
+ * @param <T> type of process result
* @return list downloaded
*/
public List getAll(Collection urls) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
index c835dc8b0040e04bf5dd37a4d1eedcc3f54b12b3..c27292d09d8571b3a9ba5d3503c422987a55942c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
@@ -41,20 +41,4 @@ public abstract class AbstractDownloader implements Downloader {
protected void onError(Request request) {
}
- protected Page addToCycleRetry(Request request, Site site) {
- Page page = new Page();
- Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
- if (cycleTriedTimesObject == null) {
- page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
- } else {
- int cycleTriedTimes = (Integer) cycleTriedTimesObject;
- cycleTriedTimes++;
- if (cycleTriedTimes >= site.getCycleRetryTimes()) {
- return null;
- }
- page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
- }
- page.setNeedCycleRetry(true);
- return page;
- }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index e6523ec8096005d31cca9571d102ef7b1ec8f665..24889c88b22b51b236b31f10667c74bff913aaff 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -2,16 +2,8 @@ package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
-import org.apache.http.annotation.ThreadSafe;
-import org.apache.http.auth.AuthState;
-import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpUriRequest;
-import org.apache.http.client.protocol.HttpClientContext;
-import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.protocol.BasicHttpContext;
-import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -37,7 +29,6 @@ import java.util.Map;
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-@ThreadSafe
public class HttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());
@@ -83,43 +74,29 @@ public class HttpClientDownloader extends AbstractDownloader {
if (task == null || task.getSite() == null) {
throw new NullPointerException("task or site can not be null");
}
- logger.debug("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
- int statusCode = 0;
- Site site = task.getSite();
- Proxy proxy = null;
- HttpContext httpContext = new BasicHttpContext();
- if (proxyProvider != null) {
- proxy = proxyProvider.getProxy(task);
- AuthState authState = new AuthState();
- authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
- httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
- }
- CloseableHttpClient httpClient = getHttpClient(site);
- HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
+ CloseableHttpClient httpClient = getHttpClient(task.getSite());
+ Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
+ HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
+ Page page = Page.fail();
try {
- httpResponse = httpClient.execute(httpUriRequest, httpContext);
- statusCode = httpResponse.getStatusLine().getStatusCode();
- if (site.getAcceptStatCode().contains(statusCode)) {
- Page page = handleResponse(request, site.getCharset(), httpResponse, task);
- onSuccess(request);
- return page;
- } else {
- logger.warn("get page {} error, status code {} ",request.getUrl(),statusCode);
- return null;
- }
+ httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
+ page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
+ onSuccess(request);
+ logger.info("downloading page success {}", request.getUrl());
+ return page;
} catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e);
- if (site != null && site.getCycleRetryTimes() > 0) {
- return addToCycleRetry(request, site);
- }
onError(request);
- return null;
+ return page;
} finally {
if (httpResponse != null) {
//ensure the connection is released back to pool
EntityUtils.consumeQuietly(httpResponse.getEntity());
}
+ if (proxyProvider != null && proxy != null) {
+ proxyProvider.returnProxy(proxy, page, task);
+ }
}
}
@@ -129,34 +106,33 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
- String content = getContent(charset, httpResponse);
+ byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); // whole body is buffered in memory; NOTE(review): getEntity() is dereferenced without a null check
+ String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); // a missing Content-Type header is treated as ""
Page page = new Page();
- page.setRawText(content);
+ page.setBytes(bytes);
+ if (!request.isBinaryContent()){ // text is decoded only for non-binary requests; binary pages keep just the raw bytes
+ if (charset == null) { // caller supplied no charset: detect one from the Content-Type value and the body bytes
+ charset = getHtmlCharset(contentType, bytes);
+ }
+ page.setCharset(charset);
+ page.setRawText(new String(bytes, charset));
+ }
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
+ page.setDownloadSuccess(true); // set without inspecting the status code: this method never checks it
if (responseHeader) {
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
}
return page;
}
- private String getContent(String charset, HttpResponse httpResponse) throws IOException {
+ private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { // returns the detected charset name, or the JVM default name when detection yields null
+ String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
- byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
- String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
- if (htmlCharset != null) {
- return new String(contentBytes, htmlCharset);
- } else {
- logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
- return new String(contentBytes);
- }
- } else {
- return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
+ charset = Charset.defaultCharset().name(); // detection failed: fall back to the platform default so the caller can still decode
+ logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
}
- }
-
- private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
- return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
+ return charset;
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 9e17f607594163594d18be0705d68530dcb3afb4..28a16f41d5eaf8101a9ec463b8d86938e305da12 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -9,6 +9,7 @@ import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
+import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
@@ -49,7 +50,9 @@ public class HttpClientGenerator {
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
- return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书
+ return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
+ null,
+ new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {
@@ -127,6 +130,10 @@ public class HttpClientGenerator {
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
+ if (site.isDisableCookieManagement()) {
+ httpClientBuilder.disableCookieManagement();
+ return;
+ }
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry cookieEntry : site.getCookies().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
new file mode 100644
index 0000000000000000000000000000000000000000..74e6d25efadb34b1b4a63392eaf8949b83d896ef
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.downloader;
+
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.client.protocol.HttpClientContext;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 17/4/8
+ * Time: 19:43
+ * @since 0.7.0
+ */
+public class HttpClientRequestContext { // mutable holder pairing the request to execute with its per-request client context
+
+ private HttpUriRequest httpUriRequest; // the request HttpClientDownloader passes to HttpClient.execute
+
+ private HttpClientContext httpClientContext; // per-request context; the converter may attach proxy auth state and a cookie store to it
+
+ public HttpUriRequest getHttpUriRequest() {
+ return httpUriRequest;
+ }
+
+ public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
+ this.httpUriRequest = httpUriRequest;
+ }
+
+ public HttpClientContext getHttpClientContext() {
+ return httpClientContext;
+ }
+
+ public void setHttpClientContext(HttpClientContext httpClientContext) {
+ this.httpClientContext = httpClientContext;
+ }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
index db131d07e83c9d9a46e60f0f653dcb915b264f5a..28a7ce5ea22c9b8827a8c77a3dc0438963c1a612 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
@@ -1,33 +1,64 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpHost;
-import org.apache.http.NameValuePair;
+import org.apache.http.auth.AuthState;
+import org.apache.http.auth.ChallengeState;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
-import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
-import org.apache.http.message.BasicNameValuePair;
+import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.entity.ByteArrayEntity;
+import org.apache.http.impl.auth.BasicScheme;
+import org.apache.http.impl.client.BasicCookieStore;
+import org.apache.http.impl.cookie.BasicClientCookie;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant;
+import us.codecraft.webmagic.utils.UrlUtils;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
import java.util.Map;
/**
* @author code4crafter@gmail.com
* Date: 17/3/18
- * Time: 上午11:28
+ * Time: 11:28
+ *
+ * @since 0.7.0
*/
public class HttpUriRequestConverter {
- public HttpUriRequest convert(Request request, Site site, Proxy proxy) {
- RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
+ public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) { // builds both the HTTP request and its per-request context; proxy may be null (the downloader passes null when it has no provider)
+ HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
+ httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
+ httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
+ return httpClientRequestContext;
+ }
+
+ private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) { // NOTE(review): site is not used in this method
+ HttpClientContext httpContext = new HttpClientContext();
+ if (proxy != null && proxy.getUsername() != null) { // proxy credentials are installed only when the proxy carries a username
+ AuthState authState = new AuthState();
+ authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
+ httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
+ }
+ if (request.getCookies() != null && !request.getCookies().isEmpty()) { // request-level cookies get their own store on this context
+ CookieStore cookieStore = new BasicCookieStore();
+ for (Map.Entry cookieEntry : request.getCookies().entrySet()) {
+ BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
+ cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()))); // cookie is scoped to the request URL's domain with the port removed
+ cookieStore.addCookie(cookie1);
+ }
+ httpContext.setCookieStore(cookieStore);
+ }
+ return httpContext;
+ }
+
+ private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
+ RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
if (site.getHeaders() != null) {
for (Map.Entry headerEntry : site.getHeaders().entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
@@ -46,47 +77,39 @@ public class HttpUriRequestConverter {
requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
}
requestBuilder.setConfig(requestConfigBuilder.build());
- return requestBuilder.build();
+ HttpUriRequest httpUriRequest = requestBuilder.build();
+ if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
+ for (Map.Entry header : request.getHeaders().entrySet()) {
+ httpUriRequest.addHeader(header.getKey(), header.getValue());
+ }
+ }
+ return httpUriRequest;
}
private RequestBuilder selectRequestMethod(Request request) {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
//default get
- return addQueryParams(RequestBuilder.get(),request.getParams());
+ return RequestBuilder.get(); // no parameters are added for GET any more
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
- return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
+ return addFormParams(RequestBuilder.post(),request); // POST may carry the request body; addFormParams attaches it when present
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
- return addQueryParams(RequestBuilder.head(),request.getParams());
+ return RequestBuilder.head();
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
- return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
+ return addFormParams(RequestBuilder.put(), request); // PUT is handled the same way as POST
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
- return addQueryParams(RequestBuilder.delete(),request.getParams());
+ return RequestBuilder.delete();
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
- return addQueryParams(RequestBuilder.trace(),request.getParams());
+ return RequestBuilder.trace();
}
throw new IllegalArgumentException("Illegal HTTP Method " + method);
}
- private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map params) {
- List allNameValuePair=new ArrayList();
- if (nameValuePair != null && nameValuePair.length > 0) {
- allNameValuePair= Arrays.asList(nameValuePair);
- }
- if (params != null) {
- for (String key : params.keySet()) {
- allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
- }
- }
- requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
- return requestBuilder;
- }
-
- private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map params) {
- if (params != null) {
- for (Map.Entry entry : params.entrySet()) {
- requestBuilder.addParameter(entry.getKey(), entry.getValue());
- }
+ private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) { // attaches the request's already-encoded body, if any, as the entity
+ if (request.getRequestBody() != null) {
+ ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
+ entity.setContentType(request.getRequestBody().getContentType()); // NOTE(review): only the content type is set; the body's encoding name is not added to the header here
+ requestBuilder.setEntity(entity);
}
return requestBuilder;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
new file mode 100644
index 0000000000000000000000000000000000000000..7d3b307852ce142d7cac3d905f90dc7912f317e5
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
@@ -0,0 +1,102 @@
+package us.codecraft.webmagic.model;
+
+import org.apache.http.NameValuePair;
+import org.apache.http.client.utils.URLEncodedUtils;
+import org.apache.http.message.BasicNameValuePair;
+
+import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 17/4/8
+ */
+public class HttpRequestBody implements Serializable { // serializable HTTP request body: raw bytes plus content type and encoding name
+
+ private static final long serialVersionUID = 5659170945717023595L;
+
+ public static abstract class ContentType { // Content-Type constants; MULTIPART is not used by any factory method in this class
+
+ public static final String JSON = "application/json";
+
+ public static final String XML = "text/xml";
+
+ public static final String FORM = "application/x-www-form-urlencoded";
+
+ public static final String MULTIPART = "multipart/form-data";
+ }
+
+ private byte[] body;
+
+ private String contentType;
+
+ private String encoding;
+
+ public HttpRequestBody() {
+ }
+
+ public HttpRequestBody(byte[] body, String contentType, String encoding) {
+ this.body = body;
+ this.contentType = contentType;
+ this.encoding = encoding;
+ }
+
+ public String getContentType() {
+ return contentType;
+ }
+
+ public String getEncoding() {
+ return encoding;
+ }
+
+ public void setBody(byte[] body) {
+ this.body = body;
+ }
+
+ public void setContentType(String contentType) {
+ this.contentType = contentType;
+ }
+
+ public void setEncoding(String encoding) {
+ this.encoding = encoding;
+ }
+
+ public static HttpRequestBody json(String json, String encoding) { // encodes the text with the named charset; an unsupported name surfaces as IllegalArgumentException
+ try {
+ return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalArgumentException("illegal encoding " + encoding, e);
+ }
+ }
+
+ public static HttpRequestBody xml(String xml, String encoding) { // same as json(), but tagged with the XML content type
+ try {
+ return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalArgumentException("illegal encoding " + encoding, e);
+ }
+ }
+
+ public static HttpRequestBody custom(byte[] body, String contentType, String encoding) { // pass-through factory: the bytes are stored as given
+ return new HttpRequestBody(body, contentType, encoding);
+ }
+
+ public static HttpRequestBody form(Map params, String encoding){ // builds a URL-encoded form body from the map entries
+ List nameValuePairs = new ArrayList(params.size());
+ for (Map.Entry entry : params.entrySet()) {
+ nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue()))); // values are stringified with String.valueOf
+ }
+ try {
+ return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); // URL-encode the pairs, then convert the resulting string to bytes with the same charset name
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalArgumentException("illegal encoding " + encoding, e);
+ }
+ }
+
+ public byte[] getBody() { // returns the internal array itself, not a copy
+ return body;
+ }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
index 57d6eea3f86097c2824000bda3a139a025a532a4..be9fd7cc2e35948aa49a397b2c1e4f30202d2b34 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
@@ -1,10 +1,8 @@
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
@@ -21,7 +19,6 @@ import java.util.Map;
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-@ThreadSafe
public class FilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = LoggerFactory.getLogger(getClass());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
index a2a17e8c210683c48307041bdffe4a674f786738..4c94eef198115dd2dff876e8a81799af537aac19 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
@@ -16,9 +16,9 @@ public class ZhihuPageProcessor implements PageProcessor {
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());
- page.putField("title", page.getHtml().xpath("//h2[@class='zm-item-title']/a/text()").toString());
- page.putField("question", page.getHtml().xpath("//div[@id='zh-question-detail']//tidyText()").toString());
- page.putField("answer", page.getHtml().xpath("//div[@id='zh-question-answer-wrap']//div[@class='zm-editable-content']/tidyText()").toString());
+ page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString());
+ page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString());
+ page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString());
if (page.getResultItems().get("title")==null){
//skip this page
page.setSkip(true);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index a38ccaa7e4fac484e6de2ecde724f7bb17f5adff..c5f100732c03346e7f39490e4aa4b33b2926be42 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -38,4 +38,36 @@ public class Proxy {
public String getPassword() {
return password;
}
+
+ @Override
+ public boolean equals(Object o) { // field-by-field equality over host, port, username and password
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false; // exact class match: an instance of a subclass never equals a plain Proxy
+
+ Proxy proxy = (Proxy) o;
+
+ if (port != proxy.port) return false;
+ if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
+ if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
+ return password != null ? password.equals(proxy.password) : proxy.password == null;
+ }
+
+ @Override
+ public int hashCode() { // combines the same four fields that equals compares; null fields contribute 0
+ int result = host != null ? host.hashCode() : 0;
+ result = 31 * result + port;
+ result = 31 * result + (username != null ? username.hashCode() : 0);
+ result = 31 * result + (password != null ? password.hashCode() : 0);
+ return result;
+ }
+
+ @Override
+ public String toString() { // diagnostic form; a non-null password is masked so credentials cannot leak into logs
+ return "Proxy{" +
+ "host='" + host + '\'' +
+ ", port=" + port +
+ ", username='" + username + '\'' +
+ ", password='" + (password == null ? null : "******") + '\'' +
+ '}';
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
index 4266d78c94652a916c02f8c4bab3194fc154d872..5b61a993ac0b533ed0cc3d51f6d0fa2a6f349726 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
@@ -1,14 +1,29 @@
package us.codecraft.webmagic.proxy;
+import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;
/**
- * Created by edwardsbean on 15-2-28.
+ * Proxy provider.
+ *
+ * @since 0.7.0
*/
public interface ProxyProvider {
- void returnProxy(Proxy proxy, boolean banned, Task task);
+ /**
+ *
+ * Returns a proxy to the provider after a download attempt has finished.
+ * @param proxy the proxy that was used (host, port and credentials)
+ * @param page the download result
+ * @param task the download task
+ */
+ void returnProxy(Proxy proxy, Page page, Task task);
+ /**
+ * Selects a proxy for the task; the selection strategy is up to the implementation.
+ * @param task the download task
+ * @return proxy
+ */
Proxy getProxy(Task task);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
deleted file mode 100644
index 3e68c11687c6acc35d85fbf2bb7bfc6fdb18bdcc..0000000000000000000000000000000000000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
+++ /dev/null
@@ -1,13 +0,0 @@
-package us.codecraft.webmagic.proxy;
-
-import org.apache.http.HttpResponse;
-
-/**
- * @author code4crafter@gmail.com
- * Date: 17/3/20
- * Time: 下午10:52
- */
-public interface ResponseChecker {
-
- boolean isBanned(HttpResponse httpResponse);
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
new file mode 100644
index 0000000000000000000000000000000000000000..d8f47fe44bd506c1482a18580fc64ed2051c212c
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
@@ -0,0 +1,62 @@
+package us.codecraft.webmagic.proxy;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Task;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * A simple ProxyProvider. Provide proxy as round-robin without heartbeat and error check. It can be used when all proxies are stable.
+ * @author code4crafter@gmail.com
+ * Date: 17/4/16
+ * Time: 10:18
+ * @since 0.7.0
+ */
+public class SimpleProxyProvider implements ProxyProvider {
+
+ private final List proxies;
+
+ private final AtomicInteger pointer; // index of the most recently handed-out proxy; starts at -1 so the first call yields index 0
+
+ public SimpleProxyProvider(List proxies) { // NOTE(review): the list is stored as-is (no defensive copy); only from() wraps it as unmodifiable
+ this(proxies, new AtomicInteger(-1));
+ }
+
+ private SimpleProxyProvider(List proxies, AtomicInteger pointer) {
+ this.proxies = proxies;
+ this.pointer = pointer;
+ }
+
+ public static SimpleProxyProvider from(Proxy... proxies) { // copies the varargs into an unmodifiable list before constructing the provider
+ List proxiesTemp = new ArrayList(proxies.length);
+ for (Proxy proxy : proxies) {
+ proxiesTemp.add(proxy);
+ }
+ return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp));
+ }
+
+ @Override
+ public void returnProxy(Proxy proxy, Page page, Task task) {
+ // Nothing to do: proxies are handed out round-robin and not tracked after use
+ }
+
+ @Override
+ public Proxy getProxy(Task task) { // NOTE(review): with an empty proxy list incrForLoop fails on p % size (division by zero)
+ return proxies.get(incrForLoop());
+ }
+
+ private int incrForLoop() { // advances the shared pointer and returns the next index, wrapped into [0, size)
+ int p = pointer.incrementAndGet();
+ int size = proxies.size();
+ if (p < size) {
+ return p;
+ }
+ while (!pointer.compareAndSet(p, p % size)) { // pointer ran past the end: fold it back into range via compare-and-set, re-reading when another thread got there first
+ p = pointer.get();
+ }
+ return p % size;
+ }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java
deleted file mode 100644
index 7002df47b9eacb6c8cd5df3224a1632b76d0aea9..0000000000000000000000000000000000000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java
+++ /dev/null
@@ -1,159 +0,0 @@
-package us.codecraft.webmagic.proxy;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.Delayed;
-import java.util.concurrent.TimeUnit;
-
-/**
- * >>>> Proxy lifecycle
-
- +----------+ +-----+
- | last use | | new |
- +-----+----+ +---+-+
- | +------+ |
- +->| init |<--+
- +--+---+
- |
- v
- +--------+
- +--->| borrow |
- | +---+----+
- | |+------------------+
- | v
- | +--------+
- | | in use | Respone Time
- | +---+----+
- | |+------------------+
- | v
- | +--------+
- | | return |
- | +---+----+
- | |+-------------------+
- | v
- | +-------+ reuse interval
- | | delay | (delay time)
- | +---+---+
- | |+-------------------+
- | v
- | +------+
- | | idle | idle time
- | +---+--+
- | |+-------------------+
- +--------+
- */
-
-/**
- * Object has these status of lifecycle above.
- *
- * @author yxssfxwzy@sina.com
- * @since 0.5.1
- * @see TimerReuseProxyPool
- */
-
-public class TimerReuseProxy extends Proxy implements Delayed, Serializable {
-
- private static final long serialVersionUID = 228939737383625551L;
- public static final int ERROR_403 = 403;
- public static final int ERROR_404 = 404;
- public static final int ERROR_BANNED = 10000;// banned by website
- public static final int ERROR_Proxy = 10001;// the proxy itself failed
- public static final int SUCCESS = 200;
-
- private int reuseTimeInterval = 1500;// ms
- private Long canReuseTime = 0L;
- private Long lastBorrowTime = System.currentTimeMillis();
- private Long responseTime = 0L;
-
- private int failedNum = 0;
- private int successNum = 0;
- private int borrowNum = 0;
-
- private List failedErrorType = new ArrayList();
-
- public TimerReuseProxy(String host, int port, String username, String password) {
- super(host, port, username, password);
- }
-
-
- public int getSuccessNum() {
- return successNum;
- }
-
- public void successNumIncrement(int increment) {
- this.successNum += increment;
- }
-
- public Long getLastUseTime() {
- return lastBorrowTime;
- }
-
- public void setLastBorrowTime(Long lastBorrowTime) {
- this.lastBorrowTime = lastBorrowTime;
- }
-
- public void recordResponse() {
- this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
- this.lastBorrowTime = System.currentTimeMillis();
- }
-
- public List getFailedErrorType() {
- return failedErrorType;
- }
-
- public void setFailedErrorType(List failedErrorType) {
- this.failedErrorType = failedErrorType;
- }
-
- public void fail(int failedErrorType) {
- this.failedNum++;
- this.failedErrorType.add(failedErrorType);
- }
-
- public void setFailedNum(int failedNum) {
- this.failedNum = failedNum;
- }
-
- public int getFailedNum() {
- return failedNum;
- }
-
- public String getFailedType() {
- String re = "";
- for (Integer i : this.failedErrorType) {
- re += i + " . ";
- }
- return re;
- }
-
- public int getReuseTimeInterval() {
- return reuseTimeInterval;
- }
-
- public void setReuseTimeInterval(int reuseTimeInterval) {
- this.reuseTimeInterval = reuseTimeInterval;
- this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
-
- }
-
- @Override
- public long getDelay(TimeUnit unit) {
- return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
- }
-
- @Override
- public int compareTo(Delayed o) {
- TimerReuseProxy that = (TimerReuseProxy) o;
- return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
-
- }
-
- public void borrowNumIncrement(int increment) {
- this.borrowNum += increment;
- }
-
- public int getBorrowNum() {
- return borrowNum;
- }
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
deleted file mode 100644
index 6dbac5d58d0611a00d3b0d4861834d235bd21661..0000000000000000000000000000000000000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java
+++ /dev/null
@@ -1,204 +0,0 @@
-package us.codecraft.webmagic.proxy;
-
-import us.codecraft.webmagic.Task;
-
-/**
- * Pooled Proxy Object
- *
- * @author yxssfxwzy@sina.com
- * @see Proxy
- * @since 0.5.1
- */
-public class TimerReuseProxyPool implements ProxyProvider {
- @Override
- public void returnProxy(Proxy proxy, boolean banned, Task task) {
-
- }
-
- @Override
- public Proxy getProxy(Task task) {
- return null;
- }
-
-// private Logger logger = LoggerFactory.getLogger(getClass());
-//
-// private BlockingQueue proxyQueue = new DelayQueue();
-// private Map allProxy = new ConcurrentHashMap();
-//
-// private int reuseInterval = 1500;// ms
-// private int reviveTime = 2 * 60 * 60 * 1000;// ms
-// private int saveProxyInterval = 10 * 60 * 1000;// ms
-//
-// private boolean isEnable = false;
-// private boolean validateWhenInit = false;
-// // private boolean isUseLastProxy = true;
-//
-// public TimerReuseProxyPool(List httpProxyList) {
-// this(httpProxyList, true);
-// }
-//
-// private void addProxy(Map httpProxyMap) {
-// isEnable = true;
-// for (Entry entry : httpProxyMap.entrySet()) {
-// try {
-// if (allProxy.containsKey(entry.getKey())) {
-// continue;
-// }
-// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
-// entry.getValue().setFailedNum(0);
-// entry.getValue().setReuseTimeInterval(reuseInterval);
-// proxyQueue.add(entry.getValue());
-// allProxy.put(entry.getKey(), entry.getValue());
-// }
-// } catch (NumberFormatException e) {
-// logger.error("HttpHost init error:", e);
-// }
-// }
-// logger.info("proxy pool size>>>>" + allProxy.size());
-// }
-//
-// public void addProxy(Proxy... httpProxyList) {
-// isEnable = true;
-// for (Proxy proxy : httpProxyList) {
-// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
-// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval);
-// proxyQueue.add(p);
-// allProxy.put(p.getProxyHost().getHost(), p);
-// }
-// }
-// logger.info("proxy pool size>>>>" + allProxy.size());
-// }
-//
-// public TimerReuseProxy getProxy() {
-// TimerReuseProxy proxy = null;
-// try {
-// Long time = System.currentTimeMillis();
-// proxy = proxyQueue.take();
-// double costTime = (System.currentTimeMillis() - time) / 1000.0;
-// if (costTime > reuseInterval) {
-// logger.info("get proxy time >>>> " + costTime);
-// }
-// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
-// p.setLastBorrowTime(System.currentTimeMillis());
-// p.borrowNumIncrement(1);
-// } catch (InterruptedException e) {
-// logger.error("get proxy error", e);
-// }
-// if (proxy == null) {
-// throw new NoSuchElementException();
-// }
-// return proxy;
-// }
-//
-// public void returnProxy(Proxy proxy, int statusCode) {
-// TimerReuseProxy p = allProxy.get(proxy.getProxyHost());
-// if (p == null) {
-// return;
-// }
-// switch (statusCode) {
-// case TimerReuseProxy.SUCCESS:
-// p.setReuseTimeInterval(reuseInterval);
-// p.setFailedNum(0);
-// p.setFailedErrorType(new ArrayList());
-// p.recordResponse();
-// p.successNumIncrement(1);
-// break;
-// case TimerReuseProxy.ERROR_403:
-// // banned,try longer interval
-// p.fail(TimerReuseProxy.ERROR_403);
-// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
-// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
-// break;
-// case TimerReuseProxy.ERROR_BANNED:
-// p.fail(TimerReuseProxy.ERROR_BANNED);
-// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
-// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
-// break;
-// case TimerReuseProxy.ERROR_404:
-// // p.fail(Proxy.ERROR_404);
-// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
-// break;
-// default:
-// p.fail(statusCode);
-// break;
-// }
-// if (p.getFailedNum() > 20) {
-// p.setReuseTimeInterval(reviveTime);
-// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
-// return;
-// }
-// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
-// if (!ProxyUtils.validateProxy(proxy)) {
-// p.setReuseTimeInterval(reviveTime);
-// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
-// return;
-// }
-// }
-// try {
-// proxyQueue.put(p);
-// } catch (InterruptedException e) {
-// logger.warn("proxyQueue return proxy error", e);
-// }
-// }
-//
-// public String allProxyStatus() {
-// String re = "all proxy info >>>> \n";
-// for (Entry entry : allProxy.entrySet()) {
-// re += entry.getValue().toString() + "\n";
-// }
-// return re;
-// }
-//
-// public int getIdleNum() {
-// return proxyQueue.size();
-// }
-//
-// public int getReuseInterval() {
-// return reuseInterval;
-// }
-//
-// public void setReuseInterval(int reuseInterval) {
-// this.reuseInterval = reuseInterval;
-// }
-//
-// public void enable(boolean isEnable) {
-// this.isEnable = isEnable;
-// }
-//
-// public boolean isEnable() {
-// return isEnable;
-// }
-//
-// public int getReviveTime() {
-// return reviveTime;
-// }
-//
-// public void setReviveTime(int reviveTime) {
-// this.reviveTime = reviveTime;
-// }
-//
-// public boolean isValidateWhenInit() {
-// return validateWhenInit;
-// }
-//
-// public void validateWhenInit(boolean validateWhenInit) {
-// this.validateWhenInit = validateWhenInit;
-// }
-//
-// public int getSaveProxyInterval() {
-// return saveProxyInterval;
-// }
-//
-// public void setSaveProxyInterval(int saveProxyInterval) {
-// this.saveProxyInterval = saveProxyInterval;
-// }
-//
-// public String getProxyFilePath() {
-// return proxyFilePath;
-// }
-//
-// public void setProxyFilePath(String proxyFilePath) {
-// this.proxyFilePath = proxyFilePath;
-// }
-
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
index 8fa1b9ea29996c884b98614ebc79d80711d8d2fb..14cbaff327a3b9778ddd08372746a606e5577fd1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
@@ -1,6 +1,5 @@
package us.codecraft.webmagic.scheduler;
-import org.apache.http.annotation.ThreadSafe;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.NumberUtils;
@@ -16,7 +15,6 @@ import java.util.concurrent.PriorityBlockingQueue;
* @author code4crafter@gmail.com
* @since 0.2.1
*/
-@ThreadSafe
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
public static final int INITIAL_CAPACITY = 5;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
index 078506c6f9e09d4b1d1392275b8c0a064cf065d4..f9ad0e98f8fb0cff89d28248e852e417b5cd229d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
@@ -1,6 +1,5 @@
package us.codecraft.webmagic.scheduler;
-import org.apache.http.annotation.ThreadSafe;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
@@ -15,7 +14,6 @@ import java.util.concurrent.LinkedBlockingQueue;
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-@ThreadSafe
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private BlockingQueue queue = new LinkedBlockingQueue();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index d80e8b48e813b8a4e08a65a5e1299b6229618d10..f2218f12611a5dadeba6c651a740132cb698677b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -3,7 +3,6 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Entities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -20,33 +19,28 @@ public class Html extends HtmlNode {
private Logger logger = LoggerFactory.getLogger(getClass());
- private static volatile boolean INITED = false;
-
/**
* Disable jsoup html entity escape. It can be set just before any Html instance is created.
+ * @deprecated the jsoup 1.7.2 entity-escape hack that read this flag has been removed.
*/
public static boolean DISABLE_HTML_ENTITY_ESCAPE = false;
- /**
- * Disable jsoup html entity escape. It is a hack way only for jsoup 1.7.2.
- */
- private void disableJsoupHtmlEntityEscape() {
- if (DISABLE_HTML_ENTITY_ESCAPE && !INITED) {
- Entities.EscapeMode.base.getMap().clear();
- Entities.EscapeMode.extended.getMap().clear();
- Entities.EscapeMode.xhtml.getMap().clear();
- INITED = true;
- }
- }
-
/**
* Store parsed document for better performance when only one text exist.
*/
private Document document;
+ public Html(String text, String url) {
+ try {
+ this.document = Jsoup.parse(text, url);
+ } catch (Exception e) {
+ this.document = null;
+ logger.warn("parse document error ", e);
+ }
+ }
+
public Html(String text) {
try {
- disableJsoupHtmlEntityEscape();
this.document = Jsoup.parse(text);
} catch (Exception e) {
this.document = null;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 030522f0ab3f8f32e45218896396dae66bff5126..c063b48259e2f8ab3fec807b87ca7322a75b8a6d 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -34,7 +34,7 @@ public class HtmlNode extends AbstractSelectable {
@Override
public Selectable links() {
- return xpath("//a/@href");
+ return selectElements(new LinksSelector());
}
@Override
@@ -90,7 +90,7 @@ public class HtmlNode extends AbstractSelectable {
* See: https://github.com/code4craft/webmagic/issues/113
*
* @param elementIterator elementIterator
- * @param element element
+ * @return the element
*/
private Element checkElementAndConvert(ListIterator elementIterator) {
Element element = elementIterator.next();
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
index b0b90f9bf2fcc75445184a7d379eccc47f3bdd41..f5c0baeb591a132d8272dc795e537f77f8c03c89 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
@@ -1,9 +1,11 @@
package us.codecraft.webmagic.selector;
+import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.JsonPath;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
/**
* JsonPath selector.
@@ -32,12 +34,20 @@ public class JsonPathSelector implements Selector {
if (object instanceof List) {
List list = (List) object;
if (list != null && list.size() > 0) {
- return list.iterator().next().toString();
+ return toString(list.iterator().next());
}
}
return object.toString();
}
+ private String toString(Object object) {
+ if (object instanceof Map) {
+ return JSON.toJSONString(object);
+ } else {
+ return String.valueOf(object);
+ }
+ }
+
@Override
public List selectList(String text) {
List list = new ArrayList();
@@ -48,10 +58,10 @@ public class JsonPathSelector implements Selector {
if (object instanceof List) {
List