1 Star 0 Fork 4

medy13 / wechat_spider

forked from GoLibs / wechat_spider 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
processor.go 3.67 KB
一键复制 编辑 原始数据 按行查看 历史
package wechat_spider
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
"time"
"github.com/palantir/stacktrace"
)
// Processor is the contract a wechat history-page handler must satisfy.
type Processor interface {
	// Process is the core method: it receives the request together with
	// the raw data associated with it and reports any failure.
	Process(req *http.Request, data []byte) error

	// Urls returns the result urls.
	Urls() []string

	// Output emits the result.
	Output()

	// Sleep pauses between requests to avoid the request control of wechat.
	Sleep()
}
// BaseProcessor holds the state of one processing run.
type BaseProcessor struct {
	// req is the request handed to Process; its headers and query string
	// are reused when requesting further pages.
	req *http.Request
	// lastId is the most recent message id matched in the processed data.
	lastId string
	// data is the raw payload handed to Process.
	data []byte
	// result accumulates the matched urls.
	result []string
}
var (
	// replacer normalises the escaped payload before the regexes run:
	// it strips tabs, spaces and backslashes and decodes the HTML entities
	// that appear in the embedded message list. Arguments are old/new
	// pairs; "&amp;amp;" is listed before "&amp;" so the doubly-escaped
	// form is matched first.
	replacer = strings.NewReplacer(
		"\t", "", " ", "",
		"&quot;", `"`, "&nbsp;", "",
		`\\`, "", "&amp;amp;", "&",
		"&amp;", "&", `\`, "",
	)
	// urlRegex extracts article urls up to (not including) the first '#'.
	// NOTE(review): '.' and '?' are regex metacharacters here, so the
	// pattern is looser than the literal prefix suggests; kept unchanged
	// because the collected results depend on it.
	urlRegex = regexp.MustCompile("http://mp.weixin.qq.com/s?[^#]*")
	// idRegex captures the numeric value of every "id" field.
	idRegex = regexp.MustCompile(`"id":(\d+)`)
	// MsgNotFound is the sentinel cause used when the message list, its
	// urls or its ids cannot be located.
	MsgNotFound = errors.New("MsgLists not found")
)
// NewBaseProcessor returns a pointer to a zero-valued BaseProcessor.
func NewBaseProcessor() *BaseProcessor {
	return new(BaseProcessor)
}
// init records the request and payload for the current run and prints a
// start-up notice. It always returns nil.
func (p *BaseProcessor) init(req *http.Request, data []byte) (err error) {
	p.req, p.data = req, data
	fmt.Println("Running a new wechat processor, please wait...")
	return nil
}
// Process stores the request and data, then runs processMain followed by
// processPages. The first error encountered is returned as-is and the
// remaining stages are skipped.
func (p *BaseProcessor) Process(req *http.Request, data []byte) error {
	if err := p.init(req, data); err != nil {
		return err
	}
	// Stages run strictly in this order.
	stages := []func() error{p.processMain, p.processPages}
	for _, stage := range stages {
		if err := stage(); err != nil {
			return err
		}
	}
	return nil
}
// Sleep blocks the caller for 50 milliseconds.
func (p *BaseProcessor) Sleep() {
	const pause = 50 * time.Millisecond
	time.Sleep(pause)
}
// Urls returns the urls collected so far. The returned slice is the
// processor's own slice, not a copy.
func (p *BaseProcessor) Urls() []string {
	collected := p.result
	return collected
}
// Output prints the collected urls to standard output as one
// comma-separated line between a "result => [" header and a "]" footer.
func (p *BaseProcessor) Output() {
	fmt.Println("result => [")
	joined := strings.Join(p.Urls(), ",")
	fmt.Println(joined)
	fmt.Println("]")
}
// processMain parses the html held in p.data.
//
// It scans the data line by line for the first line containing
// "msgList = ", normalises that line with replacer, and then extracts every
// url (urlRegex) into p.result and the last message id (idRegex) into
// p.lastId.
//
// It returns MsgNotFound, wrapped by stacktrace.Propagate, when the line,
// the urls or the ids cannot be found.
func (p *BaseProcessor) processMain() error {
	// Drop any results left over from a previous run.
	p.result = []string{}

	buffer := bytes.NewBuffer(p.data)
	var msgs string
	for {
		line, err := buffer.ReadString('\n')
		// Inspect the line before looking at err: ReadString returns the
		// trailing data together with the error when the input does not
		// end in a newline, and that final line must be checked as well.
		if strings.Contains(line, "msgList = ") {
			msgs = line
			break
		}
		if err != nil {
			break
		}
	}
	if msgs == "" {
		return stacktrace.Propagate(MsgNotFound, "Failed parse main")
	}

	msgs = replacer.Replace(msgs)
	p.result = urlRegex.FindAllString(msgs, -1)
	if len(p.result) < 1 {
		return stacktrace.Propagate(MsgNotFound, "Failed find url in main")
	}

	idMatcher := idRegex.FindAllStringSubmatch(msgs, -1)
	if len(idMatcher) < 1 {
		return stacktrace.Propagate(MsgNotFound, "Failed find id in main")
	}
	// The last id on the page is stored in p.lastId, which genUrl uses as
	// the frommsgid parameter of the next request.
	p.lastId = idMatcher[len(idMatcher)-1][1]
	return nil
}
// processPages requests follow-up pages one after another, starting from
// p.lastId, and appends every url found to p.result.
//
// Paging stops, with a nil error, as soon as a page yields no url or no
// id, or when the cursor stops advancing. An error is returned only when
// a request cannot be built, sent or read.
func (p *BaseProcessor) processPages() (err error) {
	for {
		pageUrl := p.genUrl()
		p.logf("process pages....")

		var str string
		if str, err = p.fetchPage(pageUrl); err != nil {
			return err
		}

		// A page without urls or ids marks the end of the history. This
		// is the normal way out of the loop, so it is not an error.
		result := urlRegex.FindAllString(str, -1)
		if len(result) < 1 {
			return nil
		}
		idMatcher := idRegex.FindAllStringSubmatch(str, -1)
		if len(idMatcher) < 1 {
			return nil
		}

		previousId := p.lastId
		p.lastId = idMatcher[len(idMatcher)-1][1]
		p.logf("Page Get => %d,lastid: %s", len(result), p.lastId)
		p.result = append(p.result, result...)

		// Stop if there is no cursor, or if it did not move: requesting
		// the same frommsgid again would repeat this page forever.
		if p.lastId == "" || p.lastId == previousId {
			return nil
		}
		p.Sleep()
	}
}

// fetchPage performs one GET request against pageUrl, reusing the headers
// of the original request, and returns the response body after it has
// been normalised with replacer. The response body is closed before
// fetchPage returns, so open bodies do not pile up across pages.
func (p *BaseProcessor) fetchPage(pageUrl string) (string, error) {
	req, err := http.NewRequest("GET", pageUrl, nil)
	if err != nil {
		return "", stacktrace.Propagate(err, "Failed new page request")
	}
	// Copy the first value of every header of the original request.
	for k := range p.req.Header {
		req.Header.Set(k, p.req.Header.Get(k))
	}
	req.Header.Set("Content-Type", "application/json; charset=UTF-8")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", stacktrace.Propagate(err, "Failed get page response")
	}
	defer resp.Body.Close()

	bs, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", stacktrace.Propagate(err, "Failed read page response")
	}
	return replacer.Replace(string(bs)), nil
}
// Save is a no-op placeholder; the base processor persists nothing.
func (p *BaseProcessor) Save() {}
// genUrl builds the address of the next page: the getmasssendmsg endpoint
// followed by the raw query string of the original request, the current
// p.lastId as frommsgid, and fixed f=json&count=100 parameters.
func (p *BaseProcessor) genUrl() string {
	const endpoint = "http://mp.weixin.qq.com/mp/getmasssendmsg?"
	return endpoint + p.req.URL.RawQuery + "&frommsgid=" + p.lastId + "&f=json&count=100"
}
// logf forwards a formatted message to the package Logger, but only when
// the package-level Verbose flag is set.
func (p *BaseProcessor) logf(format string, msg ...interface{}) {
	if !Verbose {
		return
	}
	Logger.Printf(format, msg...)
}
Go
1
https://gitee.com/medy13/wechat_spider.git
git@gitee.com:medy13/wechat_spider.git
medy13
wechat_spider
wechat_spider
master

搜索帮助