代码拉取完成,页面将自动刷新
同步操作将从 GoLibs/wechat_spider 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
package wechat_spider
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
"time"
"github.com/palantir/stacktrace"
)
// Processor is the contract a wechat history-page spider must satisfy.
type Processor interface {
	// Process is the core method: it parses the captured request and its
	// raw response body, collecting article URLs.
	Process(req *http.Request, data []byte) error
	// Urls returns the article URLs collected so far.
	Urls() []string
	// Output prints the collected result.
	Output()
	// Sleep pauses between page requests to avoid wechat's request
	// frequency control.
	Sleep()
}
// BaseProcessor is the default Processor implementation; it extracts
// article URLs from a captured wechat history page and then walks the
// paginated history endpoint.
type BaseProcessor struct {
	req    *http.Request // originating request; its query and headers are reused for paging
	lastId string        // last message id seen; used as the paging cursor (frommsgid)
	data   []byte        // raw response body of the originating request
	result []string      // accumulated article URLs
}
var (
	// replacer strips whitespace and un-escapes the HTML entities that
	// wechat embeds in the msgList JSON, so urlRegex/idRegex can match.
	// NOTE(review): the entity literals below ("&quot;", "&nbsp;",
	// "&amp;amp;", "&amp;") were HTML-decoded by the page this file was
	// copied from, which broke compilation; they are restored here.
	replacer = strings.NewReplacer(
		"\t", "", " ", "",
		"&quot;", `"`, "&nbsp;", "",
		`\\`, "", "&amp;amp;", "&amp;",
		"&amp;", "&", `\`, "",
	)
	// urlRegex matches an article link up to (excluding) any '#' fragment.
	urlRegex = regexp.MustCompile("http://mp.weixin.qq.com/s?[^#]*")
	// idRegex captures the numeric message id from the JSON payload.
	idRegex = regexp.MustCompile(`"id":(\d+)`)
	// MsgNotFound is the sentinel error returned when no msgList data is
	// found in a page.
	MsgNotFound = errors.New("MsgLists not found")
)
// NewBaseProcessor returns a zero-valued BaseProcessor ready for Process.
func NewBaseProcessor() *BaseProcessor {
	return new(BaseProcessor)
}
// init records the captured request and its raw body before processing.
// It currently cannot fail but keeps the error return for symmetry.
func (p *BaseProcessor) init(req *http.Request, data []byte) error {
	p.req, p.data = req, data
	fmt.Println("Running a new wechat processor, please wait...")
	return nil
}
// Process runs the full pipeline: stash the request/body, parse the main
// history page, then walk the paginated endpoint.
func (p *BaseProcessor) Process(req *http.Request, data []byte) error {
	if err := p.init(req, data); err != nil {
		return err
	}
	if err := p.processMain(); err != nil {
		return err
	}
	return p.processPages()
}
// Sleep pauses briefly between page fetches to stay under wechat's
// request-frequency control.
func (p *BaseProcessor) Sleep() {
	const pause = 50 * time.Millisecond
	time.Sleep(pause)
}
// Urls returns the article URLs collected so far.
func (p *BaseProcessor) Urls() []string {
	return p.result
}
// Output prints the collected URLs, comma-joined, between bracket lines.
func (p *BaseProcessor) Output() {
	joined := strings.Join(p.Urls(), ",")
	fmt.Println("result => [")
	fmt.Println(joined)
	fmt.Println("]")
}
//Parse the html
func (p *BaseProcessor) processMain() error {
p.result = make([]string, 0, 100)
buffer := bytes.NewBuffer(p.data)
var msgs string
str, err := buffer.ReadString('\n')
for err == nil {
if strings.Contains(str, "msgList = ") {
msgs = str
break
}
str, err = buffer.ReadString('\n')
}
if msgs == "" {
return stacktrace.Propagate(MsgNotFound, "Failed parse main")
}
msgs = replacer.Replace(msgs)
p.result = urlRegex.FindAllString(msgs, -1)
if len(p.result) < 1 {
return stacktrace.Propagate(MsgNotFound, "Failed find url in main")
}
idMatcher := idRegex.FindAllStringSubmatch(msgs, -1)
if len(idMatcher) < 1 {
return stacktrace.Propagate(MsgNotFound, "Failed find id in main")
}
p.lastId = idMatcher[len(idMatcher)-1][1]
return nil
}
// processPages fetches the paginated history endpoint (recursively) from
// the current lastId cursor, appending article URLs until a page yields
// no further results.
func (p *BaseProcessor) processPages() error {
	pageUrl := p.genUrl()
	p.logf("process pages....")
	req, err := http.NewRequest("GET", pageUrl, nil)
	if err != nil {
		return stacktrace.Propagate(err, "Failed new page request")
	}
	// Replay the captured headers (cookies/tokens) so wechat accepts us.
	for k := range p.req.Header {
		req.Header.Set(k, p.req.Header.Get(k))
	}
	req.Header.Set("Content-Type", "application/json; charset=UTF-8")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return stacktrace.Propagate(err, "Failed get page response")
	}
	defer resp.Body.Close()
	// BUG FIX: the read error was previously discarded, silently treating
	// a truncated body as "no more pages".
	bs, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return stacktrace.Propagate(err, "Failed read page body")
	}
	str := replacer.Replace(string(bs))
	result := urlRegex.FindAllString(str, -1)
	if len(result) < 1 {
		// No links on this page: this is the recursion's normal
		// termination. (The old code returned stacktrace.Propagate with a
		// nil cause here, which is nil — made explicit.)
		return nil
	}
	idMatcher := idRegex.FindAllStringSubmatch(str, -1)
	if len(idMatcher) < 1 {
		return nil // same nil-cause termination as above, made explicit
	}
	p.lastId = idMatcher[len(idMatcher)-1][1]
	p.logf("Page Get => %d,lastid: %s", len(result), p.lastId)
	p.result = append(p.result, result...)
	// lastId is non-empty after a successful match, so recursion continues
	// until a page produces no urls/ids (the returns above).
	if p.lastId != "" {
		p.Sleep()
		return p.processPages()
	}
	return nil
}
// Save is a no-op placeholder; embedding processors may override it to
// persist results. (Receiver renamed P -> p for consistency with the
// type's other methods.)
func (p *BaseProcessor) Save() {
}
// genUrl builds the next history-page URL: the original request's query
// string plus the frommsgid paging cursor and a fixed page size of 100.
func (p *BaseProcessor) genUrl() string {
	base := "http://mp.weixin.qq.com/mp/getmasssendmsg?" + p.req.URL.RawQuery
	return base + "&frommsgid=" + p.lastId + "&f=json&count=100"
}
// logf forwards to the package-level Logger when Verbose is enabled.
// (Receiver renamed P -> p for consistency with the type's other methods.)
func (p *BaseProcessor) logf(format string, msg ...interface{}) {
	if Verbose {
		Logger.Printf(format, msg...)
	}
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。