代码拉取完成,页面将自动刷新
package WebCrawler
import (
"gitee.com/fierce_wolf/go-fox-edge-common/commUtil/Slice"
"github.com/PuerkitoBio/goquery"
"regexp"
"strings"
)
// webSite is the receiver for the page-scraping helpers in this file.
// It has no fields, so its zero value is ready to use.
type webSite struct{}
// ReadPage fetches host+url and collects the head information, links, text
// elements, and images of the returned document into a WebPage.
//
// An error from Http.HttpGet is returned unchanged together with a nil page.
func (e *webSite) ReadPage(host, url string) (*WebPage, error) {
	doc, err := Http.HttpGet(host+url, 0)
	if err != nil {
		return nil, err
	}

	root := doc.Selection

	// Head metadata of the page.
	head := e.GetHead(root)

	// Links to the next level; an entry whose href is exactly the host is dropped.
	links := e.GetHrefs(root)
	delete(links, host)

	// Text-bearing elements.
	texts := e.GetTexts(root)

	// Image elements.
	images := e.GetImages(root)

	return &WebPage{
		Host:   host,
		Url:    url,
		Head:   head,
		Hrefs:  links,
		Texts:  texts,
		Images: images,
	}, nil
}
// FindTexts hands the page's text elements to each finder in turn, which is
// mainly useful for analysing the content of a page.
//
// finders are invoked in slice order and every one receives page.Texts.
// The same page is returned and the error is always nil.
func (e *webSite) FindTexts(page *WebPage, finders []IFindElement) (*WebPage, error) {
	for i := 0; i < len(finders); i++ {
		finders[i].Find(page.Texts)
	}
	return page, nil
}
// FindImages hands the page's image elements to each finder in turn, which is
// mainly useful for analysing the content of a page.
//
// finders are invoked in slice order and every one receives page.Images.
// The same page is returned and the error is always nil.
func (e *webSite) FindImages(page *WebPage, finders []IFindElement) (*WebPage, error) {
	for i := 0; i < len(finders); i++ {
		finders[i].Find(page.Images)
	}
	return page, nil
}
// GetHead extracts the common head metadata of a document: the title, the
// description meta tag, and the keywords meta tag.
//
// Keywords are split on commas; surrounding whitespace is trimmed and empty
// entries are dropped. A page without keywords therefore yields an empty,
// non-nil slice rather than a slice holding a single empty string.
func (e *webSite) GetHead(selection *goquery.Selection) *WebHead {
	head := &WebHead{}

	// Text() joins the text of every matched node, and a document can hold
	// more than one <title> (inline SVG, for example), so only the first
	// match is read.
	head.Title = selection.Find("title").First().Text()
	head.Description = selection.Find("meta[name=description]").AttrOr("content", "")

	raw := selection.Find("meta[name=keywords]").AttrOr("content", "")
	parts := strings.Split(raw, ",")
	keywords := make([]string, 0, len(parts))
	for _, part := range parts {
		if kw := strings.TrimSpace(part); kw != "" {
			keywords = append(keywords, kw)
		}
	}
	head.Keywords = keywords

	return head
}
// GetHrefs collects the links of every <a> element under selection, keyed by
// the raw href value.
//
// Anchors whose href is missing, empty, or exactly "#" are skipped. When the
// same href appears more than once, the last anchor visited is the one kept.
func (e *webSite) GetHrefs(selection *goquery.Selection) map[string]*WebHref {
	links := make(map[string]*WebHref)

	selection.Find("a").Each(func(_ int, anchor *goquery.Selection) {
		// A missing attribute is treated like an empty one.
		target := anchor.AttrOr("href", "")
		switch target {
		case "", "#":
			// Nothing to follow: the link points back at the current page.
			return
		}

		links[target] = &WebHref{Href: target, Title: anchor.Text()}
	})

	return links
}
// GetTexts gathers the text elements found under selection for a fixed list
// of tags, querying one tag at a time via getTexts and combining the partial
// results with Slice.Append.
//
// NOTE(review): the list contains "h", which is not a standard HTML tag, and
// has no "h6" — confirm whether that is intended.
func (e *webSite) GetTexts(selection *goquery.Selection) []*WebElement {
	// Tags are queried in exactly this order.
	tags := []string{"h", "h1", "h2", "h3", "h4", "h5", "p", "a", "span"}

	found := make([]*WebElement, 0)
	for i := range tags {
		found = Slice.Append(found, e.getTexts(selection, tags[i]))
	}
	return found
}
// getTexts returns one WebElement for every node under selection that matches
// selector and whose whitespace-trimmed text is not empty.
//
// selector is the HTML tag to look up (for example "h2", "a", or "p"); it is
// also recorded as the Tag of each element produced.
func (e *webSite) getTexts(selection *goquery.Selection, selector string) []*WebElement {
	elements := make([]*WebElement, 0)

	matched := selection.Find(selector)
	matched.Each(func(_ int, node *goquery.Selection) {
		if content := strings.TrimSpace(node.Text()); content != "" {
			elements = append(elements, &WebElement{
				Text:      content,
				Tag:       selector,
				Selection: node,
			})
		}
	})

	return elements
}
// GetImages returns one WebElement for every <img> under selection, using the
// src attribute as the element's Text and "img" as its Tag.
//
// Images whose src is missing or at most one byte long are skipped.
func (e *webSite) GetImages(selection *goquery.Selection) []*WebElement {
	images := make([]*WebElement, 0)

	selection.Find("img").Each(func(_ int, img *goquery.Selection) {
		// A missing src is treated like an empty one and falls under the length check.
		src := img.AttrOr("src", "")
		if len(src) < 2 {
			return
		}

		images = append(images, &WebElement{Text: src, Tag: "img", Selection: img})
	})

	return images
}
// FindElements returns the elements of els accepted by finder, preserving
// their order. The result is always a non-nil slice.
//
// finder.Mode selects how finder.Param is used:
//   - ModeContain: Param is a string; an element matches when its Text
//     contains Param, ignoring case.
//   - ModeEqual: an element matches when its Text equals Param.
//   - ModeRegexp: Param is a *regexp.Regexp; an element matches when the
//     expression finds at least one match in its Text.
//   - ModeElTag: an element matches when its Tag equals Param.
//
// Any other mode, or a Param of the wrong type (or nil) for ModeContain and
// ModeRegexp, selects nothing instead of panicking.
func (e *webSite) FindElements(els []*WebElement, finder Filter) []*WebElement {
	data := make([]*WebElement, 0)

	// The mode and its parameter do not change between elements, so they are
	// resolved once into a predicate before the slice is scanned.
	var accept func(node *WebElement) bool
	switch finder.Mode {
	case ModeContain:
		needle, ok := finder.Param.(string)
		if !ok {
			return data
		}
		needle = strings.ToLower(needle)
		accept = func(node *WebElement) bool {
			return strings.Contains(strings.ToLower(node.Text), needle)
		}
	case ModeEqual:
		accept = func(node *WebElement) bool { return node.Text == finder.Param }
	case ModeRegexp:
		re, ok := finder.Param.(*regexp.Regexp)
		if !ok || re == nil {
			return data
		}
		// Only existence of a match matters, so there is no need to collect them all.
		accept = func(node *WebElement) bool { return re.MatchString(node.Text) }
	case ModeElTag:
		accept = func(node *WebElement) bool { return node.Tag == finder.Param }
	default:
		return data
	}

	for _, node := range els {
		if accept(node) {
			data = append(data, node)
		}
	}
	return data
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。