// Package crawler parses crawled pages and rewrites their links for local mirroring.
package crawler
import (
"strings"
"gitee.com/generals-space/site-mirror-go.git/model"
"github.com/PuerkitoBio/goquery"
)
// ParseLinkingPages parses and rewrites page-type links in the document,
// covering both <a> and <iframe> elements.
//
// NOTE(review): the original doc comment already promised <iframe> handling
// but only <a> was wired up; the iframe pass (src attribute) is now included
// as documented.
func (crawler *Crawler) ParseLinkingPages(htmlDom *goquery.Document, req *model.URLRecord) {
	// <a href="..."> links to other pages.
	aList := htmlDom.Find("a")
	crawler.parseLinkingPages(aList, req, "href")
	// <iframe src="..."> embeds other pages; treat them as page links too.
	iframeList := htmlDom.Find("iframe")
	crawler.parseLinkingPages(iframeList, req, "src")
}
// parseLinkingPages walks every node in the selection, rewrites its link
// attribute to a local path, and enqueues the original URL as a new page task.
func (crawler *Crawler) parseLinkingPages(nodeList *goquery.Selection, req *model.URLRecord, attrName string) {
	nodeList.Each(func(_ int, node *goquery.Selection) {
		rawLink, ok := node.Attr(attrName)
		if !ok || emptyLinkPattern.MatchString(rawLink) {
			return // attribute missing, or an empty/non-navigable link
		}
		fullURL, fullURLWithoutFrag := joinURL(req.URL, rawLink)
		if !URLFilter(fullURL, model.URLTypePage, crawler.Config) {
			return
		}
		localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypePage)
		if err != nil {
			return
		}
		node.SetAttr(attrName, localLink)
		// Enqueue a follow-up crawl task for the linked page.
		task := &model.URLRecord{
			URL:     fullURLWithoutFrag,
			URLType: model.URLTypePage,
			Refer:   req.URL,
			Depth:   req.Depth + 1,
		}
		crawler.EnqueuePage(task)
	})
}
// ParseLinkingAssets parses and rewrites static-asset references in the
// document: stylesheets (<link>), scripts, images, video and audio elements.
func (crawler *Crawler) ParseLinkingAssets(htmlDom *goquery.Document, req *model.URLRecord) {
	// Each asset-bearing element paired with the attribute holding its URL.
	targets := []struct {
		selector string
		attr     string
	}{
		{"link", "href"},
		{"script", "src"},
		{"img", "src"},
		{"video", "src"},
		{"audio", "src"},
	}
	for _, t := range targets {
		crawler.parseLinkingAssets(htmlDom.Find(t.selector), req, t.attr)
	}
}
// parseLinkingAssets walks every node in the selection, rewrites its URL
// attribute to a local path, and enqueues the original URL as an asset task.
func (crawler *Crawler) parseLinkingAssets(nodeList *goquery.Selection, req *model.URLRecord, attrName string) {
	nodeList.Each(func(_ int, node *goquery.Selection) {
		rawLink, ok := node.Attr(attrName)
		if !ok || emptyLinkPattern.MatchString(rawLink) {
			return // attribute missing, or an empty/non-navigable link
		}
		fullURL, fullURLWithoutFrag := joinURL(req.URL, rawLink)
		if !URLFilter(fullURL, model.URLTypeAsset, crawler.Config) {
			return
		}
		localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypeAsset)
		if err != nil {
			return
		}
		node.SetAttr(attrName, localLink)
		// Enqueue a download task for the referenced asset.
		task := &model.URLRecord{
			URL:     fullURLWithoutFrag,
			URLType: model.URLTypeAsset,
			Refer:   req.URL,
			Depth:   req.Depth + 1,
		}
		crawler.EnqueueAsset(task)
	})
}
// parseCSSFile rewrites asset references found in a CSS file to local paths
// and enqueues each referenced asset for download.
//
// CSS may reference assets through url(...) values (including in
// background-image), in any of the forms url('./bg.jpg'), url("./bg.jpg")
// or url(bg.jpg).
func (crawler *Crawler) parseCSSFile(content []byte, req *model.URLRecord) (newContent []byte, err error) {
	fileStr := string(content)
	// FindAllStringSubmatch returns a slice of all matches; each member is
	// itself laid out like a FindStringSubmatch result (index 0 is the whole
	// match, the remaining entries are the capture groups).
	matchedArray := cssAssetURLPattern.FindAllStringSubmatch(fileStr, -1)
	for _, matchedItem := range matchedArray {
		for _, matchedURL := range matchedItem[1:] {
			if matchedURL == "" || emptyLinkPattern.MatchString(matchedURL) {
				continue
			}
			fullURL, fullURLWithoutFrag := joinURL(req.URL, matchedURL)
			// BUG FIX: the original code `return`ed here, aborting the whole
			// parse (and returning a nil newContent, dropping the entire CSS
			// content) as soon as one URL was filtered out. Skip just this
			// URL instead, matching the per-item skip used elsewhere.
			if !URLFilter(fullURL, model.URLTypeAsset, crawler.Config) {
				continue
			}
			localLink, err := TransToLocalLink(crawler.Config.MainSite, fullURL, model.URLTypeAsset)
			if err != nil {
				continue
			}
			fileStr = strings.Replace(fileStr, matchedURL, localLink, -1)
			// Enqueue a download task for the referenced asset.
			task := &model.URLRecord{
				URL:     fullURLWithoutFrag,
				URLType: model.URLTypeAsset,
				Refer:   req.URL,
				Depth:   req.Depth + 1,
			}
			crawler.EnqueueAsset(task)
		}
	}
	newContent = []byte(fileStr)
	return newContent, nil
}