1 Star 0 Fork 1

roberChen / webfetch

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
fetch.go 8.18 KB
一键复制 编辑 原始数据 按行查看 历史
roberChen 提交于 2020-09-03 13:23 . Solve css url abstract bug. Add version
package main
import (
"bytes"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"time"
"github.com/cheggaaa/pb/v3"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/extensions"
)
// Package-level state. version and buildtime are expected to be injected
// at link time (e.g. -ldflags "-X main.version=..."); the rest are bound
// to command-line flags in init and read throughout main.
var (
version string // release version string (set via -ldflags; may be empty)
buildtime string // build timestamp (set via -ldflags; may be empty)
topname string // host + directory prefix of the start URL; crawl stays under it unless -outer
help *bool // -help / -h: print usage page and exit
verbose *bool // -v: verbose logging
vverbose *bool // -vv: even more verbose logging (implies -v)
silent *bool // -s: silence mode, suppresses both verbose levels
topnsplit []string // NOTE(review): declared but never referenced in this file — possibly dead
start *string // -url: start point of the crawl
proxyurl *string // -proxy: proxy URL, used only when -p is given
useproxy *bool // -p: route requests through proxyurl
timeout *int // -T / -timeout: request timeout in seconds
outer *bool // -outer: allow fetching URLs outside the allowed domains
target targetmap // -t: element[attr] link targets scanned on each page
domains targetmap // -U: allowed domains set
update *bool // -u: skip files that already exist locally
ignoreerror *bool // -I: ignore per-file download errors instead of aborting
showversion *bool // -version: print version info and exit
)
// targetmap is a string set that satisfies flag.Value, letting one flag
// (-t, -U) be repeated on the command line to accumulate entries.
type targetmap map[string]bool

// String renders the set the way flag prints default values.
func (t targetmap) String() string {
	return fmt.Sprintf("%v\n", map[string]bool(t))
}

// Set records a single flag occurrence as a member; it never fails.
func (t targetmap) Set(s string) error {
	t[s] = true
	return nil
}
// init registers every command-line flag and allocates the flag-backed
// set variables before main calls flag.Parse.
func init() {
	start = flag.String("url", "", "the start point to download full url")
	verbose = flag.Bool("v", false, "verbose mode")
	vverbose = flag.Bool("vv", false, "more verbose mode")
	silent = flag.Bool("s", false, "silence mode")
	help = flag.Bool("help", false, "print help page")
	useproxy = flag.Bool("p", false, "use proxy when fetching")
	timeout = flag.Int("T", 30, "short of timeout")
	outer = flag.Bool("outer", false, "allow fetching outer url(may spend many times)")
	flag.BoolVar(help, "h", false, "short of help")
	flag.IntVar(timeout, "timeout", 30, "timeout of request")
	target = make(targetmap)
	flag.Var(&target, "t", "target of each web page(in simple jquery), eg a[href], or img[class]@src."+
		"if target is a[href], then get element named a and use href as link."+
		" if target is img[class]@src, then webfetch will get element named img with class attribute,"+
		" but use attribute src as link, when suffix with"+
		" @_text, then use text of element as link rather than attribute, multiple target is permitted")
	domains = make(targetmap)
	// Fixed help-text typos: "allowd" -> "allowed", "multidomains" -> "multiple domains".
	flag.Var(&domains, "U", "allowed domains, multiple domains permitted")
	proxyurl = flag.String("proxy", "socks5://127.0.0.1:1080", "proxy for fetching")
	update = flag.Bool("u", true, "only update not existed files")
	ignoreerror = flag.Bool("I", true, "ignore error")
	showversion = flag.Bool("version", false, "show version")
}
func main() {
	flag.Parse()
	// -s silences both verbose levels; -vv implies -v.
	*verbose = *verbose && !*silent
	*vverbose = *vverbose && !*silent
	*verbose = *verbose || *vverbose
	// Targets every page is always scanned for, in addition to -t values.
	target["a[href]"] = true      //another webpage
	target["img[src]"] = true     // image
	target["link[href]"] = true   // might be css file
	target["script[src]"] = true  // js file
	if *help {
		log.Println(os.Args[0] + "(version " + version + "): a simple web fetcher, fetch contents starts from the given url path into local dir")
		flag.Usage()
		os.Exit(0)
	}
	if *showversion {
		if buildtime == "" {
			buildtime = "unknown"
		}
		fmt.Println(version, "build at:", buildtime)
		os.Exit(0)
	}
	if *start == "" {
		log.Fatalln("empty topname")
	}
	if *verbose {
		log.Println("Start with", *start)
	}
	starturl, err := url.Parse(*start)
	if err != nil {
		log.Fatalln(err)
	}
	toppathsplit := strings.Split(starturl.Path, "/")
	// Derive topname = host + directory part of the path; a final path
	// element containing a dot is treated as a file name and dropped.
	if len(toppathsplit) == 0 {
		topname = starturl.Host
	} else if !strings.Contains(toppathsplit[len(toppathsplit)-1], ".") {
		// the path is given as path
		topname = starturl.Host + strings.Join(toppathsplit, "/")
	} else {
		topname = starturl.Host + strings.Join(toppathsplit[:len(toppathsplit)-1], "/")
	}
	if *verbose {
		log.Println("topname:", topname)
	}
	// setting collector
	c := colly.NewCollector()
	c.Async = true
	c.SetRequestTimeout(time.Second * time.Duration(*timeout))
	extensions.RandomUserAgent(c)
	if *useproxy {
		// BUG FIX: a bad -proxy value used to be silently ignored and the
		// crawl ran without the requested proxy.
		if err := c.SetProxy(*proxyurl); err != nil {
			log.Fatalln("proxy:", err)
		}
	}
	//setting route
	domains[topname] = true
	c.OnRequest(func(r *colly.Request) {
		// if url fits one of the urls, then pass
		// TODO: algorithm needs improve!
		// BUG FIX: the old condition `Contains(...) && !*outer` meant that
		// enabling -outer aborted EVERY request; -outer now allows all URLs.
		ok := *outer
		for name := range domains {
			if strings.Contains(r.URL.String(), name) {
				ok = true
				break
			}
		}
		if !ok {
			log.Println("Aborting:", r.URL.String())
			r.Abort()
			return
		}
		if *verbose {
			log.Println("fetching", r.URL)
		}
	})
	c.OnResponse(func(r *colly.Response) {
		if *vverbose {
			log.Println(string(r.Body))
		}
		// Map the URL onto a local path mirroring host/path.
		fname := "./" + r.Request.URL.Host + r.Request.URL.Path
		if strings.HasSuffix(fname, "/") {
			fname = fname + "index.html"
		}
		if r.Request.URL.Path == "" {
			fname = fname + "/index.html"
		}
		if r.Request.URL.RawQuery != "" {
			fname = fname + "?" + r.Request.URL.RawQuery
		}
		// BUG FIX: os.Stat returns a nil error when the file exists, and
		// os.IsExist(nil) is false, so the old check never skipped anything.
		if _, statErr := os.Stat(fname); statErr == nil && *update {
			log.Println("Ignoring existed:", fname)
			return
		}
		if !*silent {
			log.Println("Downloading:", r.Request.URL.String(), "to", fname)
		}
		f, err := fopen(fname)
		if err != nil {
			if *ignoreerror {
				return
			}
			log.Fatalln("error:", err)
		}
		defer func() {
			if err := f.Close(); err != nil {
				fmt.Fprintln(os.Stderr, err)
			}
		}()
		// Show a progress bar when the server announced a Content-Length.
		reader := io.Reader(bytes.NewReader(r.Body))
		if cl := r.Headers.Get("Content-Length"); cl != "" {
			length, err := strconv.Atoi(cl)
			if err != nil {
				log.Fatalln(err)
			}
			bar := pb.StartNew(length)
			defer bar.Finish() // previously the bar was never finished
			reader = bar.NewProxyReader(reader)
		}
		// BUG FIX: the io.Copy error was silently discarded.
		if _, err := io.Copy(f, reader); err != nil {
			log.Println("write error:", err)
		}
	})
	// Register one OnHTML handler per target expression, e.g. "img[class]@src".
	re := regexp.MustCompile(`^(\w+\[(\w+)\])(@\w*)?`)
	for strtarget := range target {
		matches := re.FindStringSubmatch(strtarget)
		// BUG FIX: a malformed -t value used to nil-panic on matches[2].
		if matches == nil {
			log.Println("ignoring malformed target:", strtarget)
			continue
		}
		// Default link attribute is the one in brackets; an @attr suffix
		// overrides it, and @_text means "use the element text instead".
		link := matches[2]
		var text bool
		if strings.HasPrefix(matches[3], "@") {
			link = matches[3][1:]
			if matches[3] == "@_text" {
				text = true
			}
		}
		if *verbose {
			fmt.Print("Registering ", matches[1], " ", link)
			if text {
				log.Println(" Using text rather than attribute")
			} else {
				log.Println()
			}
		}
		c.OnHTML(matches[1], func(e *colly.HTMLElement) {
			var src string
			if text {
				src = e.Text
			} else {
				src = e.Attr(link)
			}
			if src == "" {
				fmt.Fprintln(os.Stderr, "Target empty")
				return
			}
			if *verbose {
				log.Println("Detecting:", src, "from", e.Request.URL.String())
			}
			if visited, err := e.Request.HasVisited(src); visited || err != nil {
				return
			}
			if err := e.Request.Visit(src); err != nil && *vverbose {
				fmt.Fprintln(os.Stderr, "Request error with", src, ":", err)
			}
		})
	}
	// Follow url(...) references found inside css bodies.
	cssre := regexp.MustCompile(`url\((.*?)\)`) // hoisted: compile once, not per response
	c.OnResponse(func(r *colly.Response) {
		if !strings.HasSuffix(r.Request.URL.Path, "css") {
			return
		}
		// BUG FIX: one recover for the whole handler instead of stacking
		// a deferred closure on every loop iteration.
		defer func() {
			if e := recover(); e != nil {
				log.Println(e)
			}
		}()
		for _, l := range cssre.FindAllStringSubmatch(string(r.Body), -1) {
			if *vverbose {
				log.Println("list of css url regexp:", l)
			}
			src := l[1]
			if *verbose {
				log.Println("Detecting url:", src, "from css", r.Request.URL.String())
			}
			// BUG FIX: was `return`, which abandoned every remaining css url
			// as soon as one already-visited url was seen.
			if visited, err := r.Request.HasVisited(src); visited || err != nil {
				continue
			}
			if err := r.Request.Visit(src); err != nil && *vverbose {
				fmt.Fprintln(os.Stderr, "Request error with", src, ":", err)
			}
		}
	})
	// BUG FIX: a failed initial visit used to exit silently with no output.
	if err := c.Visit(*start); err != nil {
		log.Fatalln(err)
	}
	c.Wait()
}
// fopen opens fpath for writing, creating every missing parent directory
// on the way. Special case: when a parent component already exists as a
// regular file (a page saved earlier without a trailing slash), that
// file's content is relocated to <dir>/index.html, the directory is
// created in its place, and the open is retried.
//
// Fixes: the os.OpenFile error for the index.html copy is now checked
// (a failure used to nil-panic on Write), the copied file handle is
// closed, and existing files are opened with O_TRUNC so a shorter
// re-download does not leave trailing bytes of the old content.
func fopen(fpath string) (fp *os.File, err error) {
	// not routing save, but ok for this
	fnamesplit := strings.Split(fpath, "/")
	dir := strings.Join(fnamesplit[:len(fnamesplit)-1], "/")
	fs, statErr := os.Stat(fpath)
	if statErr != nil {
		// Target does not exist (or cannot be stat'ed): ensure its directory.
		log.Println("making dir:", dir)
		if err = os.MkdirAll(dir, os.ModePerm); err != nil {
			// A path component exists as a plain file: move its content to
			// dir/index.html, rebuild the directory, then retry the open.
			buf, rerr := ioutil.ReadFile(dir)
			if rerr != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", rerr)
			}
			if err = os.RemoveAll(dir); err != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", err)
			}
			if err = os.MkdirAll(dir, os.ModePerm); err != nil {
				return nil, fmt.Errorf("remake dir %s failed: %s", dir, err)
			}
			idx, oerr := os.OpenFile(dir+"/index.html", os.O_WRONLY|os.O_CREATE, 0666)
			if oerr != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", oerr)
			}
			_, werr := idx.Write(buf)
			if cerr := idx.Close(); werr == nil {
				werr = cerr
			}
			if werr != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", werr)
			}
			return fopen(fpath)
		}
		return os.Create(fpath)
	}
	if fs.IsDir() {
		return nil, fmt.Errorf("file %s is a dir", fpath)
	}
	// O_TRUNC so re-downloaded content fully replaces the old file.
	return os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
}
Go
1
https://gitee.com/roberchen/webfetch.git
git@gitee.com:roberchen/webfetch.git
roberchen
webfetch
webfetch
5b43d920919e

搜索帮助