1 Star 0 Fork 1

roberChen / webfetch

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
fetch.go 8.18 KB
一键复制 编辑 原始数据 按行查看 历史
roberChen 提交于 2020-09-03 13:23 . Solve css url abstract bug. Add version
package main
import (
"bytes"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"time"
"github.com/cheggaaa/pb/v3"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/extensions"
)
// Package-level state. version and buildtime are expected to be injected
// at link time (e.g. -ldflags "-X main.version=..."); the rest are bound
// to command-line flags in init and read throughout main.
var (
version string // release version string (set via -ldflags; may be empty)
buildtime string // build timestamp (set via -ldflags; may be empty)
topname string // host + directory prefix of the start URL; crawl stays under it unless -outer
help *bool // -help / -h: print usage page and exit
verbose *bool // -v: verbose logging
vverbose *bool // -vv: even more verbose logging (implies -v)
silent *bool // -s: silence mode, suppresses both verbose levels
topnsplit []string // NOTE(review): declared but never referenced in this file — possibly dead
start *string // -url: start point of the crawl
proxyurl *string // -proxy: proxy URL, used only when -p is given
useproxy *bool // -p: route requests through proxyurl
timeout *int // -T / -timeout: request timeout in seconds
outer *bool // -outer: allow fetching URLs outside the allowed domains
target targetmap // -t: element[attr] link targets scanned on each page
domains targetmap // -U: allowed domains set
update *bool // -u: skip files that already exist locally
ignoreerror *bool // -I: ignore per-file download errors instead of aborting
showversion *bool // -version: print version info and exit
)
// targetmap is a string set that satisfies flag.Value, letting one flag
// (-t, -U) be repeated on the command line to accumulate entries.
type targetmap map[string]bool

// String renders the set the way flag prints default values.
func (t targetmap) String() string {
	return fmt.Sprintf("%v\n", map[string]bool(t))
}

// Set records a single flag occurrence as a member; it never fails.
func (t targetmap) Set(s string) error {
	t[s] = true
	return nil
}
// init registers every command-line flag and allocates the flag-backed
// set variables before main calls flag.Parse.
func init() {
	start = flag.String("url", "", "the start point to download full url")
	verbose = flag.Bool("v", false, "verbose mode")
	vverbose = flag.Bool("vv", false, "more verbose mode")
	silent = flag.Bool("s", false, "silence mode")
	help = flag.Bool("help", false, "print help page")
	useproxy = flag.Bool("p", false, "use proxy when fetching")
	timeout = flag.Int("T", 30, "short of timeout")
	outer = flag.Bool("outer", false, "allow fetching outer url(may spend many times)")
	flag.BoolVar(help, "h", false, "short of help")
	flag.IntVar(timeout, "timeout", 30, "timeout of request")
	target = make(targetmap)
	flag.Var(&target, "t", "target of each web page(in simple jquery), eg a[href], or img[class]@src."+
		"if target is a[href], then get element named a and use href as link."+
		" if target is img[class]@src, then webfetch will get element named img with class attribute,"+
		" but use attribute src as link, when suffix with"+
		" @_text, then use text of element as link rather than attribute, multiple target is permitted")
	domains = make(targetmap)
	// Fixed help-text typos: "allowd" -> "allowed", "multidomains" -> "multiple domains".
	flag.Var(&domains, "U", "allowed domains, multiple domains permitted")
	proxyurl = flag.String("proxy", "socks5://127.0.0.1:1080", "proxy for fetching")
	update = flag.Bool("u", true, "only update not existed files")
	ignoreerror = flag.Bool("I", true, "ignore error")
	showversion = flag.Bool("version", false, "show version")
}
func main() {
	flag.Parse()
	// -s silences both verbose levels; -vv implies -v.
	*verbose = *verbose && !*silent
	*vverbose = *vverbose && !*silent
	*verbose = *verbose || *vverbose
	// Targets every page is always scanned for, in addition to -t values.
	target["a[href]"] = true      //another webpage
	target["img[src]"] = true     // image
	target["link[href]"] = true   // might be css file
	target["script[src]"] = true  // js file
	if *help {
		log.Println(os.Args[0] + "(version " + version + "): a simple web fetcher, fetch contents starts from the given url path into local dir")
		flag.Usage()
		os.Exit(0)
	}
	if *showversion {
		if buildtime == "" {
			buildtime = "unknown"
		}
		fmt.Println(version, "build at:", buildtime)
		os.Exit(0)
	}
	if *start == "" {
		log.Fatalln("empty topname")
	}
	if *verbose {
		log.Println("Start with", *start)
	}
	starturl, err := url.Parse(*start)
	if err != nil {
		log.Fatalln(err)
	}
	toppathsplit := strings.Split(starturl.Path, "/")
	// Derive topname = host + directory part of the path; a final path
	// element containing a dot is treated as a file name and dropped.
	if len(toppathsplit) == 0 {
		topname = starturl.Host
	} else if !strings.Contains(toppathsplit[len(toppathsplit)-1], ".") {
		// the path is given as path
		topname = starturl.Host + strings.Join(toppathsplit, "/")
	} else {
		topname = starturl.Host + strings.Join(toppathsplit[:len(toppathsplit)-1], "/")
	}
	if *verbose {
		log.Println("topname:", topname)
	}
	// setting collector
	c := colly.NewCollector()
	c.Async = true
	c.SetRequestTimeout(time.Second * time.Duration(*timeout))
	extensions.RandomUserAgent(c)
	if *useproxy {
		// BUG FIX: a bad -proxy value used to be silently ignored and the
		// crawl ran without the requested proxy.
		if err := c.SetProxy(*proxyurl); err != nil {
			log.Fatalln("proxy:", err)
		}
	}
	//setting route
	domains[topname] = true
	c.OnRequest(func(r *colly.Request) {
		// if url fits one of the urls, then pass
		// TODO: algorithm needs improve!
		// BUG FIX: the old condition `Contains(...) && !*outer` meant that
		// enabling -outer aborted EVERY request; -outer now allows all URLs.
		ok := *outer
		for name := range domains {
			if strings.Contains(r.URL.String(), name) {
				ok = true
				break
			}
		}
		if !ok {
			log.Println("Aborting:", r.URL.String())
			r.Abort()
			return
		}
		if *verbose {
			log.Println("fetching", r.URL)
		}
	})
	c.OnResponse(func(r *colly.Response) {
		if *vverbose {
			log.Println(string(r.Body))
		}
		// Map the URL onto a local path mirroring host/path.
		fname := "./" + r.Request.URL.Host + r.Request.URL.Path
		if strings.HasSuffix(fname, "/") {
			fname = fname + "index.html"
		}
		if r.Request.URL.Path == "" {
			fname = fname + "/index.html"
		}
		if r.Request.URL.RawQuery != "" {
			fname = fname + "?" + r.Request.URL.RawQuery
		}
		// BUG FIX: os.Stat returns a nil error when the file exists, and
		// os.IsExist(nil) is false, so the old check never skipped anything.
		if _, statErr := os.Stat(fname); statErr == nil && *update {
			log.Println("Ignoring existed:", fname)
			return
		}
		if !*silent {
			log.Println("Downloading:", r.Request.URL.String(), "to", fname)
		}
		f, err := fopen(fname)
		if err != nil {
			if *ignoreerror {
				return
			}
			log.Fatalln("error:", err)
		}
		defer func() {
			if err := f.Close(); err != nil {
				fmt.Fprintln(os.Stderr, err)
			}
		}()
		// Show a progress bar when the server announced a Content-Length.
		reader := io.Reader(bytes.NewReader(r.Body))
		if cl := r.Headers.Get("Content-Length"); cl != "" {
			length, err := strconv.Atoi(cl)
			if err != nil {
				log.Fatalln(err)
			}
			bar := pb.StartNew(length)
			defer bar.Finish() // previously the bar was never finished
			reader = bar.NewProxyReader(reader)
		}
		// BUG FIX: the io.Copy error was silently discarded.
		if _, err := io.Copy(f, reader); err != nil {
			log.Println("write error:", err)
		}
	})
	// Register one OnHTML handler per target expression, e.g. "img[class]@src".
	re := regexp.MustCompile(`^(\w+\[(\w+)\])(@\w*)?`)
	for strtarget := range target {
		matches := re.FindStringSubmatch(strtarget)
		// BUG FIX: a malformed -t value used to nil-panic on matches[2].
		if matches == nil {
			log.Println("ignoring malformed target:", strtarget)
			continue
		}
		// Default link attribute is the one in brackets; an @attr suffix
		// overrides it, and @_text means "use the element text instead".
		link := matches[2]
		var text bool
		if strings.HasPrefix(matches[3], "@") {
			link = matches[3][1:]
			if matches[3] == "@_text" {
				text = true
			}
		}
		if *verbose {
			fmt.Print("Registering ", matches[1], " ", link)
			if text {
				log.Println(" Using text rather than attribute")
			} else {
				log.Println()
			}
		}
		c.OnHTML(matches[1], func(e *colly.HTMLElement) {
			var src string
			if text {
				src = e.Text
			} else {
				src = e.Attr(link)
			}
			if src == "" {
				fmt.Fprintln(os.Stderr, "Target empty")
				return
			}
			if *verbose {
				log.Println("Detecting:", src, "from", e.Request.URL.String())
			}
			if visited, err := e.Request.HasVisited(src); visited || err != nil {
				return
			}
			if err := e.Request.Visit(src); err != nil && *vverbose {
				fmt.Fprintln(os.Stderr, "Request error with", src, ":", err)
			}
		})
	}
	// Follow url(...) references found inside css bodies.
	cssre := regexp.MustCompile(`url\((.*?)\)`) // hoisted: compile once, not per response
	c.OnResponse(func(r *colly.Response) {
		if !strings.HasSuffix(r.Request.URL.Path, "css") {
			return
		}
		// BUG FIX: one recover for the whole handler instead of stacking
		// a deferred closure on every loop iteration.
		defer func() {
			if e := recover(); e != nil {
				log.Println(e)
			}
		}()
		for _, l := range cssre.FindAllStringSubmatch(string(r.Body), -1) {
			if *vverbose {
				log.Println("list of css url regexp:", l)
			}
			src := l[1]
			if *verbose {
				log.Println("Detecting url:", src, "from css", r.Request.URL.String())
			}
			// BUG FIX: was `return`, which abandoned every remaining css url
			// as soon as one already-visited url was seen.
			if visited, err := r.Request.HasVisited(src); visited || err != nil {
				continue
			}
			if err := r.Request.Visit(src); err != nil && *vverbose {
				fmt.Fprintln(os.Stderr, "Request error with", src, ":", err)
			}
		}
	})
	// BUG FIX: a failed initial visit used to exit silently with no output.
	if err := c.Visit(*start); err != nil {
		log.Fatalln(err)
	}
	c.Wait()
}
// fopen opens fpath for writing, creating every missing parent directory
// on the way. Special case: when a parent component already exists as a
// regular file (a page saved earlier without a trailing slash), that
// file's content is relocated to <dir>/index.html, the directory is
// created in its place, and the open is retried.
//
// Fixes: the os.OpenFile error for the index.html copy is now checked
// (a failure used to nil-panic on Write), the copied file handle is
// closed, and existing files are opened with O_TRUNC so a shorter
// re-download does not leave trailing bytes of the old content.
func fopen(fpath string) (fp *os.File, err error) {
	// not routing save, but ok for this
	fnamesplit := strings.Split(fpath, "/")
	dir := strings.Join(fnamesplit[:len(fnamesplit)-1], "/")
	fs, statErr := os.Stat(fpath)
	if statErr != nil {
		// Target does not exist (or cannot be stat'ed): ensure its directory.
		log.Println("making dir:", dir)
		if err = os.MkdirAll(dir, os.ModePerm); err != nil {
			// A path component exists as a plain file: move its content to
			// dir/index.html, rebuild the directory, then retry the open.
			buf, rerr := ioutil.ReadFile(dir)
			if rerr != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", rerr)
			}
			if err = os.RemoveAll(dir); err != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", err)
			}
			if err = os.MkdirAll(dir, os.ModePerm); err != nil {
				return nil, fmt.Errorf("remake dir %s failed: %s", dir, err)
			}
			idx, oerr := os.OpenFile(dir+"/index.html", os.O_WRONLY|os.O_CREATE, 0666)
			if oerr != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", oerr)
			}
			_, werr := idx.Write(buf)
			if cerr := idx.Close(); werr == nil {
				werr = cerr
			}
			if werr != nil {
				return nil, fmt.Errorf("copy file to /index.html failed: %s", werr)
			}
			return fopen(fpath)
		}
		return os.Create(fpath)
	}
	if fs.IsDir() {
		return nil, fmt.Errorf("file %s is a dir", fpath)
	}
	// O_TRUNC so re-downloaded content fully replaces the old file.
	return os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
}
Go
1
https://gitee.com/roberchen/webfetch.git
git@gitee.com:roberchen/webfetch.git
roberchen
webfetch
webfetch
5b43d920919e

搜索帮助