使用 Golang 下载 GitHub 仓库中的部分文件

在 GitHub 上，我们经常只需要下载仓库中某一个目录下的文件，而不是整个仓库。但 GitHub 提供给用户的下载功能并不能指定只下载某一个目录下的文件，于是我用 Golang 写了一个小工具，用来下载指定目录中的文件，下载之后的本地目录结构与仓库中的目录结构保持一致。

效果如下

代码如下，可见 github-dl-tool，欢迎 ⭐️

package main

import (
	"bytes"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"time"
)
// Base URLs of the GitHub services used by the tool.
const (
	// GITHUB is the site whose HTML directory listings are scraped.
	GITHUB = "https://github.com"
	// CONTENT is the host the individual files are downloaded from.
	CONTENT = "https://raw.githubusercontent.com"
)

var (
	// urlPattern captures, for every entry link of a listing page, the
	// title attribute (group 1) and the href attribute (group 2).
	urlPattern = regexp.MustCompile(`<a class="js-navigation-open link-gray-dark" title="(.*?)" href="(.*?)">`)
	// repositoryPattern splits a file href into the "/owner/repo/" prefix
	// (group 1) and everything that follows "blob/" (group 2).
	repositoryPattern = regexp.MustCompile(`(/.*?/.*?/)blob/(.*$)`)
)

// Command-line arguments; they are bound to flags in init.
var (
	url  string
	path string
)

func init(){
	flag.StringVar(&url, "url", "", "the url you want to grab")
	flag.StringVar(&path, "dl", "", "the path you want to save files")
}


// main parses the flags, downloads everything reachable from the given
// GitHub URL into the target directory and reports the elapsed time.
func main() {
	flag.Parse()

	// The URL is mandatory; without it there is nothing to do.
	if url == "" {
		fmt.Println("please specify the github url!")
		return
	}
	// Fall back to the repository name as the download directory.
	if path == "" {
		path = getRepositoryName(url)
	}

	var (
		client  http.Client
		pending sync.WaitGroup
	)
	begin := time.Now()

	dl(client, url, path, &pending)
	// dl only schedules the file downloads; wait for all of them to finish.
	pending.Wait()

	elapsed := time.Since(begin)
	fmt.Printf("total time: %.2f s\n", float64(elapsed)/float64(time.Second))
}

// get all file link and download it
func dl(client http.Client, url , path string, wg *sync.WaitGroup) {
	// if the path is not existed, then create it
	if !isExist(path){
		os.MkdirAll(path, 0775)
	}
	// get html source
	html, err := getHtml(client, url)
	if err != nil{
		fmt.Printf("get html error: %s", err.Error())
		return
	}
	// find all file and directory link
	links := urlPattern.FindAllSubmatch(html, -1)
	for _, link := range links{
		// if is directory, we can do it recursively
		if isDir(link[2]){
			dl(client, GITHUB + string(link[2]), filepath.Join(path, strings.SplitN(string(link[2]), "/", 6)[5]), wg)
		} else{
			// download it if it is file
			rep := repositoryPattern.FindSubmatch(link[2])
			// rep[1] is the repositoryPattern path
			// rep[2] is the file path in the repositoryPattern
			wg.Add(1)
			go downloadFile(client, CONTENT + string(rep[1])+string(rep[2]), path, string(link[1]), wg)
		}
	}
}

// downloadFile fetches fileURL and stores the response body as filename
// inside the directory path.
//
// It is meant to run as a goroutine: wg.Done is called when it returns, so
// the caller must have called wg.Add(1) beforehand. Progress and failures are
// reported on standard output. Nothing is written for a non-200 response, and
// a file that could not be written completely is removed again.
func downloadFile(client http.Client, fileURL, path, filename string, wg *sync.WaitGroup) {
	defer wg.Done()
	fmt.Println("start to download: ", filename)

	resp, err := client.Get(fileURL)
	if err != nil {
		fmt.Printf("download file %s failed due to: %s\n", filename, err.Error())
		return
	}
	defer resp.Body.Close()
	// The body of a non-200 response is an error page, not the file content.
	if resp.StatusCode != http.StatusOK {
		fmt.Printf("download file %s failed due to: %s\n", filename, resp.Status)
		return
	}

	// create the file
	target := filepath.Join(path, filename)
	file, err := os.Create(target)
	if err != nil {
		fmt.Printf("create file: %s error\n", filename)
		return
	}

	// write the response body to the file
	_, err = io.Copy(file, resp.Body)
	// Close before a possible Remove; a failed Close also means the data may
	// not have reached the disk.
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		fmt.Println("error: ", err)
		// if failed delete this file
		os.Remove(target)
		return
	}
	fmt.Println("finish download:", filename)
}

// getHtml performs a GET request for url with client and returns the whole
// response body.
//
// An error is returned when the request fails, when the server answers with
// a status other than 200 OK, or when the body cannot be read; the returned
// slice is nil in all of these cases.
func getHtml(client http.Client, url string) ([]byte, error) {
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	// An error page is not a directory listing; report it instead of parsing it.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("get %s: unexpected status %s", url, resp.Status)
	}
	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return data, nil
}

// isDir reports whether link, an href of the form "/owner/repo/kind/...",
// points to a directory, i.e. whether its third path segment is "tree".
//
// Only that segment is inspected, so owners, repositories, branches or files
// whose names merely contain "tree" are not mistaken for directories.
func isDir(link []byte) bool {
	// "/owner/repo/tree/rest" splits into "", "owner", "repo", "tree", "rest".
	segments := bytes.SplitN(link, []byte("/"), 5)
	return len(segments) >= 4 && len(segments[0]) == 0 && string(segments[3]) == "tree"
}

// isExist reports whether a file or directory is present at path. Only a
// "does not exist" error from os.Stat yields false; any other outcome,
// including other errors, yields true.
func isExist(path string) bool {
	_, statErr := os.Stat(path)
	return !os.IsNotExist(statErr)
}

// getRepositoryName extracts the repository name from a GitHub URL of the
// form "https://github.com/owner/repo" or "https://github.com/owner/repo/...".
//
// When the URL has too few "/" separators or does not match the expected
// form, it prints "url is wrong" and terminates the program with exit
// status -1.
func getRepositoryName(url string) string {
	var pattern *regexp.Regexp

	switch count := strings.Count(url, "/"); {
	case count > 4:
		// Something follows the repository name: stop at the next "/".
		pattern = regexp.MustCompile(`https://github\.com/.*?/(.*?)/`)
	case count == 4:
		// The URL ends with the repository name.
		pattern = regexp.MustCompile(`https://github\.com/.*?/(.*$)`)
	default:
		fmt.Println("url is wrong")
		os.Exit(-1)
	}

	name := pattern.FindStringSubmatch(url)
	// No match (e.g. a different scheme or host): fail cleanly instead of
	// panicking on the index below.
	if name == nil {
		fmt.Println("url is wrong")
		os.Exit(-1)
	}
	return name[1]
}