Foreword

Sometimes you come across a well-written WeChat official account post or blog article that you want to repost or save to your personal notes. If the article contains many images, you end up copying them out one by one. To make this easier, I spent a little time writing a small tool, and I hope it helps.

Code Structure
package main
import (
"context"
"flag"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/axgle/mahonia"
"github.com/chromedp/chromedp"
"github.com/gofrs/uuid"
"github.com/siddontang/go-log/log"
"io/ioutil"
"net/http"
"os"
"strings"
"time"
)
var configFile string
var pathMap map[string]string
var sourceContent string
var projectPath string
func main() {
	dir, _ := os.Getwd()
	fmt.Println(dir)
	projectPath = dir
	configFile = "./conf/download.conf"
	// download.conf holds the directory that images are saved into
	content, err := ioutil.ReadFile(configFile)
	if err != nil {
		return
	}
	// source.conf holds the article whose image paths will be rewritten
	byteContent, err := ioutil.ReadFile("./conf/source.conf")
	if err != nil {
		return
	}
	sourceContent = string(byteContent)
	path := string(content)
	// maps an original image URL to its local replacement path
	pathMap = make(map[string]string)
	ctxt, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	// bound the whole crawl with a timeout
	ctxt, cancelTimeout := context.WithTimeout(ctxt, 15*time.Second)
	defer cancelTimeout()
	var res string
	site := "https://ethfans.org/posts/wtf-is-the-blockchain"
	//site := "https://mp.weixin.qq.com/s/qnceG5MVwlFcm1FaHSKgWA"
	//downloadWebChat(ctxt, site, res, path)
	downloadSimpleHtml(ctxt, site, res, path)
}
func downloadSimpleHtml(ctxt context.Context, site string, res string, path string) {
	err := chromedp.Run(ctxt, visibleSimpleHtml(site, &res))
	if err != nil {
		return
	}
	fmt.Println("===============" + res)
	reader := strings.NewReader(res)
	// decode the page as UTF-8
	dec := mahonia.NewDecoder("utf-8")
	rd := dec.NewReader(reader)
	doc, err := goquery.NewDocumentFromReader(rd)
	if err != nil {
		log.Fatal(err)
	}
	html, err := doc.Html()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("=========" + html)
	channels := make(chan string)
	doc.Find("img").Each(func(i int, selection *goquery.Selection) {
		imgURL, _ := selection.Attr("src")
		fmt.Println("imgUrl:" + imgURL)
		if strings.TrimSpace(imgURL) == "" {
			return
		}
		// one goroutine per image; receiving right away keeps downloads sequential
		go download(imgURL, channels, path)
		fmt.Println("src = " + <-channels + " image fetched")
	})
	// once everything is downloaded, replace the image paths in the article
	for originalPath, newPath := range pathMap {
		sourceContent = strings.Replace(sourceContent, originalPath, newPath, -1)
	}
	dir, _ := os.Getwd()
	fmt.Println(dir)
	// write out the rewritten article
	fmt.Println(sourceContent)
	er := ioutil.WriteFile(projectPath+"/conf/outputs.conf", []byte(sourceContent), 0644)
	if er != nil {
		log.Error(er)
	}
}
func downloadWebChat(ctxt context.Context, site string, res string, path string) {
	err := chromedp.Run(ctxt, visible(site, &res))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("===========" + res)
	reader := strings.NewReader(res)
	dec := mahonia.NewDecoder("utf-8")
	rd := dec.NewReader(reader)
	// parse the fetched HTML document
	doc, err := goquery.NewDocumentFromReader(rd)
	if err != nil {
		log.Fatal(err)
	}
	html, err := doc.Html()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("=====" + html)
	// channel used to wait for each download
	channels := make(chan string)
	doc.Find("img").Each(func(i int, selection *goquery.Selection) {
		// WeChat articles lazy-load images, so the real URL lives in data-src
		imgURL, _ := selection.Attr("data-src")
		fmt.Println("imgUrl:" + imgURL)
		if strings.TrimSpace(imgURL) == "" {
			return
		}
		if !strings.Contains(imgURL, "https") {
			return
		}
		// strip the query string, if any
		if index := strings.Index(imgURL, "?"); index != -1 {
			imgURL = imgURL[:index]
		}
		go download(imgURL, channels, path)
		// consume from the channel so downloads run one at a time
		fmt.Println("src = " + <-channels + " image fetched")
	})
}
func visibleSimpleHtml(host string, res *string) chromedp.Tasks {
	return chromedp.Tasks{
		chromedp.Navigate(host),
		chromedp.Sleep(3 * time.Second), // crude wait for the page to render
		chromedp.InnerHTML("body", res, chromedp.ByQuery),
	}
}
// initCmd parses the -config flag; main currently hard-codes the path instead
func initCmd() {
	flag.StringVar(&configFile, "config", "./conf/download.conf", "where download.conf is.")
	flag.Parse()
}
func download(imgURL string, channels chan string, path string) {
	fmt.Println("about to fetch: " + imgURL)
	uid, _ := uuid.NewV4()
	fileName := uid.String() + ".png"
	// make sure the target directory exists
	exists, err := PathExists(path)
	if err != nil {
		fmt.Printf("get dir error![%v]\n", err)
		channels <- " " + "sourceUrl is:" + imgURL
		return
	}
	if !exists {
		os.MkdirAll(path, os.ModePerm)
	}
	f, err := os.Create(filepath.Join(path, fileName))
	if err != nil {
		log.Panic("failed to create file")
	}
	defer f.Close()
	resp, err := http.Get(imgURL)
	if err != nil {
		fmt.Println("http.get err", err)
		channels <- " " + "sourceUrl is:" + imgURL
		return
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("failed to read response body")
	}
	f.Write(body)
	// remember which local path should replace the original URL
	pathMap[imgURL] = "img/" + fileName
	// signal completion so the caller can move on
	channels <- " " + "sourceUrl is:" + imgURL
}
func PathExists(path string) (bool, error) {
	_, err := os.Stat(path)
	if err == nil {
		return true, nil
	}
	if os.IsNotExist(err) {
		return false, nil
	}
	return false, err
}
/**
Select which region of the page's HTML to capture
*/
func visible(host string, res *string) chromedp.Tasks {
	sel := "#page-content"
	return chromedp.Tasks{
		chromedp.Navigate(host),
		chromedp.Sleep(3 * time.Second), // crude wait for the page to render
		chromedp.InnerHTML(sel, res, chromedp.ByQuery),
	}
}
Building the Code

Run main.go directly, or use go build to produce a Windows .exe, or build a downloadPic binary on Linux and run that.
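For example, from the module root (downloadPic is just the binary name used above):

go run main.go
go build -o downloadPic .                                 # Linux binary
GOOS=windows GOARCH=amd64 go build -o downloadPic.exe .   # cross-compile for Windows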
Code Analysis

The code logic breaks down into the following steps (sketches of the config files and of a fully concurrent variant follow this list):
- Parse the configuration, chiefly the directory that images are downloaded into; if an article's image paths need to be rewritten, the article itself goes into conf/source.conf;
- Open the URL and parse the page the browser returns, which mostly comes down to using the chromedp package;
- Extract the path from each img src in the page and hand each one to a goroutine; since an article is never very long, all the downloads could just as well run on the current thread;
- Each goroutine downloads its image and records the original image path and its replacement in a map;
- Once all goroutines have finished, replace the image paths in the article.
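Both config files are plain text; roughly (the values here are illustrative, not from the repo):

conf/download.conf — a single line naming the directory images are saved into, e.g. D:\notes\img
conf/source.conf   — the full text of the article whose image URLs should be rewritten; the rewritten copy is written to conf/outputs.conf

As the third step notes, the tool receives from the channel right after starting each goroutine, so downloads effectively run one at a time. A minimal sketch of a truly concurrent variant, using sync.WaitGroup plus a mutex to guard pathMap; downloadAll and fetchImage are illustrative names, not part of the tool, and "io" and "sync" would need to be imported:

func downloadAll(urls []string, dir string) {
	var wg sync.WaitGroup
	var mu sync.Mutex
	for _, u := range urls {
		wg.Add(1)
		go func(imgURL string) {
			defer wg.Done()
			local, err := fetchImage(imgURL, dir)
			if err != nil {
				return
			}
			// several goroutines write to pathMap now, so serialize access
			mu.Lock()
			pathMap[imgURL] = local
			mu.Unlock()
		}(u)
	}
	wg.Wait() // every image is on disk once this returns
}

// fetchImage saves one image under dir and returns the relative path
// to substitute into the article.
func fetchImage(imgURL, dir string) (string, error) {
	resp, err := http.Get(imgURL)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	uid, _ := uuid.NewV4()
	name := uid.String() + ".png"
	f, err := os.Create(filepath.Join(dir, name))
	if err != nil {
		return "", err
	}
	defer f.Close()
	if _, err := io.Copy(f, resp.Body); err != nil {
		return "", err
	}
	return "img/" + name, nil
}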
An example of using the chromedp package:
// Task: set a cookie so that pages behind a login can be fetched
// (adapted to the chromedp.NewContext-style API used above)
func visitWeb(url string) chromedp.Tasks {
	return chromedp.Tasks{
		chromedp.ActionFunc(func(ctxt context.Context) error {
			expr := cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
			success, err := network.SetCookie("ASP.NET_SessionId", "cookie value here"). // set the cookie
				WithExpires(&expr).
				WithDomain("dl.gaggjz.pw:8086"). // domain being visited
				WithHTTPOnly(true).
				Do(ctxt) // note: newer cdproto versions drop the success return value
			if err != nil {
				return err
			}
			if !success {
				return errors.New("could not set cookie")
			}
			return nil
		}),
		chromedp.Navigate(url), // navigate to the page
	}
}
// Task: click through to the next page and grab its HTML
func DoCrawler(res *string) chromedp.Tasks {
	return chromedp.Tasks{
		chromedp.Sleep(1 * time.Second), // wait
		chromedp.WaitVisible(`#form1`, chromedp.ByQuery), // wait until #form1 is visible; ByQuery uses a DOM selector
		chromedp.Sleep(1 * time.Second),
		chromedp.Click(`.pagination li:nth-last-child(4) a`, chromedp.ByQuery), // click to the next page
		chromedp.OuterHTML(`tbody`, res, chromedp.ByQuery), // grab the tbody's outer HTML
	}
}
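Wiring the two tasks together looks roughly like this (crawl is a made-up wrapper name; chromedp.Run simply executes the actions in order):

func crawl(url string) (string, error) {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	var res string
	// set the login cookie and navigate, then page through and capture the table
	if err := chromedp.Run(ctx, visitWeb(url), DoCrawler(&res)); err != nil {
		return "", err
	}
	return res, nil
}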