Spider / crawling
Golang crawler frameworks
- net/http
- Colly
- Goquery
import (
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
)
func fetch(url string) string {
client := &http.Client{}
req, _ := http.NewRequest("GET", url, nil)
req.Header.Set("User-Agent", "Mozilla........")
req.Header.Add("Cookie", "_gads=..........")
resp, err := client.Do(req)
if err != nil {
fmt.Println("Http get err:", err)
return ""
}
if resp.StatusCode != 200 {
fmt.Println("Http status code:", resp.StatusCode)
return ""
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("Read error", err)
return ""
}
return string(body)
}
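One caveat: &http.Client{} has no timeout, so a single hung server can stall the crawler indefinitely. A minimal sketch adding one (the 10-second value is an arbitrary choice; "time" must be imported):

client := &http.Client{
    Timeout: 10 * time.Second, // abort any request that runs longer than this
}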
Parsing the content
func parse(html string) {
html = strings.Replace(html, "\n", "", -1)
re_sidebar := regexp.MustCompile(`<aside id="sidebar" role="navigation">(.*?)</aside>`)
// pull out the sidebar block
sidebar := re_sidebar.FindString(html)
// match every link inside it
re_link := regexp.MustCompile(`href="(.*?)">`)
// collect all href matches
links := re_link.FindAllString(sidebar, -1)
base_url := "https://gorm.io/zh_CN/docs/"
for _, v := range links {
s := v[6 : len(v)-1]
url := base_url + s
fmt.Printf("url: %v\n", url)
body := fetch(url)
go parse2(body)
}
}
func parse2(body string) {
body = strings.Replace(body, "\n", "", -1)
re_content := regexp.MustCompile(`<div class="article">(.*?)</div>`)
content := re_content.FindString(body)
// fmt.Printf("content: %v\n", content)
re_title := regexp.MustCompile(`<h1 class="article-title" itemprop="name">(.*?)</h1>`)
title := re_title.FindString(content)
fmt.Printf("title: %v\n", title)
// strip the opening <h1 ...> tag (42 chars) and the closing </h1> (5 chars)
title = title[42 : len(title)-5]
fmt.Printf("title: %v\n", title)
}
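Because parse launches parse2 in goroutines, the program can exit before they finish. A minimal sketch that waits for them with sync.WaitGroup (the package-level wg variable is an assumption, not part of the original notes):

var wg sync.WaitGroup // assumed shared WaitGroup; import "sync"

// in parse, instead of `go parse2(body)`:
wg.Add(1)
go func(b string) {
    defer wg.Done()
    parse2(b)
}(body)

// after the loop in parse (or in main):
wg.Wait()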
func main() {
url := "https://gorm.io/zh_CN/docs/"
body := fetch(url)
parse(body)
}
Saving to local files
// "io/ioutil"
func save(title string, content string) {
err := os.WriteFile("./"+title+".html", []byte(content), 0644)
if err != nil {
panic(err)
}
}
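With save in place, parse2 can end by writing each page to disk, e.g.:

// at the end of parse2, once title and content are extracted:
save(title, content)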
Saving to a database
The xorm framework
go mod init learn_craw
go get xorm.io/xorm
go get github.com/go-sql-driver/mysql
var engine *xorm.Engine
var err error
func init() { // init runs automatically, before main
engine, err = xorm.NewEngine("mysql", "root:123456@/test_xorm?charset=utf8")
if err != nil {
fmt.Printf("err: %v\n", err)
} else {
err2 := engine.Ping()
if err2 != nil {
fmt.Printf("err2: %v\n", err2)
} else {
print("连接成功")
}
}
}
type GormPage struct {
Id int64
Title string
Content string `xorm:"text"`
Created time.Time `xorm:"created"`
Updated time.Time `xorm:"updated"`
}
func savetoDB(title string, content string) {
engine.Sync(new(GormPage)) // create/update the gorm_page table to match the struct
page := GormPage{
Title: title,
Content: content,
}
affected, err := engine.Insert(&page)
if err != nil {
fmt.Printf("err: %v\n", err)
}
fmt.Printf("save: %d row(s)\n", affected)
}
// mysql
mysql -uroot -p
use test_xorm;
show tables;
drop table gorm_page;
select * from gorm_page;
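The same check can be done from Go; a minimal sketch reading the rows back with xorm's Find (the loadFromDB name is hypothetical):

func loadFromDB() {
    var pages []GormPage
    // Find fills the slice with every row of gorm_page
    if err := engine.Find(&pages); err != nil {
        fmt.Printf("err: %v\n", err)
        return
    }
    for _, p := range pages {
        fmt.Printf("id=%d title=%s\n", p.Id, p.Title)
    }
}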
Using goquery
Quickly parses a page and extracts content
Colly is also built on top of goquery
go get -u github.com/PuerkitoBio/goquery
package main
import (
"fmt"
"log"
"github. com/PuerkitoBio/goquery"
)
func main() {
url := "https://gorm.io/zh_CN/docs/"
dom, err := goquery.NewDocument(url)
if err != nil {
log.Fatalln(err)
}
dom.Find(".sidebar-link").Each(func(i int, s *goquery.Selection) {
href, _ := s.Attr("href")
text := s.Text()
fmt.Println(i, href, text)
})
}
// other document constructors
NewDocument(url)
NewDocumentFromResponse(resp)
func getDoc2() {
client := &http.Client{}
url := "https://gorm.io/zh_CN/docs/"
req, _ := http.NewRequest("GET", url, nil)
resp, err := client.Do(req)
if err != nil {
log.Fatalln(err)
}
dom, err := goquery.NewDocumentFromResponse(resp)
if err != nil {
log.Fatalln(err)
}
dom.Find("")
}
NewDocumentFromReader(reader)
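Newer goquery releases mark NewDocument and NewDocumentFromResponse as deprecated in favor of NewDocumentFromReader, which accepts any io.Reader. A sketch (the getDoc3 name is hypothetical):

func getDoc3() {
    resp, err := http.Get("https://gorm.io/zh_CN/docs/")
    if err != nil {
        log.Fatalln(err)
    }
    defer resp.Body.Close()
    // resp.Body is an io.Reader, so it can be handed to goquery directly
    dom, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        log.Fatalln(err)
    }
    fmt.Println(dom.Find("title").Text())
}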
Tag selector
ID selector
Class selector
.className
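All three go through the same Find call; a quick sketch against the page above (the #sidebar id comes from the regex example earlier):

dom.Find("aside")         // tag selector
dom.Find("#sidebar")      // ID selector
dom.Find(".sidebar-link") // class selector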
Using Selection selectors
package main
import (
"fmt"
"log"
"github.com/PuerkitoBio/goquery"
)
func main() {
url := "https://gorm.io/zh_CN/docs/"
d, err := goquery.NewDocument(url)
if err != nil {
log.Fatalln(err)
}
d.Find(".sidebar-link").Each(func(i int, s *goquery.Selection) {
link, _ := s.Attr("href")
base_url := "https://gorm.io/zh_CN/docs/"
detail_url := base_url + link
fmt.Printf("detail_url: %v\n", detail_url)
d, _ = goquery.NewDocument(detail_url)
title := d.Find(".article-title").Text()
content, _ := d.Find(".article").Html()
fmt.Printf("title: %v\n", title)
fmt.Printf("content: %v\n", content[0:23])
})
}
The Colly crawler framework
go get -u github.com/gocolly/colly
package main
import "github.com/gocolly/colly"
func main() {
c := colly.NewCollector()
// goquery selector class
c.OnHTML(".sidebar-link", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("url", r.URL)
})
// note the order: register callbacks first, then call Visit
c.Visit("https://gorm.io/zh_CN/docs/")
}
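NewCollector also accepts functional options; a minimal sketch that keeps the crawl on one domain and sets a User-Agent (the option values are examples):

c := colly.NewCollector(
    colly.AllowedDomains("gorm.io"),             // drop links that leave the site
    colly.UserAgent("Mozilla/5.0 (learn_craw)"), // same idea as the net/http header earlier
)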
Colly callbacks
func main() {
c := colly.NewCollector()
c.OnRequest(func(r *colly.Request) {
fmt.Println("before request: OnRequest")
})
c.OnError(func(_ *colly.Response, err error) {
fmt.Println("on error: OnError")
})
c.OnResponse(func(r *colly.Response) {
fmt.Println("got a response: OnResponse")
})
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
fmt.Println("after getting the page: OnHTML")
})
c.OnXML("//h1", func(e *colly.XMLElement) {
fmt.Println("OnResponse for XML: OnXML")
})
c.OnScraped(func(r *colly.Response) {
fmt.Println("结束", r.Request.URL)
})
c.Visit("https://gorm.io/zh_CN/docs/")
}
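For a successful request the callbacks fire in order: OnRequest → OnResponse → OnHTML → OnXML → OnScraped; OnError runs instead of the later callbacks when the request fails.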