正文

Colly 是 Go 语言的爬虫框架,简单快速,适合在日常工作中抓取数据。


安装

go get -u github.com/gocolly/colly/...

示例1

package main


import (

"fmt"

"time"


"github.com/gocolly/colly"

)


func main() {

ua := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"

c := colly.NewCollector(

colly.UserAgent(ua),                      // 设置UA

colly.DetectCharset(),                    // 自动编码,防止乱码

colly.AllowedDomains("www.tcmap.com.cn"), // 限制域名

)

c.AllowURLRevisit = true                  // 另外一种设置方式,允许重复访问

_ = c.SetProxy("socks://127.0.0.1:10808") // 设置代理


// 响应内容是HTML时调用,goquerySelector来查找元素

c.OnHTML("a[href*=\"shandong\"]", func(h *colly.HTMLElement) {

// fmt.Println(h.Text)

href := h.Request.AbsoluteURL(h.Attr("href")) // 绝对路径

_ = h.Request.Visit(href)

// 接收上下文传递过来的数据

city := h.Response.Ctx.Get("city")

fmt.Println(city)

})


_ = c.Limit(&colly.LimitRule{

DomainGlob:  "*",

RandomDelay: 1 * time.Second, // 延时

})


// 请求前调用

c.OnRequest(func(r *colly.Request) {

fmt.Println("访问:", r.URL)

// 从请求往响应传递上下文数据

r.Ctx.Put("city", "城市")

})


// 收到响应后调用

c.OnResponse(func(r *colly.Response) {

// fmt.Println(string(r.Body))

})


// 通过xpath来获取元素

c.OnXML("//", func(element *colly.XMLElement) {


})


// 请求发生错误时调用

c.OnError(func(r *colly.Response, err error) {

fmt.Println(err)

})


c.Visit("http://www.tcmap.com.cn/shandong/")

}


示例2

package main


import (

"fmt"

"github.com/gocolly/colly"

"gorm.io/driver/mysql"

"gorm.io/gorm"

"time"

)


func main() {

dsn := "root:pass@tcp(127.0.0.1:3306)/test?charset=utf8mb4&parseTime=True&loc=Local"

db, err := gorm.Open(mysql.New(mysql.Config{

DSN:               dsn,

DefaultStringSize: 256,

}), &gorm.Config{})

if err != nil {

fmt.Println("连结数据库失败")

}


ua := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"

c := colly.NewCollector(

colly.UserAgent(ua),                      // 设置UA

colly.DetectCharset(),                    // 自动编码,防止乱码

colly.AllowedDomains("www.tcmap.com.cn"), // 限制域名

)

cityCollector := c.Clone()

countyCollector := c.Clone()

townCollector := c.Clone()


// 省 http://www.tcmap.com.cn/shandong/

c.OnHTML("#pagebody #page_left > table", func(element *colly.HTMLElement) {

element.ForEach("tr td:first-child", func(i int, e *colly.HTMLElement) {

city := e.ChildText("a")

fmt.Println(city)

relative_url := e.ChildAttr("a", "href")

if relative_url != "" {

absURL := e.Request.AbsoluteURL(relative_url)

// fmt.Println(absURL)

ctx := colly.NewContext()

ctx.Put("city", city)

_ = cityCollector.Request("GET", absURL, nil, ctx, nil)

}

})

})


// 市 http://www.tcmap.com.cn/shandong/jinan.html

cityCollector.OnHTML("#pagebody #page_left > table", func(element *colly.HTMLElement) {

city := element.Request.Ctx.Get("city")

element.ForEach("tr td:first-child", func(i int, e *colly.HTMLElement) {

county := e.ChildText("a")

fmt.Println(city, county)

relative_url := e.ChildAttr("a", "href")

if relative_url != "" {

absURL := e.Request.AbsoluteURL(relative_url)

//fmt.Println(absURL)

ctx := colly.NewContext()

ctx.Put("city", city)

ctx.Put("county", county)

_ = countyCollector.Request("GET", absURL, nil, ctx, nil)

}

})

})


// 区县 http://www.tcmap.com.cn/shandong/lixiaqu.html

countyCollector.OnHTML("#pagebody #page_left > table", func(element *colly.HTMLElement) {

city := element.Request.Ctx.Get("city")

county := element.Request.Ctx.Get("county")

element.ForEach("tr td:first-child", func(i int, e *colly.HTMLElement) {

town := e.ChildText("a")

fmt.Println(city, county, town)

relative_url := e.ChildAttr("a", "href")

if relative_url != "" {

absURL := e.Request.AbsoluteURL(relative_url)

//fmt.Println(absURL)

ctx := colly.NewContext()

ctx.Put("city", city)

ctx.Put("county", county)

ctx.Put("town", town)

_ = townCollector.Request("GET", absURL, nil, ctx, nil)

}

})

})


// 乡镇 http://www.tcmap.com.cn/shandong/lixiaqu_jiefanglujiedao.html

townCollector.OnHTML("#pagebody #page_left > table", func(element *colly.HTMLElement) {

city := element.Request.Ctx.Get("city")

county := element.Request.Ctx.Get("county")

town := element.Request.Ctx.Get("town")

element.ForEach("tr td:first-child", func(i int, e *colly.HTMLElement) {

village := e.ChildText("a")

if village != "" {

fmt.Println(city, county, town, village)

_ = save(db, city, county, town, village)

}

})

})


_ = c.Limit(&colly.LimitRule{

DomainGlob:  "*",

RandomDelay: 1 * time.Second, // 延时

})

_ = c.Visit("http://www.tcmap.com.cn/shandong/")

// c.Wait()

}


// Village is the database record for one village together with the city,
// county and town it was found under.
type Village struct {
	// ID is tagged as the primary key for gorm.
	ID uint `gorm:"primaryKey"`

	// Names of the enclosing administrative levels, followed by the
	// village's own name.
	City, County, Town, Village string
}

// TableName returns the fixed table name "village" for Village records.
func (Village) TableName() string {
	return "village"
}


// save inserts one Village row built from city, county, town and village
// through db. It returns the error reported by the insert, or nil on success.
func save(db *gorm.DB, city string, county string, town string, village string) error {
	record := Village{City: city, County: county, Town: town, Village: village}
	// db is a plain handle here, not a transaction started with Begin, so
	// there is nothing to Commit; the result of Create carries the outcome.
	return db.Create(&record).Error
}