Colly是Go的爬虫框架,简单快速,适合日常工作获取数据。
安装:go get -u github.com/gocolly/colly/...
示例1
package main
import (
"fmt"
"time"
"github.com/gocolly/colly"
)
// main demonstrates the basic colly callbacks. It configures a collector,
// registers the request, response, HTML, XML and error hooks, and then crawls
// outward from the Shandong index page of www.tcmap.com.cn.
func main() {
	ua := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
	c := colly.NewCollector(
		// Set the User-Agent header.
		colly.UserAgent(ua),
		// Detect the response charset automatically to avoid garbled text.
		colly.DetectCharset(),
		// Restrict crawling to this domain.
		colly.AllowedDomains("www.tcmap.com.cn"),
		// Revisiting is enabled below and every matching link is followed, so
		// without a depth limit the crawl would recurse without end.
		colly.MaxDepth(2),
	)
	// Options can also be set as fields: allow the same URL to be visited again.
	c.AllowURLRevisit = true

	// Route traffic through a local SOCKS5 proxy. The scheme must be "socks5";
	// a bare "socks" scheme is not recognised as a SOCKS proxy.
	if err := c.SetProxy("socks5://127.0.0.1:10808"); err != nil {
		fmt.Println(err)
		return
	}

	// Called for HTML responses; elements are matched with a goquery selector.
	c.OnHTML(`a[href*="shandong"]`, func(h *colly.HTMLElement) {
		// fmt.Println(h.Text)
		// Resolve the link against the page URL to get an absolute URL.
		href := h.Request.AbsoluteURL(h.Attr("href"))
		// Errors here (depth limit reached, domain filtered, empty link) are
		// an expected part of crawling, so they are deliberately ignored.
		_ = h.Request.Visit(href)
		// Read the value that OnRequest stored in the shared context.
		city := h.Response.Ctx.Get("city")
		fmt.Println(city)
	})

	// Add a random delay of up to one second between requests.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 1 * time.Second,
	}); err != nil {
		fmt.Println(err)
		return
	}

	// Called before each request is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("访问:", r.URL)
		// Pass data from the request to the response via the context.
		r.Ctx.Put("city", "城市")
	})

	// Called after a response has been received.
	c.OnResponse(func(r *colly.Response) {
		// fmt.Println(string(r.Body))
	})

	// Elements can also be selected with XPath. The expression must be a
	// complete path; "//" on its own is not valid XPath.
	c.OnXML("//title", func(element *colly.XMLElement) {
	})

	// Called when a request fails.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println(err)
	})

	if err := c.Visit("http://www.tcmap.com.cn/shandong/"); err != nil {
		fmt.Println(err)
	}
}
示例2
package main
import (
"fmt"
"github.com/gocolly/colly"
"gorm.io/driver/mysql"
"gorm.io/gorm"
"time"
)
// main crawls www.tcmap.com.cn starting from the Shandong province page and
// walks four levels (province -> city -> county -> town), using one collector
// per level. The names gathered so far travel to the next level in the request
// context. Every village found on a town page is stored through save.
func main() {
	dsn := "root:pass@tcp(127.0.0.1:3306)/test?charset=utf8mb4&parseTime=True&loc=Local"
	db, err := gorm.Open(mysql.New(mysql.Config{
		DSN:               dsn,
		DefaultStringSize: 256,
	}), &gorm.Config{})
	if err != nil {
		// Without a database handle nothing can be stored, so stop here
		// instead of crawling with an unusable db.
		fmt.Println("连结数据库失败", err)
		return
	}

	ua := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
	c := colly.NewCollector(
		// Set the User-Agent header.
		colly.UserAgent(ua),
		// Detect the response charset automatically to avoid garbled text.
		colly.DetectCharset(),
		// Restrict crawling to this domain.
		colly.AllowedDomains("www.tcmap.com.cn"),
	)
	cityCollector := c.Clone()
	countyCollector := c.Clone()
	townCollector := c.Clone()

	// Every level uses the same page layout: a table whose first column
	// holds the links to the next level.
	const (
		tableSel = "#pagebody #page_left > table"
		cellSel  = "tr td:first-child"
	)

	// follow requests the link found in cell on the next-level collector,
	// attaching ctx so the names collected so far reach that level's handler.
	// Cells without a link are skipped.
	follow := func(cell *colly.HTMLElement, next *colly.Collector, ctx *colly.Context) {
		relURL := cell.ChildAttr("a", "href")
		if relURL == "" {
			return
		}
		// Request errors (e.g. an already visited URL) are an expected part of
		// crawling, so they are deliberately ignored.
		_ = next.Request("GET", cell.Request.AbsoluteURL(relURL), nil, ctx, nil)
	}

	// Province page, e.g. http://www.tcmap.com.cn/shandong/
	c.OnHTML(tableSel, func(element *colly.HTMLElement) {
		element.ForEach(cellSel, func(_ int, e *colly.HTMLElement) {
			city := e.ChildText("a")
			fmt.Println(city)
			ctx := colly.NewContext()
			ctx.Put("city", city)
			follow(e, cityCollector, ctx)
		})
	})

	// City page, e.g. http://www.tcmap.com.cn/shandong/jinan.html
	cityCollector.OnHTML(tableSel, func(element *colly.HTMLElement) {
		city := element.Request.Ctx.Get("city")
		element.ForEach(cellSel, func(_ int, e *colly.HTMLElement) {
			county := e.ChildText("a")
			fmt.Println(city, county)
			ctx := colly.NewContext()
			ctx.Put("city", city)
			ctx.Put("county", county)
			follow(e, countyCollector, ctx)
		})
	})

	// County page, e.g. http://www.tcmap.com.cn/shandong/lixiaqu.html
	countyCollector.OnHTML(tableSel, func(element *colly.HTMLElement) {
		city := element.Request.Ctx.Get("city")
		county := element.Request.Ctx.Get("county")
		element.ForEach(cellSel, func(_ int, e *colly.HTMLElement) {
			town := e.ChildText("a")
			fmt.Println(city, county, town)
			ctx := colly.NewContext()
			ctx.Put("city", city)
			ctx.Put("county", county)
			ctx.Put("town", town)
			follow(e, townCollector, ctx)
		})
	})

	// Town page, e.g. http://www.tcmap.com.cn/shandong/lixiaqu_jiefanglujiedao.html
	townCollector.OnHTML(tableSel, func(element *colly.HTMLElement) {
		city := element.Request.Ctx.Get("city")
		county := element.Request.Ctx.Get("county")
		town := element.Request.Ctx.Get("town")
		element.ForEach(cellSel, func(_ int, e *colly.HTMLElement) {
			village := e.ChildText("a")
			if village == "" {
				return
			}
			fmt.Println(city, county, town, village)
			if err := save(db, city, county, town, village); err != nil {
				fmt.Println(err)
			}
		})
	})

	// Add a random delay of up to one second between requests.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 1 * time.Second,
	}); err != nil {
		fmt.Println(err)
		return
	}

	// The collectors are not created in async mode, so no c.Wait() is needed
	// after Visit.
	if err := c.Visit("http://www.tcmap.com.cn/shandong/"); err != nil {
		fmt.Println(err)
	}
}
// Village is the GORM model for one scraped record: a village name together
// with the city, county and town it was found under.
type Village struct {
	// ID is the primary key of the record.
	ID uint `gorm:"primaryKey"`

	// Names along the administrative hierarchy, from broadest to narrowest.
	City, County, Town, Village string
}
// TableName returns the name of the database table that stores Village
// records.
func (Village) TableName() string {
	const table = "village"
	return table
}
// save inserts one Village record built from the given names and returns any
// error reported by the insert.
//
// No explicit Commit is issued: db is not a transaction started with Begin,
// and calling Commit on such a handle only records an invalid-transaction
// error.
func save(db *gorm.DB, city string, county string, town string, village string) error {
	record := Village{City: city, County: county, Town: town, Village: village}
	return db.Create(&record).Error
}
参考链接