// A web-novel crawler written in Go (基于 golang 的小说爬虫).
// DownLoad fetches bookUrl, parses the returned HTML for links, and
// enqueues every not-yet-visited absolute link that matches the host
// pattern and is not a ".html" article page.
//
// host is treated as a regular-expression pattern matched against each
// absolute link; pool tracks visited URLs and carries the work queue.
func DownLoad(host, bookUrl string, pool *Pool) {
	showTime("download start")
	// Mark the URL visited up front so concurrent workers skip it.
	pool.VisitMap.WriteMap(bookUrl)
	// NOTE(review): client has no Timeout set; a stalled server can
	// hang this goroutine indefinitely.
	client := http.Client{}
	req, err := http.NewRequest("GET", bookUrl, nil)
	if err != nil {
		fmt.Println(err)
		return // fix: original fell through and used a nil request
	}
	// Pretend to be a desktop browser; some sites block Go's default UA.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36")
	showTime("client.Do start")
	resp, err := client.Do(req)
	showTime("client.Do end")
	if err != nil {
		fmt.Println(err)
		return // fix: original fell through with resp possibly nil
	}
	if resp == nil {
		return
	}
	// fix: close the body on every remaining path; the original
	// returned before the defer when resp.Close was set, leaking it.
	defer resp.Body.Close()
	// The server signaled it is closing the connection; the original
	// treats this as "page unavailable" and gives up, observed in practice.
	if resp.Close {
		return
	}
	showTime("links start")
	body, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Println(err)
		return // fix: original ignored the parse error
	}
	links := visit(nil, body)
	showTime("links end")
	// fix: compile both patterns once instead of once per link.
	hostRe, err := regexp.Compile(host)
	if err != nil {
		fmt.Println(err)
		return
	}
	articleRe := regexp.MustCompile(`\.html`)
	for _, link := range links {
		absolute := urlJoin(link, bookUrl)
		// fix: renamed to avoid shadowing the host parameter and the
		// html package.
		sameHost := hostRe.MatchString(absolute)
		isArticle := articleRe.MatchString(absolute)
		// fix: urlJoin returns " " on parse failure, so the sentinel
		// check belongs on absolute, not on the bookUrl argument.
		if absolute != " " && sameHost && !isArticle {
			// Drop any #fragment so duplicates de-dupe to one URL.
			cut := strings.Index(absolute, "#")
			if cut != -1 {
				// NOTE(review): cut is a byte index while
				// exstrings.SubString may count runes — equivalent
				// only for ASCII URLs; verify for non-ASCII links.
				absolute = exstrings.SubString(absolute, 0, cut)
			}
			fmt.Println("current url: " + absolute)
			if !pool.VisitMap.ReadMap(absolute) {
				fmt.Println("add url: " + absolute)
				go urlQueue(absolute, pool)
			}
		}
	}
}
// urlQueue hands url off to the crawler pool by sending it on the
// pool's work channel; the send blocks until the queue can accept it.
func urlQueue(url string, pool *Pool) {
	pool.queue <- url
}
// urlJoin resolves href against base and returns the resulting absolute
// URL as a string. On any parse failure it returns the sentinel " "
// (single space), which callers treat as "no usable URL".
func urlJoin(href, base string) string {
	ref, refErr := url.Parse(href)
	if refErr != nil {
		return " "
	}
	root, baseErr := url.Parse(base)
	if baseErr != nil {
		return " "
	}
	return root.ResolveReference(ref).String()
}
// showTime is a timing/debug hook; the timestamped output is currently
// disabled and the call is a no-op.
func showTime(action string) {
	// Re-enable for timing diagnostics:
	// fmt.Println(fmt.Sprintf("%s :%s", action, time.Now().String()))
}