A concurrent web crawler in Go
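The program below reads a page range from standard input and crawls the Baidu Tieba board for Chongqing University (the kw parameter in the URL). main launches one goroutine per page; each goroutine downloads its page, writes the raw HTML to a file, and reports back over a channel, which is how the collector loop in scapy knows when every page has finished.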
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
)

func main() {
	// Read the page range, then crawl each page into its own file.
	var start, end int
	fmt.Println("Enter the start page to crawl (>= 1):")
	fmt.Scan(&start)
	fmt.Println("Enter the end page to crawl (>= start):")
	fmt.Scan(&end)
	fmt.Println("Starting crawl")
	scapy(start, end)
}

// spiderhtml downloads one Tieba result page, saves the raw HTML to a
// local file, and reports its page number on pagechan when it is done.
func spiderhtml(i int, pagechan chan int) {
	// Signal completion even on failure, so the collector loop in scapy
	// never blocks waiting for a page that errored out.
	defer func() { pagechan <- i }()

	// Each result page lists 50 posts, so page i starts at offset (i-1)*50.
	url := "http://tieba.baidu.com/f?kw=%E9%87%8D%E5%BA%86%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("http get err:", err)
		return // resp is nil here; closing its Body would panic
	}
	defer resp.Body.Close()

	f, err := os.Create("page" + strconv.Itoa(i) + ".html")
	if err != nil {
		fmt.Println("os create err:", err)
		return
	}
	defer f.Close()

	// Stream the response body into the file in 4 KiB chunks.
	buf := make([]byte, 4096)
	for {
		n, err := resp.Body.Read(buf)
		if n > 0 {
			f.Write(buf[:n])
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Println("resp body err:", err)
			break
		}
	}
}

// scapy fans out one goroutine per page, then receives exactly one
// completion signal per page; this is what keeps main alive until
// every download has finished.
func scapy(start, end int) {
	pagechan := make(chan int)
	fmt.Printf("Crawling pages %d through %d\n", start, end)
	for i := start; i <= end; i++ {
		go spiderhtml(i, pagechan)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("Page %d finished\n", <-pagechan)
	}
}
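The manual 4 KiB Read/Write loop in spiderhtml is handy for seeing how a response body streams in, but the standard library's io.Copy does the same job and handles the EOF bookkeeping itself. Assuming the same f and resp as above, the whole loop collapses to:

// Equivalent to the buffer loop: stream resp.Body into f.
if _, err := io.Copy(f, resp.Body); err != nil {
	fmt.Println("copy err:", err)
}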
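Starting one goroutine per page works for a small range, but a large range opens that many simultaneous connections to the server. A common refinement is a buffered channel used as a counting semaphore to cap how many downloads run at once. Below is a minimal sketch under that assumption (crawlAll and maxWorkers are illustrative names, not part of the original program); it drops in as a replacement for scapy:

// crawlAll behaves like scapy but allows at most maxWorkers
// simultaneous downloads.
func crawlAll(start, end, maxWorkers int) {
	sem := make(chan struct{}, maxWorkers) // counting semaphore
	pagechan := make(chan int)
	for i := start; i <= end; i++ {
		go func(page int) {
			sem <- struct{}{}        // acquire a slot (blocks when full)
			defer func() { <-sem }() // release the slot when done
			spiderhtml(page, pagechan)
		}(i)
	}
	// Same join as before: one receive per page.
	for i := start; i <= end; i++ {
		fmt.Printf("Page %d finished\n", <-pagechan)
	}
}

A call such as crawlAll(start, end, 5) in main would keep at most five requests in flight at a time.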