// golang采集模板 (Go web-scraping template)
package main
import (
"fmt"
"time"
"os"
"io/ioutil"
"strings"
"sync"
"net/http"
"regexp"
"html"
"github.com/360EntSecGroup-Skylar/excelize"
"bufio"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"io"
)
// Patterns used by findallClean, compiled once instead of on every call.
var (
	// findallTagRe matches anything from '<' up to the next '>'.
	findallTagRe = regexp.MustCompile(`(?s)<(.+?)>`)
	// findallSpaceRe matches a run of two or more whitespace characters.
	findallSpaceRe = regexp.MustCompile(`\s\s\s*`)
)

// findallGroup names the capture group holding the text between the delimiters.
// A named group is used so that capture groups inside the caller's own
// delimiter patterns cannot shift its position.
const findallGroup = "findall_inner"

// findallCompile builds the extraction pattern "a, shortest non-empty text, b"
// (with '.' also matching newlines) and returns it together with the index of
// the capture group that holds the text between a and b.
// It panics if the combined pattern is not a valid regular expression.
func findallCompile(a, b string) (*regexp.Regexp, int) {
	reg := regexp.MustCompile("(?s)" + a + "(?P<" + findallGroup + ">.+?)" + b)
	for i, name := range reg.SubexpNames() {
		if name == findallGroup {
			return reg, i
		}
	}
	return reg, 0 // not reachable: the group is always part of the pattern
}

// findallClean normalizes an extracted fragment: it decodes HTML entities,
// removes tags, collapses whitespace runs into one space and trims spaces
// from both ends.
func findallClean(s string) string {
	// NOTE(review): the literal below looks like a single space, which also
	// deletes the spaces between words; it may originally have been the
	// "&nbsp;" entity. Kept as found; confirm the intent.
	s = strings.Replace(s, ` `, "", -1)
	// Entities are decoded before tags are stripped, so escaped tags
	// (e.g. "&lt;b&gt;") are removed as well.
	s = html.UnescapeString(s)
	s = findallTagRe.ReplaceAllString(s, "")
	s = findallSpaceRe.ReplaceAllString(s, " ")
	return strings.Trim(s, " ")
}

// data_findall returns the cleaned text between the first occurrence of the
// delimiters a and b in c, or "" when there is no match.
//
// a and b are inserted into a regular expression unescaped, so they may be
// plain text or regex fragments; "^" and "$" anchor to the start and end of c.
// The text between them is taken from a capture group, which means regex
// delimiters are removed correctly and a closing delimiter that also appears
// as the first character of the content no longer truncates the result.
// An invalid pattern causes a panic.
func data_findall(a, b, c string) string {
	reg, idx := findallCompile(a, b)
	m := reg.FindStringSubmatch(c)
	if m == nil {
		return ""
	}
	return findallClean(m[idx])
}

// data_findall_all works like data_findall but returns the cleaned text of
// every non-overlapping match in c, in order of appearance. The result is an
// empty, non-nil slice when nothing matches.
func data_findall_all(a, b, c string) []string {
	reg, idx := findallCompile(a, b)
	matches := reg.FindAllStringSubmatch(c, -1)
	f := make([]string, 0, len(matches))
	for _, m := range matches {
		f = append(f, findallClean(m[idx]))
	}
	return f
}
// text_read reads the text file at path and returns its lines.
//
// Carriage returns are removed, a leading UTF-8 byte order mark is dropped and
// one trailing newline is ignored, so a file ending in "\n" does not produce a
// final empty entry. Blank lines inside the file are returned as empty strings.
//
// If the file cannot be opened or read, the error is printed and an empty
// slice is returned. An empty file also yields an empty slice rather than a
// single empty line.
func text_read(path string) []string {
	fi, err := os.Open(path)
	if err != nil {
		fmt.Println(err)
		return []string{}
	}
	defer fi.Close()

	fd, err := ioutil.ReadAll(fi)
	if err != nil {
		fmt.Println(err)
		return []string{}
	}

	a := strings.Replace(string(fd), "\r", "", -1)
	// TrimPrefix also handles a file that consists of the BOM only.
	a = strings.TrimPrefix(a, "\xEF\xBB\xBF")
	a = strings.TrimSuffix(a, "\n")
	if a == "" {
		fmt.Println("读取结束")
		return []string{}
	}

	b := strings.Split(a, "\n")
	fmt.Println("读取结束")
	return b
}
// excel_read opens the workbook at path and returns every row of the sheet
// named "Sheet1" as a slice of cell strings.
//
// If the workbook cannot be opened or the rows cannot be read, the error is
// printed and an empty slice is returned; previously a failed open left the
// file handle nil and the following GetRows call dereferenced it.
func excel_read(path string) [][]string {
	f, err := excelize.OpenFile(path)
	if err != nil {
		fmt.Println(err)
		return [][]string{}
	}
	rows, err := f.GetRows("Sheet1")
	if err != nil {
		fmt.Println(err)
		return [][]string{}
	}
	// Copy the row headers into a fresh, non-nil slice.
	a := make([][]string, 0, len(rows))
	a = append(a, rows...)
	return a
}
// excel_write writes the rows in a to sheet "Sheet1" of a new workbook and
// saves it at path. Row x of a is written starting at column A of worksheet
// row x+1. The workbook's default font is set to "宋体".
//
// Any error from the stream writer or from saving is printed and stops the
// write, instead of being discarded, so a failed save is visible.
func excel_write(a [][]string, path string) {
	file := excelize.NewFile()
	file.SetDefaultFont("宋体")
	streamWriter, err := file.NewStreamWriter("Sheet1")
	if err != nil {
		fmt.Println(err)
		return
	}
	for x, cols := range a {
		// SetRow takes a slice of empty-interface values, so convert the strings.
		row := make([]interface{}, len(cols))
		for y, v := range cols {
			row[y] = v
		}
		cell, err := excelize.CoordinatesToCellName(1, x+1)
		if err != nil {
			fmt.Println(err)
			return
		}
		if err := streamWriter.SetRow(cell, row); err != nil {
			fmt.Println(err)
			return
		}
	}
	if err := streamWriter.Flush(); err != nil {
		fmt.Println(err)
		return
	}
	if err := file.SaveAs(path); err != nil {
		fmt.Println(err)
	}
}
// data_encoding reads everything from r and returns it decoded with an
// automatically detected character encoding.
//
// Detection is done by charset.DetermineEncoding on up to the first 1024
// bytes, with an empty content type. Errors from peeking, detection and
// reading are ignored, so whatever could be read and decoded is returned.
func data_encoding(r io.Reader) []byte {
	buffered := bufio.NewReader(r)
	// Peek does not consume input, so the decoder below still sees these bytes.
	head, _ := buffered.Peek(1024)
	enc, _, _ := charset.DetermineEncoding(head, "")
	decoded, _ := ioutil.ReadAll(transform.NewReader(buffered, enc.NewDecoder()))
	return decoded
}
// request_get sends a GET request to url with a desktop Chrome User-Agent and
// the given Cookie header, and returns the response body decoded by
// data_encoding.
//
// It returns "" and prints the error when the request cannot be built (for
// example a malformed URL, which previously caused a nil-pointer panic) or
// cannot be sent. The HTTP status code is not inspected.
func request_get(url, cookie string) string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println(err)
		return ""
	}
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36")
	req.Header.Add("Cookie", cookie)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println(err)
		return ""
	}
	defer resp.Body.Close()
	// To return the body without charset conversion, read resp.Body with
	// ioutil.ReadAll instead.
	return string(data_encoding(resp.Body))
}
// request_post sends post_data as a form-urlencoded POST body to url with a
// desktop Chrome User-Agent and the given Cookie header, and returns the
// response body decoded by data_encoding.
//
// It returns "" and prints the error when the request cannot be built (for
// example a malformed URL, which previously caused a nil-pointer panic) or
// cannot be sent. The HTTP status code is not inspected.
func request_post(url, cookie, post_data string) string {
	req, err := http.NewRequest("POST", url, strings.NewReader(post_data))
	if err != nil {
		fmt.Println(err)
		return ""
	}
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36")
	req.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
	req.Header.Add("Cookie", cookie)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println(err)
		return ""
	}
	defer resp.Body.Close()
	// To return the body without charset conversion, read resp.Body with
	// ioutil.ReadAll instead.
	return string(data_encoding(resp.Body))
}
//请求网页并存入数组
func data_request(url string,ch chan bool){
ch<-true
//time.Sleep(time.Second)
b:=[]string{}
b=append(b,url)
cookie:=""
page:=request_get(url,cookie)
title:=data_findall(`<title>`,`</title>`,page)
b=append(b,title)
if page==""{
data_lost=append(data_lost,b)
fmt.Println("请求失败")
}else{
data_all=append(data_all,b)
fmt.Println(len(data_all)-1,title)
}
<-ch
wg.Done()
}
// Package-level state shared by main and the data_request workers.
var (
	// data_all holds the header row followed by one row per fetched page.
	data_all [][]string
	// data_lost holds one row per URL whose request returned no page.
	data_lost [][]string
	// wg lets main wait until every data_request goroutine has finished.
	wg sync.WaitGroup
)
// main reads the URLs listed in urls.txt, fetches them concurrently, writes
// the collected rows to data.xlsx and the failed ones to data_lost.xlsx, then
// prints a summary and waits for input before exiting.
func main() {
	// The buffer size is the maximum number of requests in flight at once.
	limiter := make(chan bool, 3)
	began := time.Now()

	urls := text_read("urls.txt")
	data_all = append(data_all, []string{"URL", "Title"})

	// One goroutine per URL; each one waits for a free slot in limiter.
	for _, u := range urls {
		wg.Add(1)
		go data_request(u, limiter)
	}
	wg.Wait()
	elapsed := time.Since(began)

	// data_all always contains the header row, so more than one row means
	// at least one page was collected.
	if len(data_all) > 1 {
		excel_write(data_all, "data.xlsx")
	}
	if len(data_lost) > 0 {
		excel_write(data_lost, "data_lost.xlsx")
	}

	fmt.Printf("\n共采集 %d 条数据\n", len(data_all)-1)
	fmt.Printf("耗时 %s\n", elapsed)

	// Block on input so the output stays visible until the user types something.
	var pause string
	fmt.Scan(&pause)
}