package main import ( "fmt" "time" "os" "io/ioutil" "strings" "sync" "net/http" "regexp" "html" "github.com/360EntSecGroup-Skylar/excelize" "bufio" "golang.org/x/net/html/charset" "golang.org/x/text/transform" "io" ) //前后截取返回首个匹配字符串 func data_findall(a,b,c string) string{ reg:=regexp.MustCompile("(?s)"+a+"(.+?)"+b) d:=reg.FindString(c) if len(d)!=0{ if a!="^"{ d=strings.Replace(d,a,"",1) } if b!="$"{ d=strings.Replace(d,b,"",1) } d=strings.Replace(d,`&nbsp;`,"",-1) d=html.UnescapeString(d) //html解码 re,_:=regexp.Compile(`(?s)<(.+?)>`) d=re.ReplaceAllString(d,"") re2,_:=regexp.Compile(`\s\s\s*`) d=re2.ReplaceAllString(d," ") d=strings.Trim(d," ") return d }else{ return "" } } //前后截取返回匹配数组 func data_findall_all(a,b,c string) []string{ reg:=regexp.MustCompile("(?s)"+a+"(.+?)"+b) d:=reg.FindAllString(c,-1) f:=[]string{} if len(d)!=0{ for i:=0;i<len(d);i++{ e:=d[i] if a!="^"{ e=strings.Replace(e,a,"",1) } if b!="$"{ e=strings.Replace(e,b,"",1) } e=strings.Replace(e,`&nbsp;`,"",-1) e=html.UnescapeString(e) //html解码 re,_:=regexp.Compile(`(?s)<(.+?)>`) e=re.ReplaceAllString(e,"") re2,_:=regexp.Compile(`\s\s\s*`) e=re2.ReplaceAllString(e," ") e=strings.Trim(e," ") f=append(f,e) } return f }else{ return []string{} } } //读取text func text_read(path string) []string{ fi,_:=os.Open(path) defer fi.Close() fd,_:=ioutil.ReadAll(fi) a:=string(fd) a=strings.Replace(a,"\r","",-1) if len(a)>3 && a[:3]=="\xEF\xBB\xBF"{ a=a[3:] } if len(a)>0 && a[len(a)-1:]=="\n"{ a=a[:len(a)-1] } b:=strings.Split(a,"\n") fmt.Println("读取结束") return b } //读取Excel func excel_read(path string) [][]string{ f,_:=excelize.OpenFile(path) rows,_:= f.GetRows("Sheet1") a:=[][]string{} for i:=0;i<len(rows);i++{ a=append(a,rows[i]) } return a } //写入Excel func excel_write(a [][]string,path string) { file:=excelize.NewFile() file.SetDefaultFont("宋体") streamWriter,_:= file.NewStreamWriter("Sheet1") for x:=0;x<len(a);x++{ row:= make([]interface{},len(a[x])) for y:=0;y<len(a[x]);y++{ row[y]=a[x][y] } cell,_:=excelize.CoordinatesToCellName(1,x+1) streamWriter.SetRow(cell,row) } streamWriter.Flush() file.SaveAs(path) } //自动转码 func data_encoding(r io.Reader) []byte{ OldReader:= bufio.NewReader(r) bytes,_:= OldReader.Peek(1024) e,_,_:= charset.DetermineEncoding(bytes, "") reader:= transform.NewReader(OldReader, e.NewDecoder()) all,_:= ioutil.ReadAll(reader) return all } //get请求 func request_get(url,cookie string) string{ req,_ := http.NewRequest("GET",url,nil) req.Header.Add("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36") req.Header.Add("Cookie",cookie) resp,err:= http.DefaultClient.Do(req) if err!=nil{ fmt.Println(err) return "" } defer resp.Body.Close() return string(data_encoding(resp.Body)) //body,_:= ioutil.ReadAll(resp.Body) //return string(body) } //post请求 func request_post(url,cookie,post_data string) string{ req,_ := http.NewRequest("POST",url,strings.NewReader(post_data)) req.Header.Add("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36") req.Header.Add("Content-Type","application/x-www-form-urlencoded; charset=UTF-8") req.Header.Add("Cookie",cookie) resp,err:= http.DefaultClient.Do(req) if err!=nil{ fmt.Println(err) return "" } defer resp.Body.Close() return string(data_encoding(resp.Body)) //body,_:= ioutil.ReadAll(resp.Body) //return string(body) } //请求网页并存入数组 func data_request(url string,ch chan bool){ ch<-true //time.Sleep(time.Second) b:=[]string{} b=append(b,url) cookie:="" page:=request_get(url,cookie) title:=data_findall(`<title>`,`</title>`,page) b=append(b,title) if page==""{ data_lost=append(data_lost,b) fmt.Println("请求失败") }else{ data_all=append(data_all,b) fmt.Println(len(data_all)-1,title) } <-ch wg.Done() } //全局变量 var data_all [][]string var data_lost [][]string var wg sync.WaitGroup func main() { ch:=make(chan bool,3)//设置线程数 start:=time.Now() data_read:=text_read("urls.txt") data_all=append(data_all,[]string{"URL","Title"}) for i:=0;i<len(data_read);i++{ wg.Add(1) go data_request(data_read[i],ch) } wg.Wait() cost:=time.Since(start) if len(data_all)>1{ excel_write(data_all,"data.xlsx") } if len(data_lost)>0{ excel_write(data_lost,"data_lost.xlsx") } fmt.Printf("\n共采集 %d 条数据\n",len(data_all)-1) fmt.Printf("耗时 %s\n",cost) var str string fmt.Scan(&str) }