一:定义全局变量:
// Connection settings used by InitDB to build the MySQL data source name.
const ( // constants used to connect to the database
USERNAME = "root"
PASSWORD = "123456"
HOST = "127.0.0.1"
PORT = "3306"
DBNAME = "test"
)
// DB is the package-level database handle; InitDB assigns it and InsertSql uses it.
var DB *sql.DB
// MovieData holds the values scraped for a single movie entry and written to the movie table.
type MovieData struct { // stores the crawled data
Title string `json:"title"` // text of the first title span
Director string `json:"director"` // director part extracted from the info paragraph
Picture string `json:"picture"` // src attribute of the poster image
Actor string `json:"actor"` // cast part extracted from the info paragraph
Year string `json:"year"` // digits extracted from the info paragraph
Score string `json:"score"` // text of the rating_num span
Quote string `json:"quote"` // text of the quote span
}
二:初始化 MySQL 连接:
// InitDB opens the MySQL connection pool described by the USERNAME, PASSWORD,
// HOST, PORT and DBNAME constants, stores it in the package-level DB variable
// and pings the server to verify that it is reachable.
//
// Failures are reported on standard output. When the pool cannot be created
// DB is left unchanged; when only the ping fails DB is still assigned.
func InitDB() {
	path := strings.Join([]string{USERNAME, ":", PASSWORD, "@tcp(", HOST, ":", PORT, ")/", DBNAME, "?charset=utf8"}, "")
	// Log only the target: the full DSN contains the password.
	fmt.Println("connecting to", HOST+":"+PORT+"/"+DBNAME)

	db, err := sql.Open("mysql", path)
	if err != nil {
		// sql.Open returns a nil handle on error, so it must not be used.
		fmt.Println("opon database fail", err)
		return
	}
	// SetConnMaxLifetime takes a time.Duration; a bare 10 would mean 10
	// nanoseconds and force a new connection for practically every query.
	// The intended unit is assumed to be seconds.
	db.SetConnMaxLifetime(10 * time.Second)
	db.SetMaxIdleConns(5)
	DB = db

	if err := DB.Ping(); err != nil {
		fmt.Println("opon database fail", err)
		return
	}
	fmt.Println("connnect success")
}
三:实现爬虫
// Spider downloads one page of the Douban Top 250 list and stores every movie
// entry found on it through InsertSql.
//
// page is appended verbatim as the value of the "start" query parameter. If
// ch is non-nil, exactly one value is sent on it when Spider finishes: true
// when the page was fetched and parsed, false when building the request, the
// HTTP exchange, the response status or the HTML parsing failed. Failures are
// logged instead of terminating the process, so a caller waiting on ch is
// always released and can still run its own cleanup.
func Spider(page string, ch chan bool) {
	success := false
	if ch != nil {
		// Deferred so that every return path releases the waiting caller.
		defer func() { ch <- success }()
	}

	// A client without a timeout can block forever on a stalled connection.
	client := &http.Client{Timeout: 30 * time.Second}
	req, err := http.NewRequest("GET", "https://movie.douban.com/top250?start="+page, nil)
	if err != nil {
		log.Printf("spider: building request for page %q: %v", page, err)
		return
	}

	// Browser-like request headers.
	headers := [][2]string{
		{"Connection", "keep-alive"},
		{"Cache-Control", "max-age=0"},
		{"sec-ch-ua-mobile", "?0"},
		{"Upgrade-Insecure-Requests", "1"},
		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"},
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
		{"Sec-Fetch-Site", "same-origin"},
		{"Sec-Fetch-Mode", "navigate"},
		{"Sec-Fetch-User", "?1"},
		{"Sec-Fetch-Dest", "document"},
		{"Referer", "https://movie.douban.com/chart"},
		{"Accept-Language", "zh-CN,zh;q=0.9"},
	}
	for _, h := range headers {
		req.Header.Set(h[0], h[1])
	}

	resp, err := client.Do(req)
	if err != nil {
		log.Printf("spider: fetching page %q: %v", page, err)
		return
	}
	defer resp.Body.Close()

	// A non-200 answer carries no movie list; parsing it would silently
	// store nothing.
	if resp.StatusCode != http.StatusOK {
		log.Printf("spider: unexpected status %s for page %q", resp.Status, page)
		return
	}

	docDetail, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Println("fatal err")
		log.Printf("spider: parsing page %q: %v", page, err)
		return
	}

	// One matched element per movie entry.
	docDetail.Find("#content > div > div.article > ol > li > div").
		Each(func(i int, s *goquery.Selection) {
			picture, found := s.Find("div.pic > a > img").Attr("src")
			if !found {
				// Entries without a poster are skipped.
				fmt.Println("not ok")
				return
			}

			// TrimSpace also removes the newlines that surround the text
			// nodes, which trimming only " " left in place.
			info := strings.TrimSpace(s.Find("div.info > div.bd > p:nth-child(1)").Text())
			director, actor, year := InfoSpite(info)

			movieData := MovieData{
				Title:    s.Find("div.info > div.hd > a > span:nth-child(1)").Text(),
				Director: director,
				Picture:  picture,
				Actor:    actor,
				Year:     year,
				Score:    strings.TrimSpace(s.Find("div.info > div.bd > div > span.rating_num").Text()),
				Quote:    strings.TrimSpace(s.Find("div.info > div.bd > p.quote > span").Text()),
			}
			if !InsertSql(movieData) {
				log.Printf("spider: could not store %q", movieData.Title)
			}
		})

	success = true
}
四:初始化数据库
// InitDB opens the MySQL connection pool described by the USERNAME, PASSWORD,
// HOST, PORT and DBNAME constants, stores it in the package-level DB variable
// and pings the server to verify that it is reachable.
//
// Failures are reported on standard output. When the pool cannot be created
// DB is left unchanged; when only the ping fails DB is still assigned.
func InitDB() {
	path := strings.Join([]string{USERNAME, ":", PASSWORD, "@tcp(", HOST, ":", PORT, ")/", DBNAME, "?charset=utf8"}, "")
	// Log only the target: the full DSN contains the password.
	fmt.Println("connecting to", HOST+":"+PORT+"/"+DBNAME)

	db, err := sql.Open("mysql", path)
	if err != nil {
		// sql.Open returns a nil handle on error, so it must not be used.
		fmt.Println("opon database fail", err)
		return
	}
	// SetConnMaxLifetime takes a time.Duration; a bare 10 would mean 10
	// nanoseconds and force a new connection for practically every query.
	// The intended unit is assumed to be seconds.
	db.SetConnMaxLifetime(10 * time.Second)
	db.SetMaxIdleConns(5)
	DB = db

	if err := DB.Ping(); err != nil {
		fmt.Println("opon database fail", err)
		return
	}
	fmt.Println("connnect success")
}
五:将数据插入到 MySQL
// InsertSql writes one MovieData record into the movie table inside its own
// transaction.
//
// It returns true only when the row was inserted and the transaction was
// committed. Every failure is printed to standard output and reported as
// false; the transaction is rolled back in that case.
func InsertSql(movieData MovieData) bool {
	fmt.Println("InsertSql")
	if DB == nil {
		// Calling Begin on a nil handle would panic.
		fmt.Println("tx fail", "database is not initialized")
		return false
	}

	tx, err := DB.Begin()
	if err != nil {
		fmt.Println("tx fail", err)
		return false
	}
	// Releases the connection on every early return. After a successful
	// Commit the call is a harmless no-op that returns sql.ErrTxDone, so its
	// result is intentionally ignored.
	defer tx.Rollback()

	stmt, err := tx.Prepare("INSERT INTO movie(`Title`,`Director`,`Picture`,`Actor`,`Year`,`Score`,`Quote`) VALUES (?,?, ?,?,?,?,?)")
	if err != nil {
		fmt.Println("Prepare fail", err)
		return false
	}
	defer stmt.Close()

	if _, err = stmt.Exec(movieData.Title, movieData.Director, movieData.Picture, movieData.Actor, movieData.Year, movieData.Score, movieData.Quote); err != nil {
		fmt.Println("Exec fail", err)
		return false
	}

	// A failed commit means the row was not stored.
	if err = tx.Commit(); err != nil {
		fmt.Println("Commit fail", err)
		return false
	}
	return true
}
完整代码:
package main
import (
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	_ "github.com/jinzhu/gorm/dialects/mysql"
)
// Connection settings used by InitDB to build the MySQL data source name.
const (
USERNAME = "root"
PASSWORD = "123456"
HOST = "127.0.0.1"
PORT = "3306"
DBNAME = "test"
)
// DB is the package-level database handle; InitDB assigns it, InsertSql uses it and main closes it.
var DB *sql.DB
// MovieData holds the values scraped for a single movie entry and written to the movie table.
type MovieData struct {
Title string `json:"title"` // text of the first title span
Director string `json:"director"` // director part extracted from the info paragraph
Picture string `json:"picture"` // src attribute of the poster image
Actor string `json:"actor"` // cast part extracted from the info paragraph
Year string `json:"year"` // digits extracted from the info paragraph
Score string `json:"score"` // text of the rating_num span
Quote string `json:"quote"` // text of the quote span
}
// main initializes the database handle, crawls the page at offset 0 in a
// goroutine, waits for the crawler's completion signal and then closes the
// database handle.
func main() {
	InitDB()
	// Runs once the crawler has signalled completion below.
	defer DB.Close()

	done := make(chan bool)
	firstOffset := strconv.Itoa(0 * 25)
	go Spider(firstOffset, done)
	<-done
}
// Spider downloads one page of the Douban Top 250 list and stores every movie
// entry found on it through InsertSql.
//
// page is appended verbatim as the value of the "start" query parameter. If
// ch is non-nil, exactly one value is sent on it when Spider finishes: true
// when the page was fetched and parsed, false when building the request, the
// HTTP exchange, the response status or the HTML parsing failed. Failures are
// logged instead of terminating the process, so a caller waiting on ch is
// always released and can still run its own cleanup.
func Spider(page string, ch chan bool) {
	success := false
	if ch != nil {
		// Deferred so that every return path releases the waiting caller.
		defer func() { ch <- success }()
	}

	// A client without a timeout can block forever on a stalled connection.
	client := &http.Client{Timeout: 30 * time.Second}
	req, err := http.NewRequest("GET", "https://movie.douban.com/top250?start="+page, nil)
	if err != nil {
		log.Printf("spider: building request for page %q: %v", page, err)
		return
	}

	// Browser-like request headers.
	headers := [][2]string{
		{"Connection", "keep-alive"},
		{"Cache-Control", "max-age=0"},
		{"sec-ch-ua-mobile", "?0"},
		{"Upgrade-Insecure-Requests", "1"},
		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"},
		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
		{"Sec-Fetch-Site", "same-origin"},
		{"Sec-Fetch-Mode", "navigate"},
		{"Sec-Fetch-User", "?1"},
		{"Sec-Fetch-Dest", "document"},
		{"Referer", "https://movie.douban.com/chart"},
		{"Accept-Language", "zh-CN,zh;q=0.9"},
	}
	for _, h := range headers {
		req.Header.Set(h[0], h[1])
	}

	resp, err := client.Do(req)
	if err != nil {
		log.Printf("spider: fetching page %q: %v", page, err)
		return
	}
	defer resp.Body.Close()

	// A non-200 answer carries no movie list; parsing it would silently
	// store nothing.
	if resp.StatusCode != http.StatusOK {
		log.Printf("spider: unexpected status %s for page %q", resp.Status, page)
		return
	}

	docDetail, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Println("fatal err")
		log.Printf("spider: parsing page %q: %v", page, err)
		return
	}

	// One matched element per movie entry.
	docDetail.Find("#content > div > div.article > ol > li > div").
		Each(func(i int, s *goquery.Selection) {
			picture, found := s.Find("div.pic > a > img").Attr("src")
			if !found {
				// Entries without a poster are skipped.
				fmt.Println("not ok")
				return
			}

			// TrimSpace also removes the newlines that surround the text
			// nodes, which trimming only " " left in place.
			info := strings.TrimSpace(s.Find("div.info > div.bd > p:nth-child(1)").Text())
			director, actor, year := InfoSpite(info)

			movieData := MovieData{
				Title:    s.Find("div.info > div.hd > a > span:nth-child(1)").Text(),
				Director: director,
				Picture:  picture,
				Actor:    actor,
				Year:     year,
				Score:    strings.TrimSpace(s.Find("div.info > div.bd > div > span.rating_num").Text()),
				Quote:    strings.TrimSpace(s.Find("div.info > div.bd > p.quote > span").Text()),
			}
			if !InsertSql(movieData) {
				log.Printf("spider: could not store %q", movieData.Title)
			}
		})

	success = true
}
// Patterns used by InfoSpite, compiled once at package initialization.
var (
	infoDirectorRe = regexp.MustCompile(`导演:(.*)`)
	infoActorRe    = regexp.MustCompile(`主演:(.*)`)
	infoYearRe     = regexp.MustCompile(`\d{4}`)
)

// InfoSpite splits the info paragraph of a movie entry into its director,
// actor and year parts.
//
// director is the text between the "导演:" label and the "主演:" label on the
// same line, or the rest of the line when no "主演:" label follows. actor is
// the text after "主演:" up to the end of that line. Both are returned without
// their label and without surrounding whitespace. year is the first run of
// four digits anywhere in info. Any part that cannot be found is returned as
// the empty string.
func InfoSpite(info string) (director, actor, year string) {
	if m := infoDirectorRe.FindStringSubmatch(info); m != nil {
		credit := m[1]
		// The cast label, when present, ends the director part.
		if i := strings.Index(credit, "主演:"); i >= 0 {
			credit = credit[:i]
		}
		director = strings.TrimSpace(credit)
	}
	if m := infoActorRe.FindStringSubmatch(info); m != nil {
		actor = strings.TrimSpace(m[1])
	}
	year = infoYearRe.FindString(info)
	return director, actor, year
}
// InitDB opens the MySQL connection pool described by the USERNAME, PASSWORD,
// HOST, PORT and DBNAME constants, stores it in the package-level DB variable
// and pings the server to verify that it is reachable.
//
// Failures are reported on standard output. When the pool cannot be created
// DB is left unchanged; when only the ping fails DB is still assigned.
func InitDB() {
	path := strings.Join([]string{USERNAME, ":", PASSWORD, "@tcp(", HOST, ":", PORT, ")/", DBNAME, "?charset=utf8"}, "")
	// Log only the target: the full DSN contains the password.
	fmt.Println("connecting to", HOST+":"+PORT+"/"+DBNAME)

	db, err := sql.Open("mysql", path)
	if err != nil {
		// sql.Open returns a nil handle on error, so it must not be used.
		fmt.Println("opon database fail", err)
		return
	}
	// SetConnMaxLifetime takes a time.Duration; a bare 10 would mean 10
	// nanoseconds and force a new connection for practically every query.
	// The intended unit is assumed to be seconds.
	db.SetConnMaxLifetime(10 * time.Second)
	db.SetMaxIdleConns(5)
	DB = db

	if err := DB.Ping(); err != nil {
		fmt.Println("opon database fail", err)
		return
	}
	fmt.Println("connnect success")
}
// InsertSql writes one MovieData record into the movie table inside its own
// transaction.
//
// It returns true only when the row was inserted and the transaction was
// committed. Every failure is printed to standard output and reported as
// false; the transaction is rolled back in that case.
func InsertSql(movieData MovieData) bool {
	fmt.Println("InsertSql")
	if DB == nil {
		// Calling Begin on a nil handle would panic.
		fmt.Println("tx fail", "database is not initialized")
		return false
	}

	tx, err := DB.Begin()
	if err != nil {
		fmt.Println("tx fail", err)
		return false
	}
	// Releases the connection on every early return. After a successful
	// Commit the call is a harmless no-op that returns sql.ErrTxDone, so its
	// result is intentionally ignored.
	defer tx.Rollback()

	stmt, err := tx.Prepare("INSERT INTO movie(`Title`,`Director`,`Picture`,`Actor`,`Year`,`Score`,`Quote`) VALUES (?,?, ?,?,?,?,?)")
	if err != nil {
		fmt.Println("Prepare fail", err)
		return false
	}
	defer stmt.Close()

	if _, err = stmt.Exec(movieData.Title, movieData.Director, movieData.Picture, movieData.Actor, movieData.Year, movieData.Score, movieData.Quote); err != nil {
		fmt.Println("Exec fail", err)
		return false
	}

	// A failed commit means the row was not stored.
	if err = tx.Commit(); err != nil {
		fmt.Println("Commit fail", err)
		return false
	}
	return true
}