Scraping Runoob (菜鸟教程) Tutorials into a PDF
I have recently been learning Go on Runoob (菜鸟教程). Reading in the browser is not very convenient, and I wanted a PDF copy of the tutorial so I can take notes on an iPad. The plan: fetch the tutorial pages with Python using multiple threads, convert each HTML page to PDF with pdfkit, then merge the per-chapter PDFs with PyPDF2, adding proper bookmarks.
Saving HTML as PDF
To convert HTML to PDF, we can use the third-party library pdfkit. pdfkit is essentially a wrapper around wkhtmltopdf that exposes a Python interface, so before using pdfkit you must install the wkhtmltopdf binary, available from its official website.
The main ways to use pdfkit:
pdfkit.from_url('http://google.com', 'out.pdf')
pdfkit.from_file('test.html', 'out.pdf')
pdfkit.from_string('Hello!', 'out.pdf')
However, if wkhtmltopdf is not on the PATH, you have to tell pdfkit where the executable was installed:
config = pdfkit.configuration(wkhtmltopdf=path + r'\bin\wkhtmltopdf.exe')
# path is the wkhtmltopdf install directory; for me the executable is at
# D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe
# note the raw string: without it, '\b' would be read as an escape sequence
pdfkit.from_url(html_url, output_file, configuration=config)
The HTML page
Open the Runoob page (Go 语言教程 | 菜鸟教程) and take a quick look at the source. It turns out to be easy to handle: all chapter URLs sit in the menu in the left column, which can be located by the id of its enclosing div, and the document name comes from the title tag.
<html>
<head>
······
<title>Go 语言教程 | 菜鸟教程</title>
······
</head>
<body style="" class="vsc-initialized">
······
<!-- content -->
<div class="container main">
<!-- middle column -->
<div class="row">
<div class="col left-column">
······
<div class="sidebar-box gallery-list
<!-- 根据这个div的id进行定位 -->
<div class="design" id="leftcolumn">
<a target="_top" title="Go 语言教程" href="/go/go-tutorial.html"
style="background-color: rgb(150, 185, 125); font-weight: bold; color: rgb(255, 255, 255);">
Go 语言教程 </a>
<a target="_top" title="Go 语言环境安装" href="/go/go-environment.html">
Go 语言环境安装 </a>
······
<a target="_top" title="Go 并发" href="/go/go-concurrent.html">Go 并发 </a> <a target="_top"
title="Go 语言开发工具" href="/go/go-ide.html">
Go 语言开发工具 </a>
</div>
</div>
</div>
······
</div>
</div>
</body>
</html>
Parsing code
res = requests.get(self.url, headers=self.headers)
RunoobSoup = BeautifulSoup(res.text, 'html.parser')
# the <title> is "Go 语言教程 | 菜鸟教程"; keep the part before '|'
title = RunoobSoup.find('title').text
title = title[:title.find('|')]
self.name = title.strip()
self.Dir = self.name + '/'
# every chapter link sits inside <div id="leftcolumn">
UrlItems = RunoobSoup.find('div', id="leftcolumn").find_all('a')
for item in UrlItems:
    try:
        url = rootUrl + item.get('href')
        title = item.text.strip()
        self.UrlList.append({'title': title, 'url': url})
    except TypeError:  # anchor without an href
        continue
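After parsing, self.UrlList holds one dict per chapter, in menu order; with the HTML above the first entries would be:

[{'title': 'Go 语言教程', 'url': 'https://www.runoob.com/go/go-tutorial.html'},
 {'title': 'Go 语言环境安装', 'url': 'https://www.runoob.com/go/go-environment.html'},
 ...]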
Merging the PDFs
PyPDF2 merges the per-chapter PDF files into one document and adds a bookmark for each chapter.
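In isolation, the approach looks like this (a minimal sketch using the same PyPDF2 1.x API as the full script below; chapter1.pdf and chapter2.pdf are placeholder file names):

from PyPDF2 import PdfFileReader, PdfFileWriter

output = PdfFileWriter()
pageOffset = 0
sources = [open(n, 'rb') for n in ['chapter1.pdf', 'chapter2.pdf']]  # placeholders
for fp in sources:
    reader = PdfFileReader(fp)
    for i in range(reader.getNumPages()):  # copy every page across
        output.addPage(reader.getPage(i))
    # bookmark pointing at the first page of this chapter
    output.addBookmark(title=fp.name[:-4], pagenum=pageOffset)
    pageOffset += reader.getNumPages()
with open('merged.pdf', 'wb') as fo:  # write only after all pages are added
    output.write(fo)
for fp in sources:
    fp.close()

The source files must stay open until output.write runs, because PyPDF2 reads page content lazily; that is why the full script collects the handles in fpdfs and closes them only at the end.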
Multithreading
There can be many pages to fetch, and converting them one at a time is slow, so the script runs the conversions in multiple threads, as sketched below.
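Reduced to its core, the pattern used in SavePDFs below looks like this (a minimal sketch; urls is a placeholder list, and each output file is simply numbered):

import threading
import pdfkit

urls = ['https://www.runoob.com/go/go-tutorial.html']  # placeholder list

threads = []
for i, u in enumerate(urls):
    # each thread drives one wkhtmltopdf conversion, so Python mostly waits on I/O
    t = threading.Thread(target=pdfkit.from_url,
                         kwargs={'url': u, 'output_path': str(i) + '.pdf'})
    t.start()
    threads.append(t)
for t in threads:
    t.join()  # block until every conversion has finished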
Full code
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader, PdfFileWriter
import pdfkit
import threading
import requests
import os
class GetRunoobPDF():
    def __init__(self, index):
        self.url = index
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
        }
        self.Threads = []
        self.UrlList = []
        self.TempDir = 'temp_pages/'

    def GenPDF(self):
        print("Parsing the tutorial index\n" + '-' * 20)
        self.ParseIndex()
        print("Saving every chapter\n" + '-' * 20)
        self.SavePDFs()
        print("Merging all chapters\n" + '-' * 20)
        self.MergePDFs()
        print("Deleting temporary files\n" + '-' * 20)
        self.DelTemporary()
        print("finished")

    def DelTemporary(self):
        # remove the per-chapter PDFs, then the temp directory itself
        path = self.Dir + self.TempDir
        for f in os.listdir(path):
            os.remove(path + f)
        os.rmdir(path)

    def ParseIndex(self):
        rootUrl = 'https://www.runoob.com'
        res = requests.get(self.url, headers=self.headers)
        if res.status_code != 200:
            return None
        RunoobSoup = BeautifulSoup(res.text, 'html.parser')
        # the <title> is "Go 语言教程 | 菜鸟教程"; keep the part before '|'
        title = RunoobSoup.find('title').text
        title = title[:title.find('|')]
        self.name = title.strip()
        self.Dir = self.name + '/'
        # every chapter link sits inside <div id="leftcolumn">
        UrlItems = RunoobSoup.find('div', id="leftcolumn").find_all('a')
        for item in UrlItems:
            try:
                url = rootUrl + item.get('href')
                title = item.text.strip()
                self.UrlList.append({'title': title, 'url': url})
            except TypeError:  # anchor without an href
                continue

    def SavePDFs(self):
        config = pdfkit.configuration(
            wkhtmltopdf=r"D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
        )
        # create the temp directory if it does not exist yet
        path = self.Dir + self.TempDir
        if not os.path.exists(path):
            os.makedirs(path)
        # convert each chapter in its own thread to save time
        for page in self.UrlList:
            kwargs = {
                'url': page['url'],
                'output_path': path + page['title'] + '.pdf',
                'configuration': config
            }
            t = threading.Thread(target=pdfkit.from_url, kwargs=kwargs)
            t.start()
            self.Threads.append(t)
        # wait for all threads to finish
        self.WaitThreads()

    def WaitThreads(self):
        for t in self.Threads:
            t.join()  # block until this conversion thread has finished

    def MergePDFs(self):
        path = self.Dir + self.TempDir
        files = os.listdir(path)  # every per-chapter PDF
        # make sure no page failed to save
        if len(files) != len(self.UrlList):
            print("ERROR! Pages are missing!")
            return
        # merge in chapter order, not directory-listing order
        pdf_fileName = [p['title'] + '.pdf' for p in self.UrlList]
        output = PdfFileWriter()
        outputPages = 0
        fpdfs = []
        for each_file in pdf_fileName:
            fpdf = open(path + each_file, "rb")
            # read the source pdf file
            reader = PdfFileReader(fpdf)
            # total number of pages in the source pdf
            pageCount = reader.getNumPages()
            outputPages += pageCount
            # append each page to the output
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))
            # bookmark pointing at the chapter's first page
            output.addBookmark(
                title=each_file[:-4], pagenum=outputPages - pageCount)
            # keep the handle open: PyPDF2 reads page data lazily
            fpdfs.append(fpdf)
        print("All Pages Number: " + str(outputPages))
        # finally, write the merged pdf
        with open(self.Dir + self.name + '.pdf', "wb") as fo:
            output.write(fo)
        for fp in fpdfs:
            fp.close()


if __name__ == "__main__":
    url = 'https://www.runoob.com/go/go-tutorial.html'
    R_PDF = GetRunoobPDF(url)
    R_PDF.GenPDF()