Scraping the 菜鸟教程 (Runoob) Tutorial Docs

I have recently been learning Go on 菜鸟教程 (runoob.com). Reading in the browser is not very convenient, so I wanted a PDF I could annotate on an iPad. The script below uses Python with multiple threads to fetch the tutorial pages, pdfkit to save each HTML page as a PDF, and PyPDF2 to merge the PDFs and add a bookmark for each chapter.

Saving HTML as PDF

To convert HTML to PDF you can use the third-party library pdfkit. pdfkit is essentially a wrapper around wkhtmltopdf that exposes a Python interface, so before using pdfkit you first need to install the wkhtmltopdf binary (download it from the official website).

Basic usage of pdfkit

pdfkit.from_url('http://google.com', 'out.pdf')   # convert a web page
pdfkit.from_file('test.html', 'out.pdf')          # convert a local HTML file
pdfkit.from_string('Hello!', 'out.pdf')           # convert an HTML string

However, if wkhtmltopdf is not on the PATH, you have to tell pdfkit explicitly where the wkhtmltopdf executable is installed:

config = pdfkit.configuration(wkhtmltopdf=path + r'\bin\wkhtmltopdf.exe')
# path is the wkhtmltopdf install directory; on my machine the executable is at
# D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe
pdfkit.from_url(html_url, output_file, configuration=config)
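
pdfkit also accepts an options dict whose entries are passed through to wkhtmltopdf as command-line flags; a minimal sketch (the two options here are only examples, not something this script depends on):

options = {
    'encoding': 'UTF-8',   # avoid garbled non-ASCII text
    'page-size': 'A4',
}
pdfkit.from_url(html_url, output_file, configuration=config, options=options)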

The HTML page

Open the tutorial's index page (Go 语言教程 | 菜鸟教程) and take a quick look; it turns out to be easy to handle. The URLs of all chapters sit in the left-hand menu, which can be located through the id of its enclosing div, and the document name can be read from the title tag.

<html>

<head>
    ······
    <title>Go 语言教程 | 菜鸟教程</title>
    ······
</head>

<body style="" class="vsc-initialized">
    ······
    <!-- content -->
    <div class="container main">
        <!-- middle column -->
        <div class="row">
            <div class="col left-column">
                ······
                <div class="sidebar-box gallery-list
                    <!-- 根据这个div的id进行定位 -->
                    <div class="design" id="leftcolumn">
                        <a target="_top" title="Go 语言教程" href="/go/go-tutorial.html"
                            style="background-color: rgb(150, 185, 125); font-weight: bold; color: rgb(255, 255, 255);">
                            Go 语言教程 </a>
                        <a target="_top" title="Go 语言环境安装" href="/go/go-environment.html">
                            Go 语言环境安装 </a>
                        ······
                        <a target="_top" title="Go 并发" href="/go/go-concurrent.html">Go 并发 </a> <a target="_top"
                            title="Go 语言开发工具" href="/go/go-ide.html">
                            Go 语言开发工具 </a>

                    </div>
                </div>
            </div>
            ······
        </div>
    </div>

</body>

</html>

Parsing code

res = requests.get(self.url, headers=self.headers)
RunoodSoup = BeautifulSoup(res.text, 'html.parser')
title = RunoodSoup.find('title').text
title = title[:title.find('|')]
self.name = title.strip()
self.Dir = self.name+'/'
UrlItems = RunoodSoup.find('div', id="leftcolumn").find_all('a')
for item in UrlItems:
    try:
        url = rootUrl + item.get('href')
        title = item.text.strip()
        self.UrlList.append({'title': title, 'url': url})
    except:
        continue

Merging the PDFs

PyPDF2 is used to merge the individual PDF files and to add a bookmark (outline entry) for every chapter.
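
In isolation the idea looks roughly like this; a minimal sketch with two hypothetical input files a.pdf and b.pdf (the complete code below does the same for every chapter and also checks that no page is missing):

from PyPDF2 import PdfFileReader, PdfFileWriter

writer = PdfFileWriter()
handles = []                                # source files must stay open until write()
pages_so_far = 0
for name in ['a.pdf', 'b.pdf']:             # hypothetical input files
    f = open(name, 'rb')
    handles.append(f)
    reader = PdfFileReader(f)
    for i in range(reader.getNumPages()):   # copy every page into the writer
        writer.addPage(reader.getPage(i))
    # bookmark pointing at the first page of this file
    writer.addBookmark(title=name[:-4], pagenum=pages_so_far)
    pages_so_far += reader.getNumPages()
with open('merged.pdf', 'wb') as out:
    writer.write(out)
for f in handles:
    f.close()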

Multithreading

Since there may be many pages to fetch, a single thread is slow, so multiple threads are used to speed things up.
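
A minimal sketch of the idea, assuming the chapter URLs have already been collected into a list called urls (the complete code below does the same thing but also passes the wkhtmltopdf configuration to every call):

import threading
import pdfkit

urls = []      # hypothetical list of chapter URLs collected earlier
threads = []
for i, u in enumerate(urls):
    # each thread converts one page; pdfkit.from_url blocks until wkhtmltopdf finishes
    t = threading.Thread(target=pdfkit.from_url, args=(u, str(i) + '.pdf'))
    t.start()
    threads.append(t)
for t in threads:
    t.join()   # wait for every conversion to finish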

Complete code

from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader, PdfFileWriter
import time
import pdfkit
import threading
import requests
import os


class GetRunoodPDF():
    def __init__(self, index):
        self.url = index
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
        }
        self.Threads = []
        self.UrlList = []
        self.TempDir = 'temp_pages/'
    
    def GenPDF(self):
        print("正在解析教程主页\n"+'-'*20)
        self.PraseIndex()
        print("正在保存所有章节\n"+'-'*20)
        self.SavePDFs()
        print("正在合并所有章节\n"+'-'*20)
        self.MergePDFs()
        print("正在删除临时文件\n"+'-'*20)
        self.DelTemporary()
        print("finished")

    def DelTemporary(self):
        path = self.Dir+self.TempDir
        files = os.listdir(path)
        for f in files:
            os.remove(path+f)
        os.rmdir(path)

    def PraseIndex(self):
        rootUrl = 'https://www.runoob.com'
        res = requests.get(self.url, headers=self.headers)
        if res.status_code != 200:
            return None
        RunoodSoup = BeautifulSoup(res.text, 'html.parser')
        title = RunoodSoup.find('title').text
        title = title[:title.find('|')]
        self.name = title.strip()
        self.Dir = self.name+'/'
        UrlItems = RunoodSoup.find('div', id="leftcolumn").find_all('a')
        for item in UrlItems:
            try:
                url = rootUrl + item.get('href')
                title = item.text.strip()
                self.UrlList.append({'title': title, 'url': url})
            except:
                continue

    def SavePDFs(self):
        config = pdfkit.configuration(
            wkhtmltopdf=r"D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
        )
        # create the output directory if it does not exist yet
        path = self.Dir + self.TempDir
        if not os.path.exists(path):
            os.makedirs(path)
        # fetch and convert every chapter page
        for page in self.UrlList:
            kwargs = {
                'url': page['url'],
                'output_path': path + page['title'] + '.pdf',
                'configuration': config
            }
            # save each page in its own thread to speed things up
            t = threading.Thread(target=pdfkit.from_url, kwargs=kwargs)
            t.start()
            self.Threads.append(t)
        # wait for all threads to finish
        self.WaitThreads()

    def WaitThreads(self):
        # block until every saving thread has finished
        for t in self.Threads:
            t.join()

    def MergePDFs(self):
        path = self.Dir+self.TempDir
        files = os.listdir(path)  # list everything in the temp directory
        # check that every chapter page was actually saved
        if len(files) != len(self.UrlList):
            print("ERROR! Some pages are missing!")
            return
        pdf_fileName = []
        for p in self.UrlList:
            filename = p['title']+'.pdf'
            pdf_fileName.append(filename)
        output = PdfFileWriter()
        outputPages = 0
        fpdfs = []
        for each_file in pdf_fileName:
            fpdf = open(path+each_file, "rb")
            # open the source pdf
            input = PdfFileReader(fpdf)
            # number of pages in this chapter
            pageCount = input.getNumPages()
            outputPages += pageCount
            # append every page to the output
            for iPage in range(pageCount):
                output.addPage(input.getPage(iPage))
            # add a bookmark pointing at the first page of this chapter
            output.addBookmark(
                title=each_file[:-4], pagenum=outputPages - pageCount)
            fpdfs.append(fpdf)
        print("All Pages Number: " + str(outputPages))
        # finally write the merged pdf
        with open(self.Dir+self.name+'.pdf', "wb") as fo:
            output.write(fo)
        for fp in fpdfs:
            fp.close()

if __name__ == "__main__":
    url = 'https://www.runoob.com/go/go-tutorial.html'
    R_PDF = GetRunoodPDF(url)
    R_PDF.GenPDF()