# Dual-thread version: convert an online C++ tutorial into a PDF ebook
# Converts the pages of a tutorial site into a single PDF, e.g. https://www.runoob.com/python3/python3-tutorial.html
# coding=utf-8
import os, re, time, logging, threading
import pdfkit  # also requires wkhtmltopdf (download from wkhtmltopdf.org); add its executable to the system $PATH and allow it network access
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox  # the driver executable must be in the current directory: geckodriver.exe for Firefox, chromedriver.exe for Chrome
from selenium.webdriver.firefox.options import Options
foptions = Options()
foptions.add_argument('-headless')  # run the browser headless (no UI)
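# If Chrome is preferred instead, a minimal headless setup looks like this
# (sketch; assumes chromedriver.exe is available, per the import note above):
# from selenium.webdriver import Chrome
# from selenium.webdriver.chrome.options import Options as ChromeOptions
# coptions = ChromeOptions()
# coptions.add_argument('--headless')
# browser = Chrome(options=coptions)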
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
# 1_Enter the start (index) URL below
url0 = 'https://www.runoob.com/cplusplus/cpp-tutorial.html'
def parse_url_to_html(url, name):  # fetch the URL, extract the article body, and save it as an HTML file
    try:
        browser = Firefox(options=foptions)  # executable_path='./geckodriver',
        browser.get(url)
        sleep(5)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        # 2_Enter the selector that marks the article body below
        body = soup.find_all('div', class_='article-intro')
        print('Elements on the page matching the selector:', len(body))
        html = str(body[0])
        # rewrite relative img src paths in the body to absolute URLs
        pattern = r'(<img .*?src=")(.*?)(")'
        def func(m):
            if not m.group(2).startswith("http"):  # m.group(2) is the src value (m.group(3) is only the closing quote)
                # 3_Inspect the image address structure here and complete it into a full URL
                rtn = m.group(1) + "https:" + m.group(2) + m.group(3)
                # rtn = m.group(1) + "https://" + url.split('/')[2] + m.group(2) + m.group(3)  # use this line when images share the page's host
                # rtn = "<p></p>"  # if the images cannot be downloaded, use this line to drop them and avoid pdfkit rendering errors
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)
        html = re.compile(pattern).sub(func, html)
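        # For example, '<img src="//static.runoob.com/images/x.png">' becomes
        # '<img src="https://static.runoob.com/images/x.png">' (illustrative
        # path; absolute 'http(s)://...' sources pass through unchanged).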
        # 3.1_Demote the '範例' (Example) headings to plain paragraphs, adding '<br/>' line breaks where needed; when editing these replacement strings, escape double quotes in the text with \, other characters need no escaping
        html = html.replace('<h3>範例</h3>', '<p>範例</p>')
        html = html.replace('<h2 class="tutheader">範例</h2>', '<p>範例</p>')
        html = html.replace('<h2 class="example">範例</h2>', '<p>範例</p>')
        # ss = re.search(r'<h1>C 標準庫 - <span class="color_h1"><.*?></span></h1>', html)  # regex search, used with the next line, to demote the page h1 so that h4 sections become top-level TOC entries
        # if ss: html = html.replace(ss.group(), ss.group().replace('h1', 'h2'))
        html = html.replace('https:https:', 'https:')  # safeguard: collapse an accidentally doubled scheme
        html = html.replace('https:/wp-content', 'https://www.runoob.com/wp-content')  # complete root-relative image paths with the site host
        # 3.2_Fix broken layout inside the highlighted code examples
        pattern_1 = r'(<span class="hl-code">)(.*?)(</span>)'
        def func_1(m_1):
            x = m_1.group(2)
            if '\n' in x:
                x = x.replace(' ', '&nbsp;')  # keep indentation when rendered (assumed intent: a plain space-for-space swap would be a no-op)
                x = x.replace('\n', '<br/>')
                thwz = m_1.group(1) + x + m_1.group(3)
            else:
                thwz = x  # single-line snippet: drop the wrapper span
            return thwz
        html = re.compile(pattern_1, re.DOTALL).sub(func_1, html)
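        # For example, a highlighted span containing '  int a;\n  return 0;'
        # (illustrative) is rewritten with '&nbsp;' for each space and '<br/>'
        # for each newline, preserving the code layout in the rendered PDF.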
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        browser.close()
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception:
        logging.error("Failed to parse %s" % url, exc_info=True)
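# Quick single-page smoke test (hedged usage sketch; writes test.html to the
# current directory using the index page itself):
# parse_url_to_html(url0, 'test.html')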
def get_url_list():  # collect the URLs of all pages listed in the tutorial menu
    browser = Firefox(options=foptions)  # executable_path='./geckodriver',
    browser.get(url0)
    sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    # 4_Enter the selector that marks the menu area below
    menu_tag = soup.find_all('div', class_='design')[0]  # e.g. class_="_2rhmJa" on other sites
    # if the menu does not repeat the index page itself, seed the list with [url0] instead
    urls = []
    # 5_Enter the selector for the individual menu entries below
    for li in menu_tag.find_all('a'):
        href0 = li.get('href')
        if href0[0] == '/':
            url = "https://www.runoob.com" + href0
        else:
            url = url0.rsplit('/', 1)[0] + '/' + href0  # resolve relative links against url0's directory so any runoob tutorial works
        urls.append(url)
    print('Menu URL list:', urls, 'count:', len(urls))
    browser.close()
    return urls
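# If the menu happens to list a page twice, an order-preserving dedupe can be
# applied to the result (sketch; dict.fromkeys keeps first-seen order in Python 3.7+):
# urls = list(dict.fromkeys(urls))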
def save_pdf(htmls, file_name):  # render all HTML files into one PDF
    options = {
        'page-size': 'A4',
        'margin-top': '0.5in',
        'margin-right': '0.5in',
        'margin-bottom': '0.5in',
        'margin-left': '0.7in',
        'encoding': "UTF-8",
        'custom-header': [('Accept-Encoding', 'gzip')],
        'cookie': [('cookie-name1', 'cookie-value1'), ('cookie-name2', 'cookie-value2')],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
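    # pdfkit can also prepend a generated, clickable table of contents (sketch;
    # 'toc-header-text' is a standard wkhtmltopdf TOC option):
    # pdfkit.from_file(htmls, file_name, options=options, toc={'toc-header-text': 'Contents'})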
def dxchxz(urls0, startno):  # per-thread download worker: fetch each URL and save it as a numbered HTML file
    # print(urls0, startno)  # each thread's internal data is independent of the other threads
    for index, url in enumerate(urls0):
        parse_url_to_html(url, str(startno + index) + ".html")
def main():
    start = time.time()
    urls = get_url_list()
    # 6_Enter the output PDF filename below
    file_name = "C++教學runoob_OK.pdf"
    # 7_Set the debug flag: 0 only checks that the page URLs are correct, 1 runs the full conversion
    tsm = 1
    if tsm == 1:
        # remove any numbered HTML files left over from a previous run
        htmls = [x for x in os.listdir('.') if os.path.isfile(x) and re.match(r'\d+\.html', os.path.basename(x))]
        for html in htmls:
            os.remove(html)
        # download with two threads, each taking half of the URL list (see the pool sketch below)
        t1 = threading.Thread(target=dxchxz, args=(urls[:int(len(urls)/2)], 0))
        sleep(1.5)
        t2 = threading.Thread(target=dxchxz, args=(urls[int(len(urls)/2):], int(len(urls)/2)))
        t1.start()
        t2.start()
        t1.join()
        t2.join()
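        # To scale past two threads, the same worker logic can be driven by a
        # pool (sketch, standard library only; replaces the manual slicing above):
        # from concurrent.futures import ThreadPoolExecutor
        # with ThreadPoolExecutor(max_workers=4) as pool:
        #     for i, u in enumerate(urls):
        #         pool.submit(parse_url_to_html, u, str(i) + ".html")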
        # convert to PDF, then delete the intermediate HTML files
        htmls = [x for x in os.listdir('.') if os.path.isfile(x) and re.match(r'\d+\.html', os.path.basename(x))]
        htmls = [str(x) + ".html" for x in range(len(htmls))]  # rebuild the names in page order (0.html, 1.html, ...) so chapters stay sequential
        save_pdf(htmls, file_name)
        for html in htmls:
            os.remove(html)
    total_time = time.time() - start
    print("Total elapsed time: %f seconds" % total_time)
if __name__ == '__main__':
    main()