# 需求三:备份贴吧内的某个主题帖子,正常一页的直接右键另存为mhtml就完事了,如果多页的可以用下面的方法。
# coding = utf-8
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    On any request failure (connection error, timeout, HTTP 4xx/5xx) the
    sentinel string "产生异常" ("an exception occurred") is returned instead,
    preserving the original best-effort contract.
    """
    try:
        hv = {'user-agent': 'Mozilla/5.0'}  # minimal UA so Tieba serves the page
        r = requests.get(url, timeout=30, headers=hv)
        r.raise_for_status()  # turn HTTP error statuses into exceptions
        r.encoding = 'utf-8'  # Tieba pages are UTF-8; override detected charset
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; callers still receive the sentinel.
        return "产生异常"
# Drive a local Chrome to snapshot every page of a Tieba thread as .mhtml.
# Raw string avoids invalid escape sequences (\P, \G, \A) in the Windows path.
browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
# Thread to download — Tieba thread links all follow this format.
mainUrl = "https://tieba.baidu.com/p/6023446113"
browser.get(mainUrl)

html = getHTMLText(mainUrl)
soup = BeautifulSoup(html, "html.parser")

# The thread title becomes the file name; strip characters Windows forbids.
title = soup.find('h3').string
for fu in r'\/:*?"<>|':
    title = title.replace(fu, '')
print(mainUrl)

# The page-count badge is a <span class="red"> with no style attribute;
# absent on single-page threads (which the header says to save by hand).
tf = soup.find('span', 'red', style=None)
if tf is not None:
    i = int(tf.string)
    print("共" + str(i) + "页")
    # Walk pages from last to first, snapshotting each one.
    while i >= 1:
        url = mainUrl + '?pn=' + str(i)
        browser.get(url)
        # Capture the rendered page as MHTML via the DevTools protocol.
        res = browser.execute_cdp_cmd('Page.captureSnapshot', {})
        filename = r'download/' + title + '_第' + str(i) + '页' + '.mhtml'
        # Explicit utf-8 avoids UnicodeEncodeError under Windows' default codepage.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            f.write(res['data'])
        print("已备份第" + str(i) + "页")
        i = i - 1
browser.quit()