百度贴吧帖子备份
source link: https://jingyig.com/tech/baidu_tieba_backup/
Go to the source link to read the article, where you can see the images, any updated content, and better typesetting. If the link is broken, please click the button below to view the snapshot taken at that time.
百度贴吧帖子备份
2020年8月,我为了缓解压力,开始养月季。入坑后,我关注了一位介绍月季品种的博主。他的帖子图文并茂、内容详实且文采斐然。他推荐的品种大多抗病性强,与苗商大力宣传的娇弱品种对比鲜明。因此他所处的舆论环境多不平静,明嘲暗讽时而有之。2020年10月,时任某吧吧务的苗商设局陷害他,意欲删除那些介绍贴。我出于保护资料的目的,对他的绝大部分帖子进行了备份,并将其转换为本地文件。在这一过程中,我发现网上很少有详细说明如何备份贴吧帖子的文章,于是准备自己动手写一篇,以便后人之需。
在此简要概述我所做的工作。我手工整理了需要备份的帖子链接,然后使用A的代码生成html文件,使用B的代码批量下载帖子中的图片。随后,核对图片与html是否一一对应,将html中图片和贴吧表情的url改为本地路径,并清洗数据中的噪音。网友S使用印象笔记保存了该博主的部分帖子。我将他的文件与我保存的文件进行核对,整理出已备份帖子合集。
- 生成html文件
使用hjhee的tiebaSpider代码。由于网络原因,源代码的dependency可能需要手动从官网下载并解压至指定目录。
- 下载帖子图片
使用zhaohui8969的tiebaImageGet代码。原代码默认每次只下载一个链接中的图片。我对其进行了一些修改,以实现单次下载多个链接中的图片。
def main():
    """Batch-download images from the Tieba post URLs listed in a text file.

    Reads one post URL per line from ``txt_path``, extracts the 10-digit
    post id from the tail of each URL, then feeds every id to ``ImageGet``
    (defined in zhaohui8969's tiebaImageGet code), sleeping between posts.
    """
    # Each line looks like: https://tieba.baidu.com/p/6100954692
    txt_path = './backup//urls//202101//urls.txt'
    with open(txt_path, "rb") as file:
        lines = [line.strip() for line in file.readlines()]
    # The post id is the last 10 characters of the URL.
    pids = [int(line[-10:]) for line in lines]
    print(u"\nData has been processed")
    max_thread_num = 20
    save_directory = './backup//202101//img'
    try:
        image_get_obj = ImageGet(max_thread_num, save_directory)
        for pid in pids:
            print(u'\n开始下载')
            image_get_obj(pid)
            # Throttle requests so the server is less likely to block us.
            print(u'\n休眠5秒钟')
            time.sleep(5)
        print(u'\n已下载当前文档链接中的图片。请更换文档名称和IP地址')
    except Exception as exc:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate, and the actual cause is shown instead of hidden.
        print(exc)
        print(u'\n出了一些问题, 你可以自己去main()里的try块改改自己看看bug\n')
- 核对文件完整性
由于前两个步骤使用的代码不会输出错误日志,我需要检查url/html文件/图片三者之间是否一一对应。代码如下。
import codecs
from os import listdir
from os.path import isfile, join
def get_htmlPid(html_folders_path, html_file_name):
    """Extract the 10-digit post id embedded at a fixed offset in an html file.

    The spider output places the post URL after 447 fixed characters plus
    twice the title length; the file name is ``title + ".html"`` (5-char
    suffix), and the pid is the 10 digits that follow.
    """
    stem_length = len(html_file_name) - 5   # strip the ".html" suffix
    pid_start = 447 + 2 * stem_length       # fixed prefix + 2x title length
    pid_end = pid_start + 10                # pid is always 10 digits
    full_path = html_folders_path + "//" + html_file_name
    with open(full_path, 'r', encoding='utf-8') as handle:
        source = handle.read()
    return int(source[pid_start:pid_end])
def get_imgPid(img_folders_path):
    """Return every entry name of *img_folders_path* converted to int.

    Each image sub-folder is named after the 10-digit post id it belongs to,
    so the directory listing doubles as the list of downloaded post ids.
    """
    return [int(folder_name) for folder_name in listdir(img_folders_path)]
def get_urlPid(url_path):
    """Read post URLs (one per line) and return their trailing 10-digit ids."""
    with open(url_path, "r") as url_file:
        stripped = [line.strip() for line in url_file.readlines()]
    # Every Tieba post URL ends with its 10-digit post id.
    return [int(line[-10:]) for line in stripped]
def check_integrity(url_pid, html_pid, img_pid):
    """Report which url pids lack a matching html file and/or image folder.

    Parameters are (possibly duplicated) lists of post ids gathered from the
    urls.txt file, the html file contents and the image folder names.
    Returns ``(missing_html, missing_img)``: pids present in the url list
    but absent from the html list / image list respectively.
    """
    # De-duplicate; sets also give O(1) membership tests instead of the
    # original O(n) scans over lists (O(n^2) overall).
    unique_url_pid = set(url_pid)
    html_set = set(html_pid)
    img_set = set(img_pid)
    missing_html = [pid for pid in unique_url_pid if pid not in html_set]
    missing_img = [pid for pid in unique_url_pid if pid not in img_set]
    return missing_html, missing_img
def main():
    """Check url/html/image integrity for every folder of one user's backup.

    For each sub-folder, post ids are collected from three sources
    (urls.txt, the html file contents via ``get_htmlPid``, and the img
    sub-folder names via ``get_imgPid``) and cross-checked with
    ``check_integrity``; missing posts are written out as re-downloadable
    Tieba URLs under ``store_path``.
    """
    usr_name = "relu"
    base_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//" + usr_name
    store_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//z-missing-files"
    folders = listdir(base_path)
    # Accumulate missing pids over all folders.
    all_missing_html_pid = []
    all_missing_img_pid = []
    for folder in folders:
        # Initialize per-folder paths.
        html_path = base_path + "//" + folder
        img_path = html_path + "//img"
        url_path = html_path + "//urls.txt"
        # Collect pids from the html files of THIS folder only.  (The
        # original accumulated html pids across folders, so a missing html
        # could be masked by a same-pid html in an earlier folder.)
        html_pid = []
        for file_name in listdir(html_path):
            if file_name.endswith(".html"):
                html_pid.append(get_htmlPid(html_path, file_name))
        img_pid = get_imgPid(img_path)
        url_pid = get_urlPid(url_path)
        missing_html_pid, missing_img_pid = check_integrity(url_pid, html_pid, img_pid)
        all_missing_html_pid.extend(missing_html_pid)
        all_missing_img_pid.extend(missing_img_pid)
    store_html_path = store_path + "//" + usr_name + "-missing-html.txt"
    store_img_path = store_path + "//" + usr_name + "-missing-img.txt"
    # Write one re-downloadable URL per missing pid.
    with open(store_html_path, "w", encoding="utf-8") as store_html:
        for pid in all_missing_html_pid:
            store_html.write("%s\n" % ("https://tieba.baidu.com/p/" + str(pid)))
    with open(store_img_path, "w", encoding="utf-8") as store_img:
        for pid in all_missing_img_pid:
            store_img.write("%s\n" % ("https://tieba.baidu.com/p/" + str(pid)))
    print("\n Data integrity of %s has been checked." % usr_name)


if __name__ == "__main__":
    main()
- 修改图片路径
Html文件中的图片url指向百度图床,需要将其修改为本地路径。
from bs4 import BeautifulSoup
from os.path import basename, splitext
from os import listdir
import re
def modify_src(folder_path, file_name):
    """Rewrite image ``src`` attributes in one html file to local paths.

    Post images (``.jpg``) are pointed at ``./img/<pid>/<image_name>`` and
    Tieba emoticons (``.png``) at ``../tieba_emoticon/<emoticon_name>``,
    then the file is written back in place.
    """
    file_path = folder_path + '//' + file_name
    # Context manager closes the handle (the original passed an open()
    # result straight to BeautifulSoup and leaked it).
    with open(file_path, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # The post's own URL is the first in-page link matching the pattern;
    # its last 10 characters are the post id.
    url = [elm.get_text() for elm in soup.find_all("a", href=re.compile(r"^https://tieba.baidu.com/p/"))]
    pid = url[0][-10:]
    # unmodified src: https://imgsa.baidu.com/forum/w%3D580/sign=.../4417d558...32fa4035.jpg
    # modified:       ./img/6233150605/09d6a94b...f9ed0488.jpg
    # pattern: ./img/pid/img_name, where img_name is the last 44 chars of the src
    # unmodified emoticon src: https://gsp0.baidu.com/.../tb/editor/images/client/image_emoticon72.png
    # modified:                ../tieba_emoticon/image_emoticon72.png
    for img in soup.findAll('img', {"src": True}):
        if img["src"].endswith(".jpg"):
            img['src'] = './img/' + pid + '/' + img['src'][-44:]
        elif img['src'].endswith('.png'):
            # elif: a freshly rewritten .jpg src can never end in .png, so
            # this branch only ever fires for emoticon images.
            img['src'] = '../tieba_emoticon/' + img['src'].split('/')[-1]
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(soup))
def main():
    """Run ``modify_src`` over every html file in every downloaded-post folder."""
    base_path = './rose_tieba_data_processing//data//tiezi_downloaded'
    for folder_name in listdir(base_path):
        # The shared emoticon folder contains no posts; skip it.
        if folder_name == 'tieba_emoticon':
            continue
        print('Processing files in %s' % folder_name)
        folder_path = base_path + '//' + folder_name
        # Only html files in this folder are rewritten.
        html_names = [entry for entry in listdir(folder_path) if entry.endswith('.html')]
        for html_name in html_names:
            modify_src(folder_path, html_name)
            print('%s has been processed' % html_name)


if __name__ == "__main__":
    main()
Html文件中的标题包含“【图片】”“XX吧”内容,需要将其清除。
def modify_title(folder_path, file_name):
    """Strip Tieba boilerplate from the ``<title>`` and ``<h1>`` of one html file.

    Removes the "【图片】" prefix and the "【...吧】_百度贴吧" suffixes that
    Tieba appends to page titles, then writes the file back in place.
    """
    file_path = folder_path + '//' + file_name
    # Context manager closes the handle (the original leaked an open file).
    with open(file_path, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # Every boilerplate fragment to delete, applied to title and h1 alike.
    boilerplate = ('【图片】', '【月季花吧】_百度贴吧', '【天狼月季吧】_百度贴吧')
    new_title = str(soup.find('title').string)
    print(new_title)
    for marker in boilerplate:
        new_title = new_title.replace(marker, '')
    soup.title.string = new_title
    new_h1 = str(soup.find('h1').string)
    for marker in boilerplate:
        new_h1 = new_h1.replace(marker, '')
    soup.h1.string = new_h1
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(soup))
另外,帖子中“希望各位吧友能支持魔吧月刊。”也需要清除:
def remove_noise(folder_path, file_name):
    """Delete nickname-emoji images and a known spam block from one html file."""
    file_path = folder_path + '//' + file_name
    # Context manager closes the handle (the original leaked an open file).
    with open(file_path, encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    # Nickname emoji have no local copy; drop the tags entirely.
    for div in soup.find_all("img", {'class': 'nicknameEmoji'}):
        div.decompose()
    # Exact serialized html of the recurring advertisement block, removed
    # verbatim from the rendered output.
    noise = '<div>\n<div>\n<div> #3: <b></b></div>\n<div>希望各位吧友能支持魔吧月刊。</div>\n</div>\n<hr/>\n</div>'
    cleaned = str(soup).replace(noise, '')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(cleaned)
我采用核对文件标题的方式寻找我和S的备份文件之间的差异。由于印象笔记生成的文件名十分混乱,我使用了正则表达式对其进行清洗。
import os
from os import listdir
from os.path import isfile, join
import re
# ---- collect spider data -------------------------------------------------
spider_path = "./tieba-download//html-only"
spider_original_names = []
for spider_folder in listdir(spider_path):
    spider_original_names.extend(listdir(spider_path + "//" + spider_folder))
# Strip the Tieba suffix, then de-duplicate via a set comprehension.
spider_names = list({name.replace("【月季花吧】_百度贴吧", "") for name in spider_original_names})
# ---- collect evernote data ----------------------------------------------
evernote_path = "G://ddd-data-evernote"
evernote_original_names = [f for f in os.listdir(evernote_path) if f.endswith(".html")]
# Evernote mangles file names in several distinct ways; one regex
# alternative per observed variant (longest alternatives first).
pattern_string = r"【月季花吧】_\w{1,4}\s\[\d{1}\]|【月季花吧】_\w{1,4}|_\w{4}_\w{1,4}\s\[\d{1}\]|_\w{4}_\w{0,4}|【月季花吧】"
pattern = re.compile(pattern_string)
# Remove the mangled suffixes, then de-duplicate.
evernote_names = list({pattern.sub("", name) for name in evernote_original_names})
# ---- diff the two collections -------------------------------------------
# Hoist set construction out of the loops: O(1) membership instead of the
# original O(n) list scans (O(n^2) overall).
spider_set = set(spider_names)
evernote_set = set(evernote_names)
evernote_minus_spider = [name for name in evernote_names if name not in spider_set]
spider_minus_evernote = [name for name in spider_names if name not in evernote_set]
# set basic paths
evernote_store_path = "./evernote_minus_spider.txt"
spider_store_path = "./spider_minus_evernote.txt"
# store data which is in evernote but not in spider
with open(evernote_store_path, "w", encoding='utf-8') as evernote_save:
    for evernote_save_item in evernote_minus_spider:
        evernote_save.write("%s\n" % evernote_save_item)
# store data which is not in evernote but in spider
with open(spider_store_path, "w", encoding='utf-8') as spider_save:
    for spider_save_item in spider_minus_evernote:
        spider_save.write("%s\n" % spider_save_item)
print("Missing files in evernote and spider have been checked.")
我按帖子的发表日期对其排序,生成了一份目录。
import pickle

# Load posts already sorted by publication date.  NOTE: pickle is only safe
# here because the file is our own intermediate data, never untrusted input.
# Data structure per entry: [year, month, day, title, category, path]
# e.g. [2018, 10, 14, '巴黎七月的粉龙沙', '品种介绍-梅昂 (Meilland)',
#       './品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html']
with open("ordered_temp_data.p", "rb") as data_file:  # was pickle.load(open(...)): leaked the handle
    all_temp_data = pickle.load(data_file)

# Build one index line per post, e.g.:
# <p> 10月14日 <a href="./品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html">巴黎七月的粉龙沙</a></p>
hrefs = []
for entry in all_temp_data:
    href = '<p> ' + str(entry[1]) + '月' + str(entry[2]) + '日 ' + '<a href="' + entry[5] + '">' + entry[3] + '</a></p>'
    hrefs.append(href)

save_path = 'G://rose_tieba_data_processing//codes//href-three.txt'
with open(save_path, "w", encoding="utf-8") as store_hrefs:
    for href in hrefs:
        store_hrefs.write("%s\n" % href)
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK