
Syncing WordPress Posts to a WeChat Official Account

source link: https://www.biaodianfu.com/wordpress2weixin.html



钱魏Way · 2023-12-19 · 0 views

I registered a WeChat Official Account quite a while ago, but after a few posts I stopped writing. The main reason is that the official editor is poorly suited to technical articles, for example adding code snippets or LaTeX math formulas.


As for LaTeX math support, I have seen it requested on the feedback forum for five or six years now, and it has still not been implemented. Sigh.

For a while I synced a few WordPress posts over by copy-and-paste, but every time the formatting came out mangled and fixing it took a lot of effort. Recently, with some spare weekend time, I looked into the Official Account API and found that WordPress posts can be processed and pushed over programmatically. The processing flow is:

  • Fetch the post from WordPress with Python
  • Rework the post's markup into a format the Official Account editor accepts:
    • Strip useless tag attributes
    • Fix the rendering of <ul>/<ol> list tags
    • Fix table rendering
    • Fix H1~H6 heading sizes
    • Collect the links in the post, append them at the end of the article, and remove the hyperlinks from the body
  • Download the images referenced in the post, upload them to the WeChat media library, and update the image URLs in the article.

Not handled yet: converting LaTeX formulas into images. I first tried embedding the rendered formulas as base64 image data, but testing showed that WeChat Official Accounts do not support that. The workaround would be to upload each rendered formula image to the media library; that would work in principle, but a single article can involve far too many formulas, so it did not seem worthwhile and I have left it out for now.
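
Roughly, that workaround would look like the sketch below. This is only a sketch of the idea, not code from the original script: the helper names (latex_to_media_url, replace_formulas) are made up, it renders formulas with matplotlib's mathtext (which covers only a subset of LaTeX), it reuses the same add_material endpoint and the _get_token() helper from the script further down, and it creates one permanent media item per formula, which is exactly why it is impractical for formula-heavy posts.

import re
import requests
from io import BytesIO
import matplotlib.pyplot as plt

def latex_to_media_url(latex_str):
    # Render the formula to an in-memory PNG (matplotlib mathtext, no TeX install needed)
    fig = plt.figure(figsize=(2, 0.5))
    fig.text(0, 0, f'${latex_str}$', fontsize=12)
    buffer = BytesIO()
    plt.savefig(buffer, format='png', bbox_inches='tight', pad_inches=0.05)
    plt.close(fig)
    buffer.seek(0)
    # Upload the rendered formula as permanent material and return its hosted URL
    upload_url = ("https://api.weixin.qq.com/cgi-bin/material/add_material"
                  "?access_token={}&type=image".format(_get_token()))
    files = {'media': ('formula.png', buffer, 'image/png')}
    return requests.post(upload_url, files=files).json().get("url")

def replace_formulas(html):
    # Swap every $...$ span for an <img> that points at the uploaded rendering
    return re.sub(r'\$(.+?)\$',
                  lambda m: '<img src="{}" />'.format(latex_to_media_url(m.group(1))),
                  html)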

The full code is below (it is fairly ugly, but it works):

import requests
import re
from io import BytesIO
from PIL import Image
import urllib.parse
import os
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse
import base64
import matplotlib.pyplot as plt

wp_headers = {
    'User-Agent': 'WordPress2Weixin'
}


# Fetch the WeChat API access token
def _get_token():
    app_id = ''      # Official Account AppID
    app_secret = ''  # Official Account AppSecret
    token_url = "https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid={}&secret={}".format(app_id,
                                                                                                                 app_secret)
    r = requests.get(token_url)
    print(r.json())
    return r.json()["access_token"]


# Upload an image to the WeChat media library (permanent material)
def _upload_image(image_url):
    """
    :param image_url: source image URL
    :return: dict like {"media_id": "...", "url": "..."}
    """
    upload_url = "https://api.weixin.qq.com/cgi-bin/material/add_material?access_token={}&type=image".format(
        _get_token())
    r_img = requests.get(image_url, stream=True, headers=wp_headers)
    image_file = BytesIO(r_img.content)
    image_file.seek(0)
    image = Image.open(image_file)
    image_type = image.format.lower()
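    # The WeChat media endpoint does not accept WebP, so convert WebP images to PNG first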
    if image_type == 'webp':
        image = image.convert("RGBA")
        image_file = BytesIO()
        image.save(image_file, format='PNG')
        image_type = 'png'
    image_file.seek(0)
    filename = os.path.splitext(os.path.basename(urllib.parse.urlparse(image_url).path))[0] + '.' + image_type
    mime_type = 'image/' + image_type if image_type else 'application/octet-stream'
    files = {'media': (filename, image_file, mime_type)}
    # Let requests build the multipart/form-data Content-Type (with boundary) itself;
    # setting the header manually here breaks the upload.
    r_url = requests.post(upload_url, files=files)
    return r_url.json()


# Fetch the post's featured image from WordPress and upload it as the cover
def _get_featured_media(media_id, parsed_url):
    media_url = "{}://{}/wp-json/wp/v2/media/{}".format(parsed_url.scheme,parsed_url.netloc,media_id)
    r = requests.get(media_url, headers=wp_headers)
    return _upload_image(r.json()["guid"]["rendered"]).get("media_id")


# Re-host the article's images: upload each one to WeChat and swap in the returned URLs
def _replace_image_urls(content):
    pattern = re.compile('src="(.*?)"')
    image_urls = re.findall(pattern, content)
    print(image_urls)
    new_image_links = []
    for link in image_urls:
        new_link = _upload_image(link).get("url")
        new_image_links.append(new_link)
    for i in range(len(image_urls)):
        content = content.replace(image_urls[i], new_image_links[i])
    return content


# Rework the post's markup into something the WeChat editor renders correctly
def _fix_tags(content):
    # Strip the table of contents generated by the ez-toc plugin
    pattern = re.compile(r'(<div id="ez-toc-container"[\S\s]*?</div>\n)')
    content = re.sub(pattern, '', content)

    soup = BeautifulSoup(content, 'html.parser')
    # Strip tag attributes that are useless in the WeChat editor
    for tag in soup.find_all(True):
        for attr in ['class', 'id', 'style', 'data-enlighter-language', 'decoding', 'loading', 'alt']:
            del tag[attr]
    # Remove span tags; note that extract() drops the span together with its contents
    # (use unwrap() instead if the text inside spans should be kept)
    for span in soup.find_all('span'):
        span.extract()
    # Rebuild <ul>/<ol> lists, keeping only their direct <li> children, so WeChat renders them correctly
    list_tags = soup.find_all(['ul', 'ol'])
    for original_list_tag in list_tags:
        new_list_tag = soup.new_tag(original_list_tag.name)
        for li_tag in original_list_tag.find_all('li', recursive=False):
            new_list_tag.append(li_tag)
        original_list_tag.replace_with(new_list_tag)
    # Fix table rendering: force full width with horizontal scrolling
    for table_tag in soup.find_all('table'):
        del table_tag['width']
        table_tag.attrs['style'] = "width: 100%; overflow-x: auto; display: block;"
    # Normalize H1~H6 heading sizes via inline font-size
    font_sizes = {
        'h1': '2.0em',
        'h2': '1.8em',
        'h3': '1.6em',
        'h4': '1.4em',
        'h5': '1.2em',
        'h6': '1.0em',
    }
    for tag_name, font_size in font_sizes.items():
        for tag in soup.find_all(tag_name):
            tag['style'] = f'font-size: {font_size};'
    # Collect the article's links and append them as a numbered list at the end
    links = soup.find_all('a')
    links_div = soup.new_tag('div')
    p_tag = soup.new_tag('p')
    p_tag.string = "可以点击阅读原文查看正文相关链接"  # "Tap 'Read original' to view the links referenced in this article"
    links_div.append(p_tag)
    counter = 1
    for link in links:
        href = link.get("href")
        if href and not href.startswith("#") and not re.match(r'https?://', link.text):
            link_str = f'{counter}. {link.text} {href}'
            p_tag = soup.new_tag('p')
            p_tag.string = link_str
            links_div.append(p_tag)
            counter += 1
    soup.append(links_div)
    # Strip hyperlinks from the body (drop the href, keep the anchor text)
    for a_tag in soup.find_all('a'):
        del a_tag['href']

    return str(soup)


# Helper to render a LaTeX formula as an image (kept commented out: base64-embedded images are not accepted by WeChat)
# def _latex_to_image(latex_str):
#     print(latex_str)
#     fig = plt.figure(figsize=(2, 0.5))
#     fig.text(0, 0, f'${latex_str}$', fontsize=12, color='black')
#     buffer = BytesIO()
#     plt.savefig(buffer, format='png', bbox_inches='tight', pad_inches=0.0)
#     assert isinstance(fig, object)
#     plt.close(fig)
#     buffer.seek(0)
#     return buffer.read()
#
#
# # Helper to replace the LaTeX formulas in the text with inline images
# def _replace_latex_with_images(text):
#     inline_pattern = re.compile(r'\$(.*?)\$')
#     display_pattern = re.compile(r'\$\$(.*?)\$\$')
#
#     def replace_func(match):
#         latex_str = match.group(1)
#         try:
#             img_data = _latex_to_image(latex_str)
#             encoded_img_data = base64.b64encode(img_data).decode()
#             return f'<img src="data:image/png;base64,{encoded_img_data}" />'
#         except:
#             return latex_str
#
#     # Replace display-mode LaTeX formulas
#     text = display_pattern.sub(replace_func, text)
#     # Replace inline LaTeX formulas
#     text = inline_pattern.sub(replace_func, text)
#
#     return text


# Fetch a WordPress post and submit it to the Official Account as a draft
def add_draft(post_url):
    r = requests.get(post_url, headers=wp_headers)
    # The post ID is scraped from the rendered page; the pattern below matches this blog's theme markup
    pattern = re.compile(r'<div class="container site-content"><div id="post-(\d+)">')
    post_id = re.findall(pattern, r.text)[0]
    parsed_url = urlparse(post_url)
    post_url_json = "{}://{}/wp-json/wp/v2/posts/{}".format(parsed_url.scheme, parsed_url.netloc, post_id)
    r_json = requests.get(post_url_json, headers=wp_headers)
    post_raw = r_json.json()
    post_data = {
        "title": post_raw['title']['rendered'],
        "author": "biaodianfu",
        "content": _replace_image_urls(_fix_tags(post_raw['content']['rendered'])),
        "content_source_url": post_url,
        "thumb_media_id": _get_featured_media(post_raw['featured_media'], parsed_url),
        "need_open_comment": 0,
        "only_fans_can_comment": 0
    }
    url = "https://api.weixin.qq.com/cgi-bin/draft/add?access_token={}".format(_get_token())
    headers = {'Content-Type': 'application/json'}
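    # Serialize with ensure_ascii=False so Chinese text is sent as literal UTF-8 rather than \u escapes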
    data = json.dumps({"articles": [post_data]}, ensure_ascii=False).encode('utf-8')
    print(data)
    response = requests.post(url, data=data, headers=headers)
    print(response.content)
    if response and "media_id" in response.json():
        print("Draft created successfully.")
    else:
        print(f"Failed to create draft. Error response: {response}")


if __name__ == "__main__":
    post_url = ""  # URL of the WordPress post to sync
    add_draft(post_url)
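
One practical note on the script above: _get_token() is called for every image upload and again when the draft is created, and each call fetches a brand-new access_token. WeChat access tokens remain valid for roughly two hours and token requests are quota-limited, so for image-heavy posts a small cache helps. A minimal sketch follows (the _get_cached_token helper is hypothetical, not part of the original script); every call site that currently uses _get_token() would call it instead:

import time

_token_cache = {"value": None, "expires_at": 0.0}

def _get_cached_token():
    # Reuse the cached token until shortly before its (roughly 7200-second) lifetime ends
    if time.time() >= _token_cache["expires_at"]:
        _token_cache["value"] = _get_token()
        _token_cache["expires_at"] = time.time() + 7000
    return _token_cache["value"]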
