简书Markdown迁移至Ghost

简书一直用着挺好的，唯一让我难受的点就是管理文章的页面没有搜索功能，文章多了就很难找到之前写过的文章。另外，有好长一段时间了，简书的图片一直加载不出来，我起初以为是我的网络开了代理造成的，搜索一番才知道是简书拦截了Firefox的访问，不清楚出于什么原因，但是这坚定了我撤离简书的想法。

简书还是很大度的，在“设置-账号设置”里可以打包下载所有文章，下载结果是纯markdown文本，很容易做迁移。

困难来到Ghost这边，Ghost支持几个国外平台的导入，国内的就不用想了。不过Ghost提供了导入和导出选项，我这里仿造Ghost的导出格式，把简书的文章塞进去，再导回去。

Ghost内容生成

文章末尾提供了一个python脚本，用于生成Ghost的导入文件。

声明：脚本和本文所述内容可能造成未知问题，使用前确保你清楚其中的功能并做好备份，本人不对造成的任何损失负责，转载请注明出处。

首先介绍一下这个脚本的输入输出：

输入
1. 简书导出的rar文件
2. Ghost导出的json文件，用于获取Ghost 中的配置信息
输出
1. json格式的Ghost导入文件，包含文章信息
2. zip格式的Ghost导入文件，包含图片信息，两个文件需要分开单独导入

脚本依赖

脚本通过系统中的7z命令进行压缩和解压，所以运行前请保证你在系统命令行中可以使用7z。
需要使用requests下载简书的图片，使用pip install requests进行安装。

脚本运行

找到main函数，这里有四个参数，改成你自己的，执行就好了。生成的文件放在简书导出的rar文件所在的目录。文章名中的特殊字符在下载时被简书出于通用性考虑换成了“-”，和本脚本无关。

设置参数

去我的Ghost，看看效果吧：http://ray.twig.ink


import os
import json
from pathlib import Path
import datetime
import subprocess


def handle_img(post_info, save_path, featured_first_img):
    """Download the images referenced by a post and point their links at Ghost.

    Args:
        post_info: Post dict. ``post_info['markdown']`` is read and
            ``post_info['feature_image']`` may be set.
        save_path: Directory under which ``content/images/<year>/<month>/``
            is created to hold the downloaded files.
        featured_first_img: When true and the post has no feature image yet,
            the first successfully downloaded image becomes the feature image.

    Returns:
        The markdown text in which the link of every successfully downloaded
        image is rewritten to
        ``__GHOST_URL__/content/images/<year>/<month>/<file name>``.
        Links whose download failed are left pointing at the original URL so
        the post does not end up with broken images.
    """
    md_str = post_info['markdown']
    # Posts without Jianshu-hosted images need no processing.
    if 'https://upload-images' not in md_str:
        return md_str

    import re

    # Matches markdown image links of the form ![alt text](image_url).
    pattern = r'!\[(.*?)\]\((.*?)\)'
    matches = re.findall(pattern, md_str)
    if not matches:
        return md_str

    # Imported lazily so posts without images never need the dependency.
    import requests

    def file_name_of(url):
        """Return the last path segment of ``url`` with any query string removed."""
        return url.split('?')[0].split('/')[-1]

    now = datetime.datetime.now()
    _rel_path = f'/content/images/{now.year}/{now.month}/'
    # Both prefixes end with '/', so file names are appended directly.
    ghost_image_path = f'__GHOST_URL__{_rel_path}'
    image_save_path = f'{save_path}{_rel_path}'
    os.makedirs(image_save_path, exist_ok=True)

    # Original URLs whose file was actually written to disk.
    downloaded = set()
    for _alt, url in matches:
        img_file_name = file_name_of(url)
        if not img_file_name or url in downloaded:
            continue
        print(f'downloading.. {url}')
        try:
            # A timeout keeps one unresponsive host from hanging the whole run.
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print(f'download failed: {url} ({err})')
            continue
        if response.status_code != 200:
            print(f'download failed: {url} (HTTP {response.status_code})')
            continue
        with open(os.path.join(image_save_path, img_file_name), 'wb') as file:
            file.write(response.content)
        downloaded.add(url)

        if featured_first_img and post_info['feature_image'] is None:
            post_info['feature_image'] = f'{ghost_image_path}{img_file_name}'

    def replace_image_url(match):
        """Rewrite one image link, keeping it unchanged if its download failed."""
        original_url = match.group(2)
        if original_url not in downloaded:
            return match.group(0)
        new_url = f'{ghost_image_path}{file_name_of(original_url)}'
        return f'![{match.group(1)}]({new_url})'

    return re.sub(pattern, replace_image_url, md_str)

def md_to_mobiledoc(markdown, mobiledoc_version):
    """Serialize markdown text as a mobiledoc JSON string with one markdown card.

    Args:
        markdown: Markdown source to embed in the card.
        mobiledoc_version: Value stored in the document's ``version`` field.

    Returns:
        The mobiledoc document as a JSON string; non-ASCII characters are
        written as-is rather than escaped.
    """
    markdown_card = ['markdown', {'cardName': 'markdown', 'markdown': markdown}]
    document = {
        'version': mobiledoc_version,
        'markups': [],
        'atoms': [],
        'cards': [markdown_card],
        # Single section entry; presumably a card section pointing at card 0
        # (per the mobiledoc format) -- confirm against the spec.
        'sections': [[10, 0]],
    }
    return json.dumps(document, ensure_ascii=False)

def generate_uuid():
    """Return a newly generated random (version 4) UUID as a string."""
    from uuid import uuid4

    fresh = uuid4()
    return str(fresh)

def generate_id():
    """Return a Ghost-style id: the last 24 hex characters of a random UUID.

    Per the original author's note, the value is not kept by Ghost on import;
    the system generates its own id again.
    """
    hex_digits = ''.join(generate_uuid().split('-'))
    return hex_digits[-24:]

def read_jianshu(zip_path: str):
    """Extract the Jianshu export and load every markdown file it contains.

    The archive is unpacked into a directory next to it, named after the
    archive without its extension. Each ``.md`` file becomes one post whose
    tag is the name of the directory that holds the file and whose title is
    the file name without extension.

    Args:
        zip_path: Path of the archive exported from Jianshu.

    Returns:
        A ``(posts, tags)`` tuple: ``posts`` is a list of post dicts with the
        keys ``id``, ``tag``, ``tag_id``, ``title``, ``markdown`` and
        ``feature_image``; ``tags`` maps each tag name to its generated id.
    """
    archive = Path(zip_path)
    extract_to = os.path.join(archive.parent, archive.stem)
    unzip_file(zip_path, extract_to)

    posts = []
    tags = {}
    for md_file in find_md_files(extract_to):
        md_path = Path(md_file)
        with open(md_file, 'r', encoding='utf-8') as handle:
            tag = md_path.parent.name
            # Every tag gets exactly one id, created on first sight.
            if tag not in tags:
                tags[tag] = generate_id()
            post = {
                'id': generate_id(),
                'tag': tag,
                'tag_id': tags[tag],
                'title': md_path.stem,
                'markdown': handle.read(),
                'feature_image': None,
            }
        posts.append(post)
    return posts, tags

def unzip_file(zip_path, extract_to):
    """Extract an archive into ``extract_to`` with the ``7z`` command line tool.

    The target directory is created when missing, existing files are
    overwritten (``-aoa``), and 7z's standard output is printed.

    Args:
        zip_path: Path of the archive to extract.
        extract_to: Directory that receives the extracted files.

    Raises:
        RuntimeError: If 7z exits with a fatal status (exit code 2 or higher).
    """
    os.makedirs(extract_to, exist_ok=True)
    res = subprocess.run(['7z', 'x', zip_path, f'-o{extract_to}', '-aoa'], capture_output=True, text=True)
    print(res.stdout)
    if res.returncode != 0:
        print(res.stderr)
    # 7-Zip reports non-fatal warnings with exit code 1; 2 and above mean the
    # extraction did not succeed, which would otherwise yield an empty import.
    if res.returncode >= 2:
        raise RuntimeError(f'7z failed to extract {zip_path} (exit code {res.returncode})')

def zip_file(folder_to_compress, compress_to):
    """Add ``folder_to_compress`` to the archive ``compress_to`` using ``7z``.

    7z's standard output is printed.

    Args:
        folder_to_compress: Folder to add to the archive.
        compress_to: Path of the archive to create or update.

    Raises:
        RuntimeError: If 7z exits with a fatal status (exit code 2 or higher).
    """
    res = subprocess.run(['7z', 'a', compress_to, folder_to_compress], capture_output=True, text=True)
    print(res.stdout)
    if res.returncode != 0:
        print(res.stderr)
    # 7-Zip reports non-fatal warnings with exit code 1; 2 and above mean the
    # archive was not written correctly.
    if res.returncode >= 2:
        raise RuntimeError(f'7z failed to create {compress_to} (exit code {res.returncode})')

def find_md_files(directory):
    """Yield the path of every file ending in ``.md`` under ``directory``, recursively."""
    for folder, _subdirs, names in os.walk(directory):
        yield from (
            os.path.join(folder, name)
            for name in names
            if name.endswith('.md')
        )

def build_ghost(post_infos: list[dict], ghost_config: dict, tags) -> dict:
    """Assemble the Ghost import document from the converted posts.

    Args:
        post_infos: Post dicts carrying ``id``, ``title``, ``feature_image``,
            ``mobiledoc``, ``post_status`` and ``tag_id``.
        ghost_config: Parsed Ghost export; its structure is reused as the
            template and its first user becomes the author of every post.
        tags: Mapping of tag name to tag id.

    Returns:
        A copy of ``ghost_config`` whose ``posts``, ``tags``, ``posts_tags``
        and ``posts_authors`` tables are replaced by the generated entries.
        The ``ghost_config`` argument itself is left unmodified.
    """
    import copy
    from datetime import datetime, timezone

    # One UTC timestamp, in the format used below, is shared by all records.
    current_time = datetime.now(timezone.utc)
    formatted_time = current_time.strftime('%Y-%m-%dT%H:%M:%S.000Z')

    author_id = ghost_config['db'][0]['data']['users'][0]['id']

    _model = {
        'posts_authors': [{
            'id': generate_id(),
            "post_id": post['id'],
            "author_id": author_id,
            "sort_order": 0
        } for post in post_infos],
        'posts': [{
            "id": post['id'],
            "uuid": generate_uuid(),
            "title": post['title'],
            "feature_image": post['feature_image'],
            "mobiledoc": post['mobiledoc'],
            "type": 'post',
            "status": post['post_status'],
            "visibility": "public",
            "email_recipient_filter": "all",
            "created_at": formatted_time,
            "updated_at": formatted_time,
            "published_at": formatted_time,
            "show_title_and_feature_image": 1
        } for post in post_infos],
        'posts_tags': [{
            "id": generate_id(),
            "post_id": post['id'],
            "tag_id": post['tag_id'],
            "sort_order": 0
        } for post in post_infos],
        'tags': [{
            'id': tag_id,
            'name': tag,
            "visibility": "public",
            "created_at": formatted_time,
            "updated_at": formatted_time
        } for tag, tag_id in tags.items()],
    }

    # Work on a copy so the caller's parsed export is not altered.
    res = copy.deepcopy(ghost_config)

    res_post = res['db'][0]['data']
    # Ghost import is incremental, so the previously exported posts do not
    # need to be kept in the file.
    res_post['posts'] = _model['posts']
    res_post['tags'] = _model['tags']
    res_post['posts_tags'] = _model['posts_tags']
    # Replace the exported author links as well: the old rows point at posts
    # that are no longer part of the file.
    res_post['posts_authors'] = _model['posts_authors']
    return res

def get_mobiledoc_version(ghost_config):
    """Return the mobiledoc version used by the posts of a Ghost export.

    The exported posts are scanned in order and the ``version`` of the first
    post that carries a parseable mobiledoc document is returned. Posts with
    an empty or missing ``mobiledoc`` field are skipped.

    Args:
        ghost_config: Parsed Ghost export.

    Returns:
        The version found, or ``'0.3.1'`` when no post provides one.
    """
    # NOTE(review): 0.3.1 is the version seen in Ghost's published import
    # examples -- verify against the Ghost release you import into.
    default_version = '0.3.1'

    posts = ghost_config['db'][0]['data'].get('posts') or []
    for post in posts:
        raw = post.get('mobiledoc')
        if not raw:
            continue
        try:
            version = json.loads(raw).get('version')
        except (TypeError, ValueError, AttributeError):
            # Not a JSON object; try the next post.
            continue
        if version:
            return version
    return default_version

def main():
    """Convert a Jianshu export archive into Ghost import files.

    Reads the Jianshu archive and an existing Ghost export, downloads the
    images referenced by the posts, and writes two files next to the archive:
    ``<archive name>.json`` with the posts and ``<archive name>-pictures.zip``
    with the images. Edit the four settings below before running.
    """
    # Path of the archive exported from Jianshu.
    zip_path = '/Users/era/Downloads/user-7914065-1730503948.rar'
    # Ghost export file; data from its posts is needed, so make sure the
    # export contains posts.
    ghost_json_path = '/Users/era/Downloads/tui-ge.ghost.2024-11-02-00-00-48.json'
    # Status of the imported posts: 'draft' or 'published'.
    post_status = 'published'
    # Use the first image of each post as its feature image.
    first_img_as_feature = True

    archive = Path(zip_path)

    post_infos, tags = read_jianshu(zip_path)
    # Read the export explicitly as UTF-8 instead of the platform default.
    with open(ghost_json_path, encoding='utf-8') as file:
        ghost_config = json.load(file)
    # mobiledoc version
    mobiledoc_version = get_mobiledoc_version(ghost_config)

    for info in post_infos:
        # Rewrite the image links in the markdown first, then convert the
        # result to mobiledoc.
        md_str = handle_img(info, archive.parent, first_img_as_feature)
        info['mobiledoc'] = md_to_mobiledoc(md_str, mobiledoc_version)
        info['post_status'] = post_status
    print('download completed.')

    ghost_res = build_ghost(post_infos, ghost_config, tags)

    # Derive the output paths from the archive's name and extension, so an
    # archive that does not end in '.rar' is never overwritten by the output.
    output_json_path = str(archive.with_suffix('.json'))
    output_zip_path = str(archive.with_name(f'{archive.stem}-pictures.zip'))
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(ghost_res, json_file, indent=4, ensure_ascii=False)

    zip_file(f'{archive.parent}/content', output_zip_path)

    print(f"All done! Data saved to {output_json_path},{output_zip_path}")



# Before running:
#   pip install requests
#   make sure the 7z command is available
if __name__ == "__main__":
    main()

参考

json结构 https://ghost.org/docs/migration/custom/
导入图片 https://ghost.org/help/imports/#image-imports
导入内容 https://ghost.org/docs/migration/content/

简书Markdown迁移至Ghost

简书Markdown迁移至Ghost

Ghost内容生成

脚本依赖

脚本运行

参考

推荐阅读更多精彩内容

友情链接更多精彩内容