a-yasui · April 26, 2025 12:08
diff --git a/json_to_markdown_converter.py b/json_to_markdown_converter.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # What is this?
 #   Converts posts from a WordPress REST API JSON dump into Markdown files, one article per file, in the `output` directory.
 # how to use?
 #   > mkdir output
 #   > curl -L https://example.com/wordpress_json > test.json
 #   > python3 json_to_markdown_converter.py test.json

 import json
 import os
 import re
 import sys
 import logging
 import urllib.parse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import requests
 from dotenv import load_dotenv

 # Logging configuration: INFO level on the root logger, plus a module logger.
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Article fields that are copied into the generated Markdown output.
 # Any key of an article that is not listed here is ignored.
 ELEMENTS_TO_INCLUDE = [
    'date', 'date_gmt',
    'modified', 'modified_gmt',
    'slug', 'link',
    'title', 'content', 'excerpt',
 ]


 def html_to_text(html_content):
    """Convert HTML content to plain text.

    Args:
        html_content (str): HTML markup. Any falsy value (``None``, ``""``)
            yields an empty string.

    Returns:
        str: The text of the document with ``<script>`` and ``<style>``
        elements removed, one non-empty fragment per line.
    """
    if not html_content:
        return ""

    # Parse the markup with the parser bundled with the standard library.
    document = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style elements so their source is not emitted as text.
    for tag in document.find_all(["script", "style"]):
        tag.extract()

    # Strip every line, split it on double spaces, strip each fragment again
    # and keep only the fragments that still contain something.
    fragments = []
    for raw_line in document.get_text().splitlines():
        for fragment in raw_line.strip().split("  "):
            fragment = fragment.strip()
            if fragment:
                fragments.append(fragment)

    return '\n'.join(fragments)


 def create_markdown_files(json_file_path, output_dir='output'):
    """Create one Markdown file per article found in a JSON file.

    The JSON document may be a single article object or a list of them.
    For each article, the keys listed in ``ELEMENTS_TO_INCLUDE`` are written
    as ``# <key>`` sections, in the order they appear in the article.

    File names are ``YYYY-MM-DD.md`` derived from the article's ``date``.
    When several articles of one run map to the same name, later ones get a
    ``_2``, ``_3``, ... suffix instead of overwriting the earlier file.

    Args:
        json_file_path (str): Path of the JSON file to read.
        output_dir (str): Directory that receives the Markdown files. It is
            created if missing. Defaults to ``'output'``.

    Returns:
        None. Failures to read the input or to write a file are logged and
        do not raise.
    """
    # Load the JSON document; OSError covers file access problems, ValueError
    # covers both invalid JSON and invalid UTF-8.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError) as e:
        logger.error(f"JSONファイルの読み込みに失敗しました: {e}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Normalise a single article object to a one-element list.
    if not isinstance(data, list):
        data = [data]

    # File names already handed out in this run, used to avoid overwriting.
    used_names = set()

    for article in data:
        # Anything but a JSON object has no fields to export.
        if not isinstance(article, dict):
            logger.warning(
                f"記事がオブジェクト形式ではないためスキップします: {type(article).__name__}"
            )
            continue

        now = datetime.now()
        try:
            # fromisoformat() rejects a trailing "Z" on older Python versions,
            # so it is rewritten as an explicit UTC offset first.
            date_obj = datetime.fromisoformat(article['date'].replace('Z', '+00:00'))
            base_name = date_obj.strftime('%Y-%m-%d')
        except KeyError:
            # No date at all: use today's date plus the article id.
            article_id = article.get('id', now.timestamp())
            base_name = f"{now.strftime('%Y-%m-%d')}_{article_id}"
        except (AttributeError, TypeError, ValueError):
            # Date present but not parseable: fall back to the current time.
            base_name = now.strftime('%Y-%m-%d_%H-%M-%S')

        # Keep the plain name when it is free; otherwise append a counter.
        file_name = f"{base_name}.md"
        counter = 1
        while file_name in used_names:
            counter += 1
            file_name = f"{base_name}_{counter}.md"
        used_names.add(file_name)

        file_path = os.path.join(output_dir, file_name)

        sections = []
        for key, value in article.items():
            if key not in ELEMENTS_TO_INCLUDE:
                continue

            # title/content/excerpt may be objects carrying the text under 'rendered'.
            if key in ('title', 'content', 'excerpt') and isinstance(value, dict) and 'rendered' in value:
                value = value['rendered']

            # Slugs are percent-encoded; decode them for readability.
            if key == 'slug' and isinstance(value, str):
                value = urllib.parse.unquote(value)

            # Values that look like they contain HTML tags are reduced to text.
            if isinstance(value, str) and re.search(r'<[^>]+>', value):
                value = html_to_text(value)

            sections.append(f"# {key}\n\n{value}\n\n")

        markdown_content = "".join(sections)

        # A failed write is logged and the remaining articles are still processed.
        # UnicodeError covers text that cannot be encoded as UTF-8.
        try:
            with open(file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(markdown_content)
        except (OSError, UnicodeError) as e:
            logger.error(f"Markdownファイルの作成に失敗しました: {e}")
        else:
            logger.info(f"Markdownファイルを作成しました: {file_path}")

 def main():
    """Command-line entry point.

    Expects the path of the JSON file as the first command-line argument and
    passes it to ``create_markdown_files``. When the argument is missing, a
    usage message is logged and the process exits with status 1.
    """
    if len(sys.argv) < 2:
        # Report the name the script was actually started with, not a fixed one.
        script_name = os.path.basename(sys.argv[0]) or 'json_to_markdown_converter.py'
        logger.error(f"使用方法: python {script_name} <json_file_path>")
        sys.exit(1)

    json_file_path = sys.argv[1]
    create_markdown_files(json_file_path)


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	#
	# What is this?
	# Converts posts from a WordPress REST API JSON dump into Markdown files, one article per file, in the `output` directory.
	# how to use?
	# > mkdir output
	# > curl -L https://example.com/wordpress_json > test.json
	# > python3 json_to_markdown_converter.py test.json

	import json
	import os
	import re
	import sys
	import logging
	import urllib.parse
	from datetime import datetime
	from bs4 import BeautifulSoup
	import requests
	from dotenv import load_dotenv

	# Logging configuration: INFO level on the root logger, plus a module logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Article fields that are copied into the generated Markdown output.
	# Any key of an article that is not listed here is ignored.
	ELEMENTS_TO_INCLUDE = [
	    'date', 'date_gmt',
	    'modified', 'modified_gmt',
	    'slug', 'link',
	    'title', 'content', 'excerpt',
	]


	def html_to_text(html_content):
	    """Convert HTML content to plain text.

	    Args:
	        html_content (str): HTML markup. Any falsy value (``None``, ``""``)
	            yields an empty string.

	    Returns:
	        str: The text of the document with ``<script>`` and ``<style>``
	        elements removed, one non-empty fragment per line.
	    """
	    if not html_content:
	        return ""

	    # Parse the markup with the parser bundled with the standard library.
	    soup = BeautifulSoup(html_content, 'html.parser')

	    # Remove script and style elements so their source is not emitted as text.
	    for script_or_style in soup(["script", "style"]):
	        script_or_style.extract()

	    text = soup.get_text()

	    # Split on a DOUBLE space: splitting on a single space would put every
	    # word of a sentence on its own output line.
	    lines = (line.strip() for line in text.splitlines())
	    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	    return '\n'.join(chunk for chunk in chunks if chunk)


	def create_markdown_files(json_file_path, output_dir='output'):
	    """Create one Markdown file per article found in a JSON file.

	    The JSON document may be a single article object or a list of them.
	    For each article, the keys listed in ``ELEMENTS_TO_INCLUDE`` are written
	    as ``# <key>`` sections, in the order they appear in the article.

	    File names are ``YYYY-MM-DD.md`` derived from the article's ``date``.
	    When several articles of one run map to the same name, later ones get a
	    ``_2``, ``_3``, ... suffix instead of overwriting the earlier file.

	    Args:
	        json_file_path (str): Path of the JSON file to read.
	        output_dir (str): Directory that receives the Markdown files. It is
	            created if missing. Defaults to ``'output'``.

	    Returns:
	        None. Failures to read the input or to write a file are logged and
	        do not raise.
	    """
	    # Load the JSON document; OSError covers file access problems, ValueError
	    # covers both invalid JSON and invalid UTF-8.
	    try:
	        with open(json_file_path, 'r', encoding='utf-8') as file:
	            data = json.load(file)
	    except (OSError, ValueError) as e:
	        logger.error(f"JSONファイルの読み込みに失敗しました: {e}")
	        return

	    os.makedirs(output_dir, exist_ok=True)

	    # Normalise a single article object to a one-element list.
	    if not isinstance(data, list):
	        data = [data]

	    # File names already handed out in this run, used to avoid overwriting.
	    used_names = set()

	    for article in data:
	        # Anything but a JSON object has no fields to export.
	        if not isinstance(article, dict):
	            logger.warning(
	                f"記事がオブジェクト形式ではないためスキップします: {type(article).__name__}"
	            )
	            continue

	        now = datetime.now()
	        try:
	            # fromisoformat() rejects a trailing "Z" on older Python versions,
	            # so it is rewritten as an explicit UTC offset first.
	            date_obj = datetime.fromisoformat(article['date'].replace('Z', '+00:00'))
	            base_name = date_obj.strftime('%Y-%m-%d')
	        except KeyError:
	            # No date at all: use today's date plus the article id.
	            article_id = article.get('id', now.timestamp())
	            base_name = f"{now.strftime('%Y-%m-%d')}_{article_id}"
	        except (AttributeError, TypeError, ValueError):
	            # Date present but not parseable: fall back to the current time.
	            base_name = now.strftime('%Y-%m-%d_%H-%M-%S')

	        # Keep the plain name when it is free; otherwise append a counter.
	        file_name = f"{base_name}.md"
	        counter = 1
	        while file_name in used_names:
	            counter += 1
	            file_name = f"{base_name}_{counter}.md"
	        used_names.add(file_name)

	        file_path = os.path.join(output_dir, file_name)

	        sections = []
	        for key, value in article.items():
	            if key not in ELEMENTS_TO_INCLUDE:
	                continue

	            # title/content/excerpt may be objects carrying the text under 'rendered'.
	            if key in ('title', 'content', 'excerpt') and isinstance(value, dict) and 'rendered' in value:
	                value = value['rendered']

	            # Slugs are percent-encoded; decode them for readability.
	            if key == 'slug' and isinstance(value, str):
	                value = urllib.parse.unquote(value)

	            # Values that look like they contain HTML tags are reduced to text.
	            if isinstance(value, str) and re.search(r'<[^>]+>', value):
	                value = html_to_text(value)

	            sections.append(f"# {key}\n\n{value}\n\n")

	        markdown_content = "".join(sections)

	        # A failed write is logged and the remaining articles are still processed.
	        # UnicodeError covers text that cannot be encoded as UTF-8.
	        try:
	            with open(file_path, 'w', encoding='utf-8') as md_file:
	                md_file.write(markdown_content)
	        except (OSError, UnicodeError) as e:
	            logger.error(f"Markdownファイルの作成に失敗しました: {e}")
	        else:
	            logger.info(f"Markdownファイルを作成しました: {file_path}")

	def main():
	    """Command-line entry point.

	    Expects the path of the JSON file as the first command-line argument and
	    passes it to ``create_markdown_files``. When the argument is missing, a
	    usage message is logged and the process exits with status 1.
	    """
	    if len(sys.argv) < 2:
	        # Report the name the script was actually started with, not a fixed one.
	        script_name = os.path.basename(sys.argv[0]) or 'json_to_markdown_converter.py'
	        logger.error(f"使用方法: python {script_name} <json_file_path>")
	        sys.exit(1)

	    json_file_path = sys.argv[1]
	    create_markdown_files(json_file_path)


	if __name__ == "__main__":
	    main()
No results found