Last active
April 26, 2025 12:08
-
-
Save a-yasui/ccbfb077f45a5e56f5d651e4df3f55a9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# What is this?
#   Converts posts exported from the WordPress REST API (saved as a JSON file)
#   into Markdown, writing one file per article to the `output` directory.
#
# How to use:
#   > mkdir output
#   > curl -L https://example.com/wordpress_json > test.json
#   > python3 json_to_markdown_converter.py test.json
| import json | |
| import os | |
| import re | |
| import sys | |
| import logging | |
| import urllib.parse | |
| from datetime import datetime | |
| from bs4 import BeautifulSoup | |
| import requests | |
| from dotenv import load_dotenv | |
# Logger setup: INFO level on the root logger, module-level logger for this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Article fields that are copied into the generated Markdown files.
ELEMENTS_TO_INCLUDE = [
    "date", "date_gmt",
    "modified", "modified_gmt",
    "slug", "link",
    "title", "content", "excerpt",
]
def html_to_text(html_content):
    """Convert an HTML fragment to plain text.

    ``<script>`` and ``<style>`` elements are removed, every line is
    stripped, and blank lines are dropped.

    Args:
        html_content (str): HTML markup. A falsy value (``None`` or ``""``)
            yields an empty string.

    Returns:
        str: The extracted text with one non-empty phrase per line.
    """
    if not html_content:
        return ""

    # Parse with the stdlib-backed parser that BeautifulSoup ships with.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Script and style bodies are not readable text; remove them from the tree.
    for hidden in soup(["script", "style"]):
        hidden.extract()

    text = soup.get_text()

    # Phrases are separated by TWO consecutive spaces. Splitting on a single
    # space would put every word on its own output line. The separator is
    # spelled as `" " * 2` so it cannot be silently collapsed to one space.
    phrase_break = " " * 2
    lines = (line.strip() for line in text.splitlines())
    chunks = (
        phrase.strip()
        for line in lines
        for phrase in line.split(phrase_break)
    )
    return '\n'.join(chunk for chunk in chunks if chunk)
def _article_file_name(article):
    """Return the base Markdown file name for one article dict."""
    if 'date' in article:
        try:
            date_str = article['date']
            # Replace a trailing 'Z' with an explicit offset before parsing.
            date_obj = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            return f"{date_obj.strftime('%Y-%m-%d')}.md"
        except (AttributeError, TypeError, ValueError):
            # Unparseable or non-string date: fall back to the current time.
            return f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.md"
    # No date at all: use today's date plus the article id (or a timestamp).
    article_id = article.get('id', datetime.now().timestamp())
    return f"{datetime.now().strftime('%Y-%m-%d')}_{article_id}.md"


def _unique_file_name(file_name, used_names):
    """Return file_name, or a numbered variant if it was already used in this run."""
    stem, ext = os.path.splitext(file_name)
    candidate = file_name
    counter = 2
    while candidate in used_names:
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    used_names.add(candidate)
    return candidate


def _render_article(article):
    """Render the fields listed in ELEMENTS_TO_INCLUDE as Markdown sections."""
    sections = []
    for key, value in article.items():
        if key not in ELEMENTS_TO_INCLUDE:
            continue
        # title/content/excerpt may be objects carrying the text in 'rendered'.
        if key in ('title', 'content', 'excerpt') and isinstance(value, dict) and 'rendered' in value:
            value = value['rendered']
        # Slugs are percent-encoded; decode them for readability.
        if key == 'slug' and isinstance(value, str):
            value = urllib.parse.unquote(value)
        # Anything that looks like markup is reduced to plain text.
        if isinstance(value, str) and re.search(r'<[^>]+>', value):
            value = html_to_text(value)
        sections.append(f"# {key}\n\n{value}\n\n")
    return "".join(sections)


def create_markdown_files(json_file_path, output_dir='output'):
    """Create one Markdown file per article found in a JSON file.

    The JSON document may be a single article object or a list of them.
    Each file is named after the article's date (``YYYY-MM-DD.md``). When
    several articles in the same run map to the same name, later ones get
    a numeric suffix (``YYYY-MM-DD_2.md``, ...) instead of overwriting the
    earlier file. Entries that are not JSON objects are skipped with a
    warning.

    Args:
        json_file_path (str): Path of the JSON file to read.
        output_dir (str): Directory that receives the Markdown files; it is
            created if missing. Defaults to ``'output'``.

    Returns:
        None. Read and write failures are logged, not raised.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error(f"JSONファイルの読み込みに失敗しました: {e}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Normalise a single article object to a one-element list.
    if not isinstance(data, list):
        data = [data]

    used_names = set()
    for article in data:
        if not isinstance(article, dict):
            logger.warning(
                f"辞書形式ではない記事データをスキップしました: {type(article).__name__}"
            )
            continue

        file_name = _unique_file_name(_article_file_name(article), used_names)
        file_path = os.path.join(output_dir, file_name)
        markdown_content = _render_article(article)

        try:
            with open(file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(markdown_content)
            logger.info(f"Markdownファイルを作成しました: {file_path}")
        except (OSError, UnicodeError) as e:
            logger.error(f"Markdownファイルの作成に失敗しました: {e}")
def main():
    """Command-line entry point.

    Expects the JSON file path as the first command-line argument and
    passes it to ``create_markdown_files``. When the argument is missing,
    logs a usage message and exits with status 1.
    """
    if len(sys.argv) < 2:
        # Show the name the script was actually invoked as; the previous
        # hard-coded "output.py" did not match the documented script name.
        script_name = os.path.basename(sys.argv[0]) or "json_to_markdown_converter.py"
        logger.error(f"使用方法: python {script_name} <json_file_path>")
        sys.exit(1)

    create_markdown_files(sys.argv[1])
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment