Last active
July 31, 2025 19:30
-
-
Save precondition/4a65e04a3e02283aaa044339038f3c86 to your computer and use it in GitHub Desktop.
Python script to convert pitch-accent color-coded Japanese HTML (from https://kotu.io/media/reader/) into LaTeX for insertion into `color_coded_ja_text_template.tex`. Compile with XeLaTeX.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Template for pitch-accent colour-coded Japanese text. Compile with XeLaTeX.
%
% The document body is deliberately left open: paste the output of
% kotu_analyzed_text_extract.py at the marker at the bottom of this file.
% That output ends with \end{document}, which closes the document.
\documentclass[11pt]{extarticle}
\usepackage{parskip}
\usepackage[japanese]{babel}
\usepackage[a4paper,width=160mm,top=20mm, bottom=20mm]{geometry}
\usepackage{fancyhdr}
\usepackage{xeCJK}
\usepackage{ruby}
\setCJKmainfont{Noto Serif CJK JP}
%\setCJKmainfont{IPAMincho}
\setCJKsansfont{IPAGothic}
\setCJKmonofont{IPAGothic}
\usepackage{xcolor}
\usepackage{color, soul}
\usepackage{ulem}
\usepackage{setspace}
\usepackage{xeCJKfntef}

% \globalcolor{<name>}: switch to colour <name> and make it the default text
% colour. Only referenced by the commented-out \AtBeginDocument line below.
\makeatletter
\newcommand{\globalcolor}[1]{%
  \color{#1}\global\let\default@color\current@color
}
\makeatother

\setstretch{1.5} % 1.5 line spacing
%\AtBeginDocument{\globalcolor{blue}}

% One colour per pitch-accent pattern.
\definecolor{heiban_word_color}{HTML}{59b2ff}    % blue
\definecolor{atamadaka_word_color}{HTML}{ff6666} % red
\definecolor{nakadaka_word_color}{HTML}{ff9b54}  % orange
\definecolor{odaka_word_color}{HTML}{67e47d}     % green
\definecolor{kihuku_word_color}{HTML}{563298}    % purple

% \pitchuline{<colour>}{<text>}: underline <text> with a rule drawn in
% <colour>, built on ulem's \markoverwith / \ULon. The rule height follows
% \ULthickness, which is set after \begin{document}.
\newcommand\pitchuline[1]{\bgroup\markoverwith
  {\textcolor{#1}{\rule[-0.5ex]{1.4pt}{\ULthickness}}}\ULon}

% One underline macro per pitch-accent pattern. These are the macro names
% emitted by kotu_analyzed_text_extract.py, so they must not be renamed.
\newcommand\heibanuline{\pitchuline{heiban_word_color}}
\newcommand\atamadakauline{\pitchuline{atamadaka_word_color}}
\newcommand\nakadakauline{\pitchuline{nakadaka_word_color}}
\newcommand\odakauline{\pitchuline{odaka_word_color}}
\newcommand\kihukuuline{\pitchuline{kihuku_word_color}}

\begin{document}
\renewcommand{\ULthickness}{2.0pt}
\renewcommand{\rubysize}{0.7} % default: 0.4
\renewcommand\rubysep{-0.7em}

\title{\textbf{タイトル}}
\author{Mako Tanaka\\\small 田中・真子}
\maketitle

% Use the fancyhdr page style on this (title) page, then configure it.
\thispagestyle{fancy}
\fancyhead{} % clear all header fields
\fancyhead[LO]{音読原稿}
\fancyfoot{} % clear all footer fields

% Colour legend: one sample word per pitch-accent pattern.
\begin{center}
色の凡例: \heibanuline{平板}\ \kihukuuline{起伏}\ \odakauline{尾高}\ \nakadakauline{中高}\ \atamadakauline{頭高}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% INSERT OUTPUT OF kotu_analyzed_text_extract.py HERE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| from bs4 import BeautifulSoup | |
# Read the reader HTML from the file named by the first command-line argument.
# The encoding is pinned to UTF-8 so the Japanese text is decoded the same way
# on every platform instead of depending on the locale's default encoding.
with open(sys.argv[1], "r", encoding="utf-8") as f:
    html_content = f.read()
def clean_ruby_tags(component) -> str:
    """Return the base text of *component* with ruby annotations removed.

    Every ``<rt>`` (reading) and ``<rp>`` (fallback parenthesis) element found
    inside a ``<ruby>`` element is removed, so only the annotated base text
    remains. Components without any ``<ruby>`` element are returned as-is.

    Note that the annotations are removed from the parse tree itself, i.e.
    *component* is modified in place.

    Args:
        component: A parsed element exposing ``find_all`` and ``get_text``
            (a BeautifulSoup tag in this script).

    Returns:
        The remaining text, with each text fragment stripped of surrounding
        whitespace and the fragments concatenated.
    """
    for ruby_tag in component.find_all('ruby'):
        # <rp> holds the parentheses shown by browsers without ruby support;
        # leaving it in would leak stray "(" / ")" into the extracted text.
        for annotation in ruby_tag.find_all(['rt', 'rp']):
            annotation.decompose()  # Remove the annotation from the tree
    return component.get_text(strip=True)
# Parse the HTML
soup = BeautifulSoup(html_content, 'lxml')

# Pitch-accent names recognised in ``underline-pitch-<name>`` classes, mapped
# to the prefix of the ``\<prefix>uline`` macro emitted for that pattern.
color_mapping = {
    'heiban': 'heiban',
    'nakadaka': 'nakadaka',
    'odaka': 'odaka',
    'atamadaka': 'atamadaka',
    'kihuku': 'kihuku',
}

_PITCH_CLASS_PREFIX = 'underline-pitch-'

# Characters that are special to LaTeX and their literal replacements.
# Without this, e.g. a "%" in the source text would comment out the rest of
# the output line, including the closing brace of the underline macro.
_LATEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _escape_latex(text):
    """Return *text* with LaTeX special characters escaped (single pass)."""
    return text.translate(_LATEX_ESCAPES)


def _pitch_macro_prefix(classes):
    """Return the macro prefix for the first recognised pitch class, or None."""
    for class_name in classes:
        if class_name.startswith(_PITCH_CLASS_PREFIX):
            pitch_type = class_name.split('-')[-1]
            if pitch_type in color_mapping:
                return color_mapping[pitch_type]
    return None


# Build the LaTeX body. No preamble is emitted here: the output is meant to be
# inserted into a template that already contains it.
latex_output = []

# Find all component elements
components = soup.find_all('component')
for component in components:
    # Base text only (ruby readings removed), made safe for LaTeX.
    text = _escape_latex(clean_ruby_tags(component))
    macro_prefix = _pitch_macro_prefix(component.get('class', []))
    if macro_prefix is None:
        # No recognised pitch class: emit the text without an underline.
        latex_output.append(text)
    else:
        latex_output.append('\\' + macro_prefix + 'uline{' + text + '}')

# Footer for LaTeX document
latex_output.append(r'\end{document}')

# One component per output line.
final_latex_code = '\n'.join(latex_output)

# Print the final LaTeX code
print(final_latex_code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment