Skip to content

Instantly share code, notes, and snippets.

@precondition
Last active July 31, 2025 19:30
Show Gist options
  • Select an option

  • Save precondition/4a65e04a3e02283aaa044339038f3c86 to your computer and use it in GitHub Desktop.

Select an option

Save precondition/4a65e04a3e02283aaa044339038f3c86 to your computer and use it in GitHub Desktop.
Python script to convert pitch-accent color-coded Japanese HTML (from https://kotu.io/media/reader/) into LaTeX for insertion into `color_coded_ja_text_template.tex`. Compile w/ XeLaTeX. ![Preview](https://do4ryrayvtpwi.cloudfront.net/original/3X/2/d/2d7d2f399ddd4c66550a32f71ca126218845cb31.png)
\documentclass[11pt]{extarticle}
\usepackage{parskip}
\usepackage[japanese]{babel}
\usepackage[a4paper,width=160mm,top=20mm, bottom=20mm]{geometry}
\usepackage{fancyhdr}
\usepackage{xeCJK}
\usepackage{ruby}
\setCJKmainfont{Noto Serif CJK JP}
%\setCJKmainfont{IPAMincho}
\setCJKsansfont{IPAGothic}
\setCJKmonofont{IPAGothic}
\usepackage{xcolor}
\usepackage{color, soul}
\usepackage{ulem}
\usepackage{setspace}
\usepackage{xeCJKfntef}
% \globalcolor{<color>}: switch to <color> with \color, then globally copy the
% resulting \current@color into \default@color.
% \makeatletter/\makeatother are required because those internal names
% contain the character @.
% NOTE(review): the only use visible in this template is the commented-out
% \AtBeginDocument{\globalcolor{blue}} line below.
\makeatletter
\newcommand{\globalcolor}[1]{%
\color{#1}\global\let\default@color\current@color
}
\makeatother
\setstretch{1.5} % 1.5 line spacing
%\AtBeginDocument{\globalcolor{blue}}
% Underline colors, one per pitch-accent category. Each color is used by the
% matching \<category>uline command defined below.
\definecolor{heiban_word_color}{HTML}{59b2ff} % blue
\definecolor{atamadaka_word_color}{HTML}{ff6666} % red
\definecolor{nakadaka_word_color}{HTML}{ff9b54} % orange
\definecolor{odaka_word_color}{HTML}{67e47d} % green
\definecolor{kihuku_word_color}{HTML}{563298} % purple
% Colored underline commands, one per pitch-accent category:
%   \heibanuline{...}, \atamadakauline{...}, \nakadakauline{...},
%   \odakauline{...}, \kihukuuline{...}
% All five share the same definition and differ only in the color name.
% They are built on ulem's \markoverwith ... \ULon mechanism: the underline is
% drawn by repeating a small rule that is 1.4pt wide, \ULthickness tall and
% lowered by 0.5ex, wrapped in \textcolor so that only the line (not the text)
% is colored. \ULthickness is set after \begin{document}.
\newcommand\heibanuline{\bgroup\markoverwith
{\textcolor{heiban_word_color}{\rule[-0.5ex]{1.4pt}{\ULthickness}}}\ULon}
\newcommand\atamadakauline{\bgroup\markoverwith
{\textcolor{atamadaka_word_color}{\rule[-0.5ex]{1.4pt}{\ULthickness}}}\ULon}
\newcommand\nakadakauline{\bgroup\markoverwith
{\textcolor{nakadaka_word_color}{\rule[-0.5ex]{1.4pt}{\ULthickness}}}\ULon}
\newcommand\odakauline{\bgroup\markoverwith
{\textcolor{odaka_word_color}{\rule[-0.5ex]{1.4pt}{\ULthickness}}}\ULon}
\newcommand\kihukuuline{\bgroup\markoverwith
{\textcolor{kihuku_word_color}{\rule[-0.5ex]{1.4pt}{\ULthickness}}}\ULon}
% Document body: title block, page header, color legend, then the insertion
% point for the generated text.
\begin{document}
% Thickness of the pitch-accent underlines (read by the *uline commands).
\renewcommand{\ULthickness}{2.0pt}
% Ruby (furigana) size and separation settings.
\renewcommand{\rubysize}{0.7} % default: 0.4
\renewcommand\rubysep{-0.7em}
% Placeholder title and author; replace with the real values.
\title{\textbf{タイトル}}
\author{Mako Tanaka\\\small 田中・真子}
\maketitle
% Use the fancyhdr page style on the title page ...
\thispagestyle{fancy}
%... then configure it.
\fancyhead{} % clear all header fields
\fancyhead[LO]{音読原稿}
\fancyfoot{} % clear all footer fields
% Color legend: one sample word per pitch-accent category, each underlined
% with the command (and therefore the color) used for that category.
\begin{center}
色の凡例: \heibanuline{平板}\ \kihukuuline{起伏}\ \odakauline{尾高}\ \nakadakauline{中高}\ \atamadakauline{頭高}
\end{center}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% INSERT OUTPUT OF kotu_analyzed_text_extract.py HERE
% NOTE: the script's output ends with \end{document}, which is why this
% template does not contain one itself.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#!/usr/bin/env python3
import sys
from bs4 import BeautifulSoup
# Read the HTML to convert; its path is the first command-line argument.
# The encoding is given explicitly because the input is Japanese text and the
# platform's default encoding is not guaranteed to be UTF-8.
# NOTE(review): assumes the saved HTML is UTF-8 encoded - confirm for your export.
with open(sys.argv[1], "r", encoding="utf-8") as f:
    html_content = f.read()
def clean_ruby_tags(component):
    """Return the text of *component* with ruby (furigana) annotations removed.

    Every ``<rt>`` (reading) and ``<rp>`` (fallback parenthesis) element found
    inside a ``<ruby>`` element is deleted from the parse tree, so *component*
    is modified in place. A component without ``<ruby>`` elements is left
    untouched.

    The text is collected with ``get_text(strip=True)``, i.e. each text
    fragment has its surrounding whitespace stripped before the fragments are
    concatenated.
    """
    for ruby_tag in component.find_all('ruby'):
        # <rp> holds the parentheses rendered by browsers without ruby
        # support; if left in place they would surface as stray "()" in the
        # extracted text once the <rt> reading between them is gone.
        for annotation in ruby_tag.find_all(['rt', 'rp']):
            annotation.decompose()
    return component.get_text(strip=True)
# Parse the HTML into a tree. The 'lxml' parser is selected explicitly, so the
# third-party lxml package must be installed alongside BeautifulSoup.
soup = BeautifulSoup(html_content, 'lxml')
# Pitch-accent categories recognised by the script. Each category maps to the
# prefix of the LaTeX underline macro used for it; the two currently coincide,
# so this is an identity mapping.
color_mapping = {
    pitch: pitch
    for pitch in ('heiban', 'nakadaka', 'odaka', 'atamadaka', 'kihuku')
}
# Extract the annotated words and generate the LaTeX body.

# Prefix of the CSS class that carries a component's pitch-accent category.
_PITCH_CLASS_PREFIX = 'underline-pitch-'

# Characters that LaTeX treats as syntax, mapped to their escaped form so that
# text taken from the HTML cannot break (or silently alter) the document.
_LATEX_SPECIAL_CHARS = str.maketrans({
    '\\': r'\textbackslash{}',
    '{': r'\{',
    '}': r'\}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _escape_latex(text):
    """Return *text* with LaTeX special characters escaped."""
    return text.translate(_LATEX_SPECIAL_CHARS)


def _underline_macro_prefix(classes):
    """Return the macro prefix for the first known pitch class, else None.

    Only classes starting with ``underline-pitch-`` are considered; the pitch
    type is the part after the last hyphen and must be a key of
    ``color_mapping``.
    """
    for class_name in classes:
        if not class_name.startswith(_PITCH_CLASS_PREFIX):
            continue
        pitch_type = class_name.split('-')[-1]
        if pitch_type in color_mapping:
            return color_mapping[pitch_type]
    return None


latex_output = []

# Each analysed word is wrapped in a <component> element.
components = soup.find_all('component')
for component in components:
    # Base text without furigana, made safe for inclusion in LaTeX.
    text = _escape_latex(clean_ruby_tags(component))
    macro_prefix = _underline_macro_prefix(component.get('class', []))
    if macro_prefix is None:
        # No recognised pitch class: emit the text without an underline.
        latex_output.append(text)
    else:
        # e.g. \heibanuline{...}
        latex_output.append('\\' + macro_prefix + 'uline{' + text + '}')

# The output is pasted at the end of the template, so it closes the document.
latex_output.append(r'\end{document}')

# One component per line.
final_latex_code = '\n'.join(latex_output)

print(final_latex_code)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment