Skip to content

Instantly share code, notes, and snippets.

@kujirahand
Last active November 21, 2025 23:36
Show Gist options
  • Select an option

  • Save kujirahand/14172fde92148c83363c667ec8c6f469 to your computer and use it in GitHub Desktop.

Select an option

Save kujirahand/14172fde92148c83363c667ec8c6f469 to your computer and use it in GitHub Desktop.
簡単なボイスチェンジャーのプログラム。
import sounddevice as sd
import numpy as np
import librosa
import blessed
import sys
import queue
import time
from scipy.signal import butter, sosfilt_zi, sosfilt
# --- Constants ---
# Sample rate passed to the audio stream, the band-pass design and the pitch shifter.
SAMPLE_RATE = 44100
# Channel count requested from the audio stream.
CHANNELS = 1
# Block size requested from the audio stream (frames per callback).
BLOCK_SIZE = 2048
# Blocks whose peak absolute sample is below this value are muted by the callback.
NOISE_GATE_THRESHOLD = 0.04
# Lower and upper band edges given to butter() together with fs=SAMPLE_RATE.
LOWCUT = 300.0
# NOTE: "HIGCUT" (sic) is the upper band edge; the name is kept as-is because
# it is referenced below.
HIGCUT = 3400.0
# Order argument given to butter().
FILTER_ORDER = 4
# Factor the processed signal is multiplied by before it is written to the output.
OUTPUT_GAIN = 0.5
FRAME_RATE = 20.0 # Frame rate of the terminal redraw loop
# --- Terminal object ---
term = blessed.Terminal()
# --- Global variables ---
# Pitch shift in semitones; changed by key presses in main(), read by callback().
pitch_shift_semitones = 8.0
# Carries sample blocks from the audio callback to the drawing loop.
data_queue = queue.Queue()
# Band-pass filter coefficients in second-order-sections form.
sos_filter_coeffs = butter(FILTER_ORDER, [LOWCUT, HIGCUT], btype='band', fs=SAMPLE_RATE, output='sos')
# Filter state; callback() feeds it to sosfilt() and stores the updated state
# back, so filtering continues across block boundaries.
zi_filter_state = sosfilt_zi(sos_filter_coeffs)
def get_color_from_amplitude(amplitude):
    """Pick the terminal color used to draw a sample of the given amplitude.

    The amplitude is multiplied by 4 and capped at 1.0. Levels below 0.3 map
    to green, levels below 0.6 to yellow, and everything else to red.
    """
    level = min(1.0, amplitude * 4)
    if level >= 0.6:
        return term.red
    if level >= 0.3:
        return term.yellow
    return term.green
def draw_waveform(data):
    """Print the title bar and draw ``data`` as vertical bars, one per column.

    The header (including the current pitch shift) is always printed. When
    ``data`` is ``None`` or empty nothing else is drawn. Otherwise the block
    is decimated to at most one sample per terminal column and each sample
    becomes a bar growing from the vertical center of the screen: upward for
    positive samples, downward for negative ones.
    """
    # Title bar.
    title = f" VOICE CHANGER | Pitch Shift: {pitch_shift_semitones:.1f} semitones "
    print(term.move_y(0) + term.center(term.bold(title)))

    num_cols = term.width
    if data is None or data.size == 0:
        return

    # Keep every stride-th sample so the block fits the terminal width.
    total = len(data)
    stride = (total // num_cols if total > num_cols else 1) or 1
    center_y = term.height // 2

    # Bars.
    for x, value in enumerate(data[::stride][:num_cols]):
        bar = int(value * (center_y - 2))
        paint = get_color_from_amplitude(abs(value))
        # Screen rows grow downward, so positive samples move toward row 0.
        direction = -1 if bar > 0 else 1
        for offset in range(abs(bar)):
            row = center_y + direction * offset
            # Rows 0 and 1 are left free for the header area.
            if not (1 < row < term.height):
                continue
            with term.location(x, row):
                print(paint('█'))
def callback(indata, outdata, frames, time, status):
    """Process one audio block: noise gate, band-pass filter, pitch shift.

    Passed as ``callback`` to the ``sd.Stream`` opened in ``main``. The
    processed audio is written into ``outdata`` and a copy of the raw input
    block is queued for the waveform display (the queue is kept at two
    entries at most so the display never lags far behind).

    Args:
        indata: Input samples for this block.
        outdata: Output buffer; filled in place.
        frames: Frame count supplied by the stream (unused).
        time: Timing information supplied by the stream (unused). Inside this
            function the name shadows the ``time`` module.
        status: Stream status; printed to stderr when truthy.

    Any exception raised while processing is reported on stderr and the block
    is replaced by silence, so a single bad block does not stop the stream.
    """
    # Only the filter state is rebound here; the other globals are read-only.
    global zi_filter_state

    if status:
        print(status, file=sys.stderr)

    try:
        mono_indata = indata.flatten()

        # Noise gate: mute blocks whose peak stays below the threshold.
        if np.abs(mono_indata).max() < NOISE_GATE_THRESHOLD:
            outdata.fill(0)
            if data_queue.qsize() < 2:
                data_queue.put(np.zeros_like(mono_indata))
            return

        # Band-pass filter, carrying the filter state over to the next block.
        filtered_data, zi_filter_state = sosfilt(
            sos_filter_coeffs, mono_indata, zi=zi_filter_state
        )

        shifted_data = librosa.effects.pitch_shift(
            y=filtered_data,
            sr=SAMPLE_RATE,
            n_steps=pitch_shift_semitones,
        )

        # Fit the shifted signal to the output buffer: truncate when it is
        # too long, zero-pad when it is too short.
        outdata_len = outdata.size
        if len(shifted_data) >= outdata_len:
            fitted = shifted_data[:outdata_len]
        else:
            fitted = np.pad(shifted_data, (0, outdata_len - len(shifted_data)))
        outdata[:] = fitted.reshape(outdata.shape) * OUTPUT_GAIN

        if data_queue.qsize() < 2:
            data_queue.put(mono_indata)

    except Exception as exc:
        # Report the failure instead of hiding it, then fall back to silence.
        print(f"Error in callback: {exc}", file=sys.stderr)
        outdata.fill(0)
        if data_queue.qsize() < 2:
            data_queue.put(np.zeros(BLOCK_SIZE, dtype=np.float32))
def main():
    """Run the terminal UI: redraw the waveform and react to key presses.

    Opens the audio stream with ``callback`` as its processing function,
    switches the terminal to fullscreen/cbreak mode with a hidden cursor and
    loops until Enter is pressed. ``h`` raises and ``l`` lowers the pitch
    shift by one semitone, clamped to the range -24..24.

    Ctrl+C ends the program without an error message; any other exception is
    reported. The stop message is printed in every case.
    """
    global pitch_shift_semitones
    try:
        with sd.Stream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE, channels=CHANNELS, callback=callback):
            with term.fullscreen(), term.cbreak(), term.hidden_cursor():
                running = True
                last_data = np.zeros(BLOCK_SIZE, dtype=np.float32)
                while running:
                    # --- Fetch the most recent block from the queue ---
                    try:
                        while not data_queue.empty():
                            last_data = data_queue.get_nowait()
                    except queue.Empty:
                        # The queue was emptied between empty() and get_nowait().
                        pass

                    # --- Drawing ---
                    print(term.clear, end='')
                    draw_waveform(last_data)
                    with term.location(0, term.height - 1):
                        controls = " [h] Pitch Up | [l] Pitch Down | [Enter] Exit "
                        print(term.center(term.bold_white_on_blue(controls)))
                    sys.stdout.flush()

                    # --- Key input; the timeout also paces the redraw loop ---
                    key = term.inkey(timeout=1/FRAME_RATE)
                    if key:
                        if key == 'h':
                            pitch_shift_semitones = min(24.0, pitch_shift_semitones + 1.0)
                        elif key == 'l':
                            pitch_shift_semitones = max(-24.0, pitch_shift_semitones - 1.0)
                        elif key.name == "KEY_ENTER" or key == '\n':
                            running = False
    except KeyboardInterrupt:
        # Ctrl+C is an ordinary way to quit. The context managers above have
        # already restored the terminal and closed the stream at this point.
        pass
    except Exception as e:
        # Reset terminal attributes before printing the error.
        print(term.normal)
        print(f"\nAn error occurred: {e}")
    finally:
        print("\nVoice changer stopped.")


# Start the voice changer only when this file is executed as a script.
if __name__ == "__main__":
    main()
@kujirahand
Copy link
Author

kujirahand commented Nov 21, 2025

ライブラリは下記のものをインストールしてから実行してください。以下のテキストを「requirements.txt」という名前で保存します。

sounddevice
librosa
numpy
blessed
scipy

そして、下記のコマンドを実行してプログラムを起動できます。

pip install -r requirements.txt
python voice_changer.py

@kujirahand
Copy link
Author

kujirahand commented Nov 21, 2025

以下、pyworldを利用した改良版です。

import sounddevice as sd
import numpy as np
import pyworld as pw
import blessed
import sys
import queue
import time

# --- Constants ---
# Sample rate passed to the audio stream and to the pyworld calls.
SAMPLE_RATE = 16000
# Channel count requested from the audio stream.
CHANNELS = 1
# Block size requested from the audio stream (frames per callback).
BLOCK_SIZE = 4096
# Blocks whose peak absolute sample is below this value are muted by the callback.
NOISE_GATE_THRESHOLD = 0.01
# Factor the synthesized signal is multiplied by before it is written to the output.
OUTPUT_GAIN = 0.2
# Redraw rate of the terminal UI; its inverse is the key-input timeout.
FRAME_RATE = 20.0

# --- Terminal object ---
term = blessed.Terminal()

# --- Global variables ---
# Factor applied to the F0 contour; changed by key presses in main().
pitch_ratio = 1.0
# Controls the warp of the spectral envelope; 1.0 leaves it untouched.
formant_ratio = 1.0
# Carries sample blocks from the audio callback to the drawing loop.
data_queue = queue.Queue()

def get_color_from_amplitude(amplitude):
    """Return the terminal color that matches an amplitude.

    The amplitude is multiplied by 4 and capped at 1.0; the result selects
    green (below 0.3), yellow (below 0.6) or red (otherwise).
    """
    level = min(1.0, amplitude * 4)
    if level < 0.3:
        color_name = "green"
    elif level < 0.6:
        color_name = "yellow"
    else:
        color_name = "red"
    return getattr(term, color_name)

def draw_waveform(data):
    """Print the status header and draw ``data`` as one bar per column.

    The header shows the current pitch and formant ratios and is always
    printed. Nothing else is drawn when ``data`` is ``None`` or empty.
    Otherwise every ``len(data) // width``-th sample is kept and drawn as a
    bar starting at the vertical center of the screen: upward for positive
    samples, downward for negative ones.
    """
    title = (f"PYWORLD VOX | Pitch: {pitch_ratio:.2f} | "
             f"Formant: {formant_ratio:.2f}")
    print(term.move_y(0) + term.center(term.bold(title)))

    num_cols = term.width
    if data is None or data.size == 0:
        return

    # A stride of at least 1 keeps short blocks drawable.
    stride = (len(data) // num_cols) or 1
    center_y = term.height // 2

    for x, value in enumerate(data[::stride][:num_cols]):
        bar = int(value * (center_y - 2))
        paint = get_color_from_amplitude(abs(value))
        # Screen rows grow downward, so positive samples move toward row 0.
        direction = -1 if bar > 0 else 1

        for offset in range(abs(bar)):
            row = center_y + direction * offset
            # Rows 0 and 1 are left free for the header area.
            if not (1 < row < term.height):
                continue
            with term.location(x, row):
                print(paint('█'))

def callback(indata, outdata, frames, time, status):
    """Process one audio block with pyworld: analyze, modify, resynthesize.

    Passed as ``callback`` to the ``sd.Stream`` opened in ``main``. The block
    is analyzed with pyworld, its F0 contour is scaled by ``pitch_ratio``,
    its spectral envelope is warped according to ``formant_ratio``, and the
    resynthesized signal is written to ``outdata`` with inverted sign and
    ``OUTPUT_GAIN`` applied. A copy of the input block is queued for the
    waveform display (at most two entries are kept in the queue).

    Args:
        indata: Input samples for this block.
        outdata: Output buffer; filled in place.
        frames: Frame count supplied by the stream (unused).
        time: Timing information supplied by the stream (unused). Inside this
            function the name shadows the ``time`` module.
        status: Stream status; printed to stderr when truthy.

    Any exception raised while processing is reported on stderr and the block
    is replaced by silence.
    """
    if status:
        print(status, file=sys.stderr)

    try:
        # Read each ratio once: the UI thread may change them while this
        # block is being processed.
        f0_scale = pitch_ratio
        warp = formant_ratio

        x = indata.flatten().astype(np.float64)

        # Noise gate: mute blocks whose peak stays below the threshold.
        if np.abs(x).max() < NOISE_GATE_THRESHOLD:
            outdata.fill(0)
            if data_queue.qsize() < 2:
                data_queue.put(np.zeros_like(x))
            return

        # WORLD analysis.
        f0, t = pw.harvest(x, SAMPLE_RATE)
        sp = pw.cheaptrick(x, f0, t, SAMPLE_RATE)
        ap = pw.d4c(x, f0, t, SAMPLE_RATE)

        # Parameter modification.
        modified_f0 = f0 * f0_scale

        if warp != 1.0:
            # Warp every frame along the frequency axis: output bin i takes
            # the linearly interpolated value at input position i * warp, so
            # ratios above 1 compress the envelope toward low frequencies and
            # ratios below 1 stretch it. Positions past the last bin take the
            # last bin's value (np.interp holds the edge), which avoids
            # zero-power bins in the envelope.
            bins = np.arange(sp.shape[1])
            source_pos = bins * warp
            modified_sp = np.ascontiguousarray(
                np.apply_along_axis(
                    lambda frame: np.interp(source_pos, bins, frame), 1, sp
                )
            )
        else:
            modified_sp = sp

        # WORLD resynthesis.
        y = pw.synthesize(modified_f0, modified_sp, ap, SAMPLE_RATE)

        # Trim to the input length before shaping it like the output buffer.
        y = y[:len(x)]
        reshaped_data = y.reshape(outdata.shape).astype(np.float32)

        # Invert the sign and apply the output gain.
        outdata[:] = -reshaped_data * OUTPUT_GAIN

        if data_queue.qsize() < 2:
            data_queue.put(x)

    except Exception as e:
        print(f"Error in callback: {e}", file=sys.stderr)
        outdata.fill(0)
        if data_queue.qsize() < 2:
            data_queue.put(np.zeros(BLOCK_SIZE, dtype=np.float32))

def main():
    """Run the terminal UI: redraw the waveform and react to key presses.

    Opens the audio stream with ``callback`` as its processing function,
    switches the terminal to fullscreen/cbreak mode with a hidden cursor and
    loops until Enter is pressed. ``h``/``l`` change ``pitch_ratio`` in steps
    of 0.05 within 0.1..3.0; ``k``/``j`` change ``formant_ratio`` in steps of
    0.05 within 0.5..1.5.

    Ctrl+C ends the program without an error message; any other exception is
    reported. The stop message is printed in every case.
    """
    global pitch_ratio, formant_ratio
    try:
        with sd.Stream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE, channels=CHANNELS, callback=callback):
            with term.fullscreen(), term.cbreak(), term.hidden_cursor():
                running = True
                last_data = np.zeros(BLOCK_SIZE, dtype=np.float32)

                while running:
                    # Keep only the most recent block from the queue.
                    try:
                        while not data_queue.empty():
                            last_data = data_queue.get_nowait()
                    except queue.Empty:
                        # The queue was emptied between empty() and get_nowait().
                        pass

                    print(term.clear, end='')
                    draw_waveform(last_data)

                    with term.location(0, term.height - 1):
                        controls = ("Pitch [h/l] | Formant [k/j] | [Enter] Exit")
                        print(term.center(term.bold_white_on_blue(controls)))

                    sys.stdout.flush()

                    # The key-input timeout also paces the redraw loop.
                    key = term.inkey(timeout=1/FRAME_RATE)
                    if key:
                        if key == 'h':
                            pitch_ratio = min(3.0, pitch_ratio + 0.05)
                        elif key == 'l':
                            pitch_ratio = max(0.1, pitch_ratio - 0.05)
                        elif key == 'k':
                            formant_ratio = min(1.5, formant_ratio + 0.05)
                        elif key == 'j':
                            formant_ratio = max(0.5, formant_ratio - 0.05)
                        elif key.name == "KEY_ENTER" or key == '\n':
                            running = False
    except KeyboardInterrupt:
        # Ctrl+C is an ordinary way to quit. The context managers above have
        # already restored the terminal and closed the stream at this point.
        pass
    except Exception as e:
        # Reset terminal attributes before printing the error.
        print(term.normal)
        print(f"\nAn error occurred: {e}")
    finally:
        print("\nVoice changer stopped.")

# Start the voice changer only when this file is executed as a script.
if __name__ == "__main__":
    main()

@kujirahand
Copy link
Author

requirements.txtを次のように修正します:

sounddevice
librosa
numpy
blessed
scipy
pyworld
cython

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment