Last active
November 21, 2025 23:36
-
-
Save kujirahand/14172fde92148c83363c667ec8c6f469 to your computer and use it in GitHub Desktop.
簡単なボイスチェンジャーのプログラム。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sounddevice as sd | |
| import numpy as np | |
| import librosa | |
| import blessed | |
| import sys | |
| import queue | |
| import time | |
| from scipy.signal import butter, sosfilt_zi, sosfilt | |
# --- Constants ---
SAMPLE_RATE = 44100            # audio sample rate in Hz
CHANNELS = 1                   # mono in/out
BLOCK_SIZE = 2048              # samples per stream callback block
NOISE_GATE_THRESHOLD = 0.04    # peak amplitude below which input is muted
LOWCUT = 300.0                 # band-pass lower edge (Hz)
HIGCUT = 3400.0                # band-pass upper edge (Hz) — telephone band
FILTER_ORDER = 4               # Butterworth filter order
OUTPUT_GAIN = 0.5              # output attenuation to avoid feedback/clipping
FRAME_RATE = 20.0              # UI drawing frame rate (frames per second)
# --- Terminal object ---
term = blessed.Terminal()
# --- Global state (shared between the audio callback and the UI loop) ---
pitch_shift_semitones = 8.0    # current pitch shift, adjusted by key input
data_queue = queue.Queue()     # raw input blocks handed to the drawing loop
# Second-order-sections band-pass filter and its running state for sosfilt.
sos_filter_coeffs = butter(FILTER_ORDER, [LOWCUT, HIGCUT], btype='band', fs=SAMPLE_RATE, output='sos')
zi_filter_state = sosfilt_zi(sos_filter_coeffs)
def get_color_from_amplitude(amplitude):
    """Map an amplitude to a terminal color: green, yellow, or red as it grows."""
    level = min(1.0, amplitude * 4)
    if level >= 0.6:
        return term.red
    if level >= 0.3:
        return term.yellow
    return term.green
def draw_waveform(data):
    """Render the current waveform block plus a pitch-shift header."""
    global pitch_shift_semitones
    # Header line at the top of the screen.
    header = f" VOICE CHANGER | Pitch Shift: {pitch_shift_semitones:.1f} semitones "
    print(term.move_y(0) + term.center(term.bold(header)))
    num_cols = term.width
    if data is None or data.size == 0:
        return
    # Downsample so at most one sample maps to each terminal column.
    stride = 1
    if len(data) > num_cols:
        stride = len(data) // num_cols or 1
    samples = data[::stride]
    center_y = term.height // 2
    # Draw a vertical bar per column, up or down from the center row.
    for col, sample in enumerate(samples):
        if col >= num_cols:
            break
        bar_height = int(sample * (center_y - 2))
        color = get_color_from_amplitude(abs(sample))
        direction = np.sign(bar_height)
        for offset in range(abs(bar_height)):
            row = center_y - direction * offset
            if 1 < row < term.height:
                with term.location(col, int(row)):
                    print(color('█'))
def callback(indata, outdata, frames, time, status):
    """sounddevice duplex callback: process one audio block and feed the UI queue.

    Pipeline: noise gate -> band-pass filter (stateful sosfilt) ->
    librosa pitch shift -> fit to output block -> gain. The raw input
    block is also pushed to data_queue for the drawing loop, capped at
    2 pending items so the UI never lags behind the audio.
    """
    global zi_filter_state, pitch_shift_semitones, data_queue
    if status:
        print(status, file=sys.stderr)
    try:
        mono_indata = indata.flatten()
        # Noise gate: emit silence for quiet blocks and skip processing.
        if np.abs(mono_indata).max() < NOISE_GATE_THRESHOLD:
            outdata.fill(0)
            if data_queue.qsize() < 2:
                data_queue.put(np.zeros_like(mono_indata))
            return
        # Band-pass filter; zi carries the filter state between blocks.
        filtered_data, zi_filter_state = sosfilt(sos_filter_coeffs, mono_indata, zi=zi_filter_state)
        shifted_data = librosa.effects.pitch_shift(
            y=filtered_data,
            sr=SAMPLE_RATE,
            n_steps=pitch_shift_semitones
        )
        # Fit the shifted signal to the output block: truncate if long,
        # zero-pad if short (pitch_shift may change the sample count).
        len_shifted = len(shifted_data)
        outdata_len = len(outdata.flatten())
        if len_shifted >= outdata_len:
            reshaped_data = shifted_data[:outdata_len].reshape(outdata.shape)
        else:
            padding = np.zeros(outdata_len - len_shifted)
            padded_data = np.concatenate((shifted_data, padding))
            reshaped_data = padded_data.reshape(outdata.shape)
        outdata[:] = reshaped_data * OUTPUT_GAIN
        if data_queue.qsize() < 2:
            data_queue.put(mono_indata)
    except Exception:
        # Best-effort: never let an exception escape the audio callback;
        # output silence for this block instead.
        outdata.fill(0)
        if data_queue.qsize() < 2:
            data_queue.put(np.zeros(BLOCK_SIZE, dtype=np.float32))
def main():
    """Run the audio stream and drive the draw/keyboard loop."""
    global pitch_shift_semitones
    try:
        stream = sd.Stream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE,
                           channels=CHANNELS, callback=callback)
        with stream, term.fullscreen(), term.cbreak(), term.hidden_cursor():
            last_data = np.zeros(BLOCK_SIZE, dtype=np.float32)
            while True:
                # Drain the queue, keeping only the newest block.
                try:
                    while not data_queue.empty():
                        last_data = data_queue.get_nowait()
                except queue.Empty:
                    pass
                # Redraw the whole screen.
                print(term.clear, end='')
                draw_waveform(last_data)
                with term.location(0, term.height - 1):
                    controls = " [h] Pitch Up | [l] Pitch Down | [Enter] Exit "
                    print(term.center(term.bold_white_on_blue(controls)))
                sys.stdout.flush()
                # Key handling; the inkey timeout paces the frame rate.
                key = term.inkey(timeout=1 / FRAME_RATE)
                if not key:
                    continue
                if key == 'h':
                    pitch_shift_semitones = min(24.0, pitch_shift_semitones + 1.0)
                elif key == 'l':
                    pitch_shift_semitones = max(-24.0, pitch_shift_semitones - 1.0)
                elif key.name == "KEY_ENTER" or key == '\n':
                    break
    except Exception as e:
        # Restore the terminal to its normal state before reporting.
        print(term.normal)
        print(f"\nAn error occurred: {e}")
    finally:
        print("\nVoice changer stopped.")
if __name__ == "__main__":
    main()
Author
Author
以下、pyworldを利用した改良版です。
import sounddevice as sd
import numpy as np
import pyworld as pw
import blessed
import sys
import queue
import time
# --- Constants ---
SAMPLE_RATE = 16000            # audio sample rate in Hz (WORLD analysis rate)
CHANNELS = 1                   # mono in/out
BLOCK_SIZE = 4096              # samples per stream callback block
NOISE_GATE_THRESHOLD = 0.01    # peak amplitude below which input is muted
OUTPUT_GAIN = 0.2              # output attenuation to avoid feedback/clipping
FRAME_RATE = 20.0              # UI drawing frame rate (frames per second)
# --- Terminal object ---
term = blessed.Terminal()
# --- Global state (shared between the audio callback and the UI loop) ---
pitch_ratio = 1.0              # F0 multiplier, adjusted by key input
formant_ratio = 1.0            # spectral-envelope stretch factor
data_queue = queue.Queue()     # raw input blocks handed to the drawing loop
def get_color_from_amplitude(amplitude):
    """Return a terminal color that escalates with amplitude."""
    scaled = min(1.0, amplitude * 4)
    for threshold, color in ((0.3, term.green), (0.6, term.yellow)):
        if scaled < threshold:
            return color
    return term.red
def draw_waveform(data):
    """Draw the captured waveform under a pitch/formant header line."""
    global pitch_ratio, formant_ratio
    header = (f"PYWORLD VOX | Pitch: {pitch_ratio:.2f} | "
              f"Formant: {formant_ratio:.2f}")
    print(term.move_y(0) + term.center(term.bold(header)))
    num_cols = term.width
    if data is None or data.size == 0:
        return
    # Downsample so at most one sample maps to each terminal column.
    stride = max(1, len(data) // num_cols)
    points = data[::stride]
    mid_row = term.height // 2
    for col, value in enumerate(points):
        if col >= num_cols:
            break
        bar = int(value * (mid_row - 2))
        color = get_color_from_amplitude(abs(value))
        sign = np.sign(bar)
        # Vertical bar from the center row, upward or downward.
        for offset in range(abs(bar)):
            row = mid_row - sign * offset
            if 1 < row < term.height:
                with term.location(col, int(row)):
                    print(color('█'))
def callback(indata, outdata, frames, time, status):
    """sounddevice duplex callback: WORLD-vocoder pitch and formant shifting.

    Analyses one input block with pyworld (harvest/cheaptrick/d4c), scales
    the F0 contour by ``pitch_ratio`` and stretches the spectral envelope
    by ``formant_ratio``, then resynthesizes and writes the phase-inverted,
    gain-scaled block to ``outdata``. The raw input block is also pushed to
    data_queue (capped at 2 pending items) for the drawing loop.
    """
    global pitch_ratio, formant_ratio, data_queue
    if status:
        print(status, file=sys.stderr)
    try:
        x = indata.flatten().astype(np.float64)
        # Noise gate: emit silence for quiet blocks and skip analysis.
        if np.abs(x).max() < NOISE_GATE_THRESHOLD:
            outdata.fill(0)
            if data_queue.qsize() < 2:
                data_queue.put(np.zeros_like(x))
            return
        # WORLD analysis: F0 contour, spectral envelope, aperiodicity.
        f0, t = pw.harvest(x, SAMPLE_RATE)
        sp = pw.cheaptrick(x, f0, t, SAMPLE_RATE)
        ap = pw.d4c(x, f0, t, SAMPLE_RATE)
        # Pitch: scale the fundamental-frequency contour.
        modified_f0 = f0 * pitch_ratio
        # Formant shift: resample each spectral-envelope frame along the
        # frequency axis, then pad/truncate back to the original length.
        if formant_ratio != 1.0:
            fftlen = sp.shape[1]
            # Interpolation axes are frame-invariant; build them once.
            orig_axis = np.arange(fftlen)
            new_axis = np.linspace(0, fftlen, int(fftlen / formant_ratio))

            def shift_spectrum(spectrum):
                shifted = np.interp(new_axis, orig_axis, spectrum)
                if len(shifted) < fftlen:
                    shifted = np.pad(shifted, (0, fftlen - len(shifted)), 'constant')
                return shifted[:fftlen]

            modified_sp = np.apply_along_axis(shift_spectrum, 1, sp)
        else:
            modified_sp = sp
        # WORLD resynthesis.
        y = pw.synthesize(modified_f0, modified_sp, ap, SAMPLE_RATE)
        # Fit the synthesized signal to the block size: truncate when
        # longer, zero-pad when shorter. pw.synthesize can return fewer
        # samples than the input block; without padding the reshape below
        # would raise and the whole block would be silently zeroed.
        if len(y) >= len(x):
            y = y[:len(x)]
        else:
            y = np.concatenate((y, np.zeros(len(x) - len(y))))
        reshaped_data = y.reshape(outdata.shape).astype(np.float32)
        # Phase inversion plus output gain.
        outdata[:] = -reshaped_data * OUTPUT_GAIN
        if data_queue.qsize() < 2:
            data_queue.put(x)
    except Exception as e:
        # Never let an exception escape the audio callback; report and
        # output silence for this block instead.
        print(f"Error in callback: {e}", file=sys.stderr)
        outdata.fill(0)
        if data_queue.qsize() < 2:
            data_queue.put(np.zeros(BLOCK_SIZE, dtype=np.float32))
def main():
    """Main loop: run the duplex stream and handle drawing plus key input."""
    global pitch_ratio, formant_ratio
    try:
        stream = sd.Stream(samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE,
                           channels=CHANNELS, callback=callback)
        with stream, term.fullscreen(), term.cbreak(), term.hidden_cursor():
            last_data = np.zeros(BLOCK_SIZE, dtype=np.float32)
            while True:
                # Drain the queue, keeping only the newest block.
                try:
                    while not data_queue.empty():
                        last_data = data_queue.get_nowait()
                except queue.Empty:
                    pass
                print(term.clear, end='')
                draw_waveform(last_data)
                with term.location(0, term.height - 1):
                    controls = ("Pitch [h/l] | Formant [k/j] | [Enter] Exit")
                    print(term.center(term.bold_white_on_blue(controls)))
                sys.stdout.flush()
                # Key handling; the inkey timeout paces the frame rate.
                key = term.inkey(timeout=1 / FRAME_RATE)
                if not key:
                    continue
                if key == 'h':
                    pitch_ratio = min(3.0, pitch_ratio + 0.05)
                elif key == 'l':
                    pitch_ratio = max(0.1, pitch_ratio - 0.05)
                elif key == 'k':
                    formant_ratio = min(1.5, formant_ratio + 0.05)
                elif key == 'j':
                    formant_ratio = max(0.5, formant_ratio - 0.05)
                elif key.name == "KEY_ENTER" or key == '\n':
                    break
    except Exception as e:
        # Restore the terminal before printing the error.
        print(term.normal)
        print(f"\nAn error occurred: {e}")
    finally:
        print("\nVoice changer stopped.")
if __name__ == "__main__":
    main()
Author
requirements.txtを次のように修正します:
sounddevice
librosa
numpy
blessed
scipy
pyworld
cython
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ライブラリは下記のものをインストールしてから実行してください。以下のテキストを「requirements.txt」という名前で保存します。
そして、下記のコマンドを実行してプログラムを起動できます。