import librosa
import json
import warnings
import requests
import io
import numpy as np
import mlx.core as mx
import soundfile as sf
from nanocodec_mlx.models.audio_codec import AudioCodecModel
warnings.filterwarnings("ignore", category=UserWarning)
# ---------------- Input ----------------
audio_inputs = [
    {
        "file": "files/input1.wav",
        "text": "text example.",
        "speaker": "speaker1"
    },
    {
        "file": "files/input2.wav",
        "text": "text example.",
        "speaker": "speaker2"
    },
    # For a URL:
    # {
    #     "file": "https://......wav",
    #     "text": "text",
    #     "speaker": "speaker3"
    # }
]
output_json = "mlx_encode_out.json"
# ---------------- Load Model ----------------
print("🔄 Loading MLX codec model...")
model = AudioCodecModel.from_pretrained(
"nineninesix/nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX"
)
sample_rate = 22050 # Model’s expected sample rate
print(f"✅ Model loaded (sample rate: {sample_rate}Hz)")
# ---------------- Helper Function ----------------
def load_audio(file_path: str):
    """Load audio from a local path or a remote URL, resampling to the model rate."""
    if file_path.startswith("http://") or file_path.startswith("https://"):
        print(f"🌐 Downloading audio from URL: {file_path}")
        response = requests.get(file_path)
        response.raise_for_status()
        audio_bytes = io.BytesIO(response.content)
        audio, sr = librosa.load(audio_bytes, sr=sample_rate)
    else:
        audio, sr = librosa.load(file_path, sr=sample_rate)
    return audio.astype(np.float32), sr
# ---------------- Encode Function ----------------
def encode_audio_file(item: dict):
    """Encode a single audio file into discrete nano layers using MLX."""
    file_path = item["file"]
    text = item.get("text", "")
    speaker = item.get("speaker", "unknown")
    print(f"\n🎧 Encoding: {file_path}")
    audio, sr = load_audio(file_path)
    # Convert to MLX tensor [B, C, T]
    audio_mlx = mx.array(audio, dtype=mx.float32)[None, None, :]
    audio_len = mx.array([audio_mlx.shape[-1]], dtype=mx.int32)
    # Encode
    tokens, tokens_len = model.encode(audio_mlx, audio_len)
    # Convert MLX tensors → numpy lists
    tokens_np = np.array(tokens)
    encoded_tokens = tokens_np.squeeze(0).astype(int).tolist()
    encoded_len = int(np.array(tokens_len)[0])
    # Build structured output
    data = {
        "file": file_path,
        "nano_layer_1": encoded_tokens[0],
        "nano_layer_2": encoded_tokens[1],
        "nano_layer_3": encoded_tokens[2],
        "nano_layer_4": encoded_tokens[3],
        "encoded_len": encoded_len,
        "text": text,
        "speaker": speaker
    }
    print(f"✅ Encoded {file_path} ({encoded_len} frames)")
    return data
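# Each returned entry (and hence each element of the output JSON) looks roughly like
# the sketch below; the token values are illustrative, not real codec output:
# {
#   "file": "files/input1.wav",
#   "nano_layer_1": [312, 87, ...],  # one token sequence per codec layer (4 total)
#   "nano_layer_2": [...],
#   "nano_layer_3": [...],
#   "nano_layer_4": [...],
#   "encoded_len": 125,              # codec frames (~12.5 per second of audio)
#   "text": "text example.",
#   "speaker": "speaker1"
# }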
# ---------------- Encode All Files ----------------
results = []
for item in audio_inputs:
    try:
        encoded = encode_audio_file(item)
        results.append(encoded)
    except Exception as e:
        print(f"❌ Error encoding {item['file']}: {e}")
# ---------------- Save to JSON ----------------
with open(output_json, "w") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"\n💾 Encoded data saved to: {output_json}")
# ---------------- Print Summary ----------------
print("\n✅ Encoding completed:")
for r in results:
print(f"File: {r['file']}, Frames: {r['encoded_len']}, Speaker: {r['speaker']}")