Created
October 18, 2025 18:21
-
-
Save jsbeaudry/5434e9f6042e226867cb37b1cfbabc6b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import librosa | |
| import json | |
| import warnings | |
| import requests | |
| import io | |
| import numpy as np | |
| import mlx.core as mx | |
| import soundfile as sf | |
| from nanocodec_mlx.models.audio_codec import AudioCodecModel | |
| warnings.filterwarnings("ignore", category=UserWarning) | |
| # ---------------- Input ---------------- | |
# Destination file for the encoded records.
output_json = "mlx_encode_out.json"

# Clips to encode. "file" is required and may be a local path or an
# http(s) URL; "text" and "speaker" are optional metadata that are copied
# into the output record.
audio_inputs = [
    {"file": "files/input1.wav", "text": "text example.", "speaker": "speaker1"},
    {"file": "files/input2.wav", "text": "text example.", "speaker": "speaker2"},
    # Remote example:
    # {"file": "https://......wav", "text": "text", "speaker": "speaker3"},
]
| # ---------------- Load Model ---------------- | |
# Sample rate the codec model expects; audio is loaded at this rate.
sample_rate = 22050

print("🔄 Loading MLX codec model...")
model = AudioCodecModel.from_pretrained("nineninesix/nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX")
print(f"✅ Model loaded (sample rate: {sample_rate}Hz)")
| # ---------------- Helper Function ---------------- | |
def load_audio(file_path: str, timeout: float = 30.0):
    """Load an audio clip from a local path or an http(s) URL.

    The clip is decoded with ``librosa.load`` using the module-level
    ``sample_rate``, so librosa resamples it to that rate when necessary.

    Args:
        file_path: Local file path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the remote server before giving up.
            Only used for URL inputs.

    Returns:
        A ``(audio, sr)`` tuple: the samples as a ``float32`` NumPy array
        and the sample rate reported by ``librosa.load``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    if file_path.startswith(("http://", "https://")):
        print(f"🌐 Downloading audio from URL: {file_path}")
        # Without a timeout, requests waits indefinitely on a stalled server
        # and the whole batch would hang on one bad URL.
        response = requests.get(file_path, timeout=timeout)
        response.raise_for_status()
        # librosa accepts a file-like object, so decode straight from memory.
        source = io.BytesIO(response.content)
    else:
        source = file_path

    audio, sr = librosa.load(source, sr=sample_rate)
    return audio.astype(np.float32), sr
| # ---------------- Encode Function ---------------- | |
def encode_audio_file(item: dict):
    """Encode one input entry into four token layers with the MLX codec.

    Args:
        item: Input entry. ``item["file"]`` is required; ``"text"`` and
            ``"speaker"`` are optional and default to ``""`` and
            ``"unknown"``.

    Returns:
        A dict with the source file, the four token layers
        (``nano_layer_1`` .. ``nano_layer_4``), the encoded length, and the
        text and speaker metadata.
    """
    source = item["file"]
    print(f"\n🎧 Encoding: {source}")

    # The sample rate returned by load_audio is not needed here.
    waveform, _ = load_audio(source)

    # Add two leading axes; the model is fed a [B, C, T] array.
    batch = mx.array(waveform, dtype=mx.float32)[None, None, :]
    batch_len = mx.array([batch.shape[-1]], dtype=mx.int32)

    tokens, tokens_len = model.encode(batch, batch_len)

    # Move the results out of MLX: drop the leading axis and convert to
    # plain Python ints so the record is JSON-serialisable.
    layers = np.squeeze(np.array(tokens), axis=0).astype(int).tolist()
    frame_count = int(np.array(tokens_len)[0])

    record = {"file": source}
    layer_keys = ("nano_layer_1", "nano_layer_2", "nano_layer_3", "nano_layer_4")
    for index, key in enumerate(layer_keys):
        record[key] = layers[index]
    record["encoded_len"] = frame_count
    record["text"] = item.get("text", "")
    record["speaker"] = item.get("speaker", "unknown")

    print(f"✅ Encoded {source} ({frame_count} frames)")
    return record
| # ---------------- Encode All Files ---------------- | |
results = []
for item in audio_inputs:
    # Best-effort batch: a clip that fails is reported and skipped so the
    # remaining clips are still encoded.
    try:
        encoded = encode_audio_file(item)
    except Exception as e:
        # Use .get() so an entry that lacks "file" is reported here instead
        # of raising a second KeyError inside the handler.
        print(f"❌ Error encoding {item.get('file', '<missing file>')}: {e}")
    else:
        results.append(encoded)

# ---------------- Save to JSON ----------------
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 rather than left to the platform's locale default.
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"\n💾 Encoded data saved to: {output_json}")

# ---------------- Print Summary ----------------
print("\n✅ Encoding completed:")
for r in results:
    print(f"File: {r['file']}, Frames: {r['encoded_len']}, Speaker: {r['speaker']}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment