jsbeaudry · October 18, 2025 18:21
diff --git a/nemo-codec-mlx-local-url-data.py b/nemo-codec-mlx-local-url-data.py
 import librosa
 import json
 import warnings
 import requests
 import io
 import numpy as np
 import mlx.core as mx
 import soundfile as sf
 from nanocodec_mlx.models.audio_codec import AudioCodecModel

 # Silence UserWarning messages emitted from this point on.
 warnings.filterwarnings(action="ignore", category=UserWarning)

 # ---------------- Input ----------------
 # Destination file for the encoded records.
 output_json = "mlx_encode_out.json"

 # One entry per clip: "file" is a local path or an http(s) URL; "text" and
 # "speaker" are optional metadata carried through to the output records.
 audio_inputs = [
    dict(file="files/input1.wav", text="text example.", speaker="speaker1"),
    dict(file="files/input2.wav", text="text example.", speaker="speaker2"),
    # Remote files are declared the same way, e.g.:
    # dict(file="https://......wav", text="text", speaker="speaker3"),
 ]

 # ---------------- Load Model ----------------
 print("🔄 Loading MLX codec model...")
 model = AudioCodecModel.from_pretrained("nineninesix/nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX")
 sample_rate = 22050  # sample rate (Hz) the model expects
 print(f"✅ Model loaded (sample rate: {sample_rate}Hz)")

 # ---------------- Helper Function ----------------
 def load_audio(file_path: str, *, timeout: float = 30.0):
    """Load audio from a local path or an HTTP(S) URL as float32 samples.

    Args:
        file_path: Local filesystem path, or a URL beginning with
            ``http://`` / ``https://`` (scheme matched case-insensitively).
        timeout: Seconds to wait on the remote server before giving up.
            Only used for URL inputs.

    Returns:
        A ``(audio, sr)`` tuple: the samples cast to ``np.float32`` and the
        sample rate returned by ``librosa.load``, which is asked to load at
        the module-level ``sample_rate``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if file_path.lower().startswith(("http://", "https://")):
        print(f"🌐 Downloading audio from URL: {file_path}")
        # requests waits indefinitely without a timeout, so one stalled
        # server would hang the whole batch.
        response = requests.get(file_path, timeout=timeout)
        response.raise_for_status()
        # Decode from memory; the download is never written to disk.
        source = io.BytesIO(response.content)
    else:
        source = file_path

    audio, sr = librosa.load(source, sr=sample_rate)
    return audio.astype(np.float32), sr

 # ---------------- Encode Function ----------------
 def encode_audio_file(item: dict):
    """Encode one audio item into per-layer token lists using the MLX codec.

    Args:
        item: Mapping with a required ``"file"`` entry (passed to
            ``load_audio``) and optional ``"text"`` (default ``""``) and
            ``"speaker"`` (default ``"unknown"``) entries.

    Returns:
        A dict with, in order: ``"file"``, one ``"nano_layer_<i>"`` list of
        ints per layer returned by the model (numbered from 1),
        ``"encoded_len"``, ``"text"`` and ``"speaker"``.

    Raises:
        KeyError: If ``item`` has no ``"file"`` entry.
    """
    file_path = item["file"]
    text = item.get("text", "")
    speaker = item.get("speaker", "unknown")

    print(f"\n🎧 Encoding: {file_path}")
    # The returned sample rate is not needed here.
    audio, _ = load_audio(file_path)

    # Add leading batch and channel axes -> [B, C, T].
    audio_mlx = mx.array(audio, dtype=mx.float32)[None, None, :]
    audio_len = mx.array([audio_mlx.shape[-1]], dtype=mx.int32)

    # Encode
    tokens, tokens_len = model.encode(audio_mlx, audio_len)

    # Drop the batch axis and convert to plain Python ints so the result
    # can be written with json.dump.
    layers = np.array(tokens).squeeze(0).astype(int).tolist()
    encoded_len = int(np.array(tokens_len)[0])

    # Emit one key per layer the model actually returned instead of
    # hard-coding four: identical output for four layers, no IndexError
    # for fewer, and nothing silently dropped for more.
    data = {"file": file_path}
    for index, layer in enumerate(layers, start=1):
        data[f"nano_layer_{index}"] = layer
    data["encoded_len"] = encoded_len
    data["text"] = text
    data["speaker"] = speaker

    print(f"✅ Encoded {file_path} ({encoded_len} frames)")
    return data

 # ---------------- Encode All Files ----------------
 # Encode every input; a failing item is reported and skipped so the rest
 # of the batch still runs.
 results = []
 for item in audio_inputs:
    try:
        results.append(encode_audio_file(item))
    except Exception as e:
        # .get() keeps the handler itself from raising KeyError when the
        # failure was a missing "file" entry.
        print(f"❌ Error encoding {item.get('file', '<missing file>')}: {e}")

 # ---------------- Save to JSON ----------------
 # ensure_ascii=False writes non-ASCII text verbatim, so the file encoding
 # is pinned to UTF-8 instead of depending on the platform default.
 with open(output_json, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

 print(f"\n💾 Encoded data saved to: {output_json}")

 # ---------------- Print Summary ----------------
 print("\n✅ Encoding completed:")
 for r in results:
    print(f"File: {r['file']}, Frames: {r['encoded_len']}, Speaker: {r['speaker']}")
	import librosa
	import json
	import warnings
	import requests
	import io
	import numpy as np
	import mlx.core as mx
	import soundfile as sf
	from nanocodec_mlx.models.audio_codec import AudioCodecModel

	# Silence UserWarning messages emitted from this point on.
	warnings.filterwarnings(action="ignore", category=UserWarning)

	# ---------------- Input ----------------
	# Destination file for the encoded records.
	output_json = "mlx_encode_out.json"

	# One entry per clip: "file" is a local path or an http(s) URL; "text" and
	# "speaker" are optional metadata carried through to the output records.
	audio_inputs = [
	    dict(file="files/input1.wav", text="text example.", speaker="speaker1"),
	    dict(file="files/input2.wav", text="text example.", speaker="speaker2"),
	    # Remote files are declared the same way, e.g.:
	    # dict(file="https://......wav", text="text", speaker="speaker3"),
	]

	# ---------------- Load Model ----------------
	print("🔄 Loading MLX codec model...")
	model = AudioCodecModel.from_pretrained("nineninesix/nemo-nano-codec-22khz-0.6kbps-12.5fps-MLX")
	sample_rate = 22050  # sample rate (Hz) the model expects
	print(f"✅ Model loaded (sample rate: {sample_rate}Hz)")

	# ---------------- Helper Function ----------------
	def load_audio(file_path: str, *, timeout: float = 30.0):
	    """Load audio from a local path or an HTTP(S) URL as float32 samples.

	    Args:
	        file_path: Local filesystem path, or a URL beginning with
	            ``http://`` / ``https://`` (scheme matched case-insensitively).
	        timeout: Seconds to wait on the remote server before giving up.
	            Only used for URL inputs.

	    Returns:
	        A ``(audio, sr)`` tuple: the samples cast to ``np.float32`` and the
	        sample rate returned by ``librosa.load``, which is asked to load at
	        the module-level ``sample_rate``.

	    Raises:
	        requests.RequestException: If the download fails, times out, or the
	            server answers with an HTTP error status.
	    """
	    if file_path.lower().startswith(("http://", "https://")):
	        print(f"🌐 Downloading audio from URL: {file_path}")
	        # requests waits indefinitely without a timeout, so one stalled
	        # server would hang the whole batch.
	        response = requests.get(file_path, timeout=timeout)
	        response.raise_for_status()
	        # Decode from memory; the download is never written to disk.
	        source = io.BytesIO(response.content)
	    else:
	        source = file_path

	    audio, sr = librosa.load(source, sr=sample_rate)
	    return audio.astype(np.float32), sr

	# ---------------- Encode Function ----------------
	def encode_audio_file(item: dict):
	    """Encode one audio item into per-layer token lists using the MLX codec.

	    Args:
	        item: Mapping with a required ``"file"`` entry (passed to
	            ``load_audio``) and optional ``"text"`` (default ``""``) and
	            ``"speaker"`` (default ``"unknown"``) entries.

	    Returns:
	        A dict with, in order: ``"file"``, one ``"nano_layer_<i>"`` list of
	        ints per layer returned by the model (numbered from 1),
	        ``"encoded_len"``, ``"text"`` and ``"speaker"``.

	    Raises:
	        KeyError: If ``item`` has no ``"file"`` entry.
	    """
	    file_path = item["file"]
	    text = item.get("text", "")
	    speaker = item.get("speaker", "unknown")

	    print(f"\n🎧 Encoding: {file_path}")
	    # The returned sample rate is not needed here.
	    audio, _ = load_audio(file_path)

	    # Add leading batch and channel axes -> [B, C, T].
	    audio_mlx = mx.array(audio, dtype=mx.float32)[None, None, :]
	    audio_len = mx.array([audio_mlx.shape[-1]], dtype=mx.int32)

	    # Encode
	    tokens, tokens_len = model.encode(audio_mlx, audio_len)

	    # Drop the batch axis and convert to plain Python ints so the result
	    # can be written with json.dump.
	    layers = np.array(tokens).squeeze(0).astype(int).tolist()
	    encoded_len = int(np.array(tokens_len)[0])

	    # Emit one key per layer the model actually returned instead of
	    # hard-coding four: identical output for four layers, no IndexError
	    # for fewer, and nothing silently dropped for more.
	    data = {"file": file_path}
	    for index, layer in enumerate(layers, start=1):
	        data[f"nano_layer_{index}"] = layer
	    data["encoded_len"] = encoded_len
	    data["text"] = text
	    data["speaker"] = speaker

	    print(f"✅ Encoded {file_path} ({encoded_len} frames)")
	    return data

	# ---------------- Encode All Files ----------------
	# Encode every input; a failing item is reported and skipped so the rest
	# of the batch still runs.
	results = []
	for item in audio_inputs:
	    try:
	        results.append(encode_audio_file(item))
	    except Exception as e:
	        # .get() keeps the handler itself from raising KeyError when the
	        # failure was a missing "file" entry.
	        print(f"❌ Error encoding {item.get('file', '<missing file>')}: {e}")

	# ---------------- Save to JSON ----------------
	# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding
	# is pinned to UTF-8 instead of depending on the platform default.
	with open(output_json, "w", encoding="utf-8") as f:
	    json.dump(results, f, indent=2, ensure_ascii=False)

	print(f"\n💾 Encoded data saved to: {output_json}")

	# ---------------- Print Summary ----------------
	print("\n✅ Encoding completed:")
	for r in results:
	    print(f"File: {r['file']}, Frames: {r['encoded_len']}, Speaker: {r['speaker']}")
No results found