Created
March 27, 2025 05:32
-
-
Save muchanem/f46e498a541f05c7705acdb4518c71fe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.13" | |
| # dependencies = [ | |
| # "audioop-lts", | |
| # "llm", | |
| # "llm-gemini", | |
| # "pyaudio", | |
| # "pydub", | |
| # ] | |
| # /// | |
| import llm | |
| from pydub import AudioSegment | |
| import io | |
| import json | |
def split_audio_into_chunks(audio_path, chunk_length_minutes=5):
    """
    Split an m4a audio file into N-minute chunks, each returned as MP3 bytes.

    Args:
        audio_path: Path to the source audio file; it is decoded as "m4a".
        chunk_length_minutes: Length of each chunk in minutes. Fractional
            values (e.g. 0.5) are accepted and truncated to whole
            milliseconds.

    Returns:
        A list of ``bytes`` objects, one MP3-encoded chunk per entry, in
        playback order. The last chunk may be shorter than the others, and
        the list is empty when the audio has zero length.

    Raises:
        ValueError: If ``chunk_length_minutes`` does not amount to at least
            one millisecond (zero, negative, or vanishingly small).
    """
    # range() needs a positive integer step, so normalise to whole
    # milliseconds and reject unusable values *before* paying the cost of
    # decoding the audio file.
    chunk_length_ms = int(chunk_length_minutes * 60 * 1000)
    if chunk_length_ms <= 0:
        raise ValueError(
            f"chunk_length_minutes must be positive, got {chunk_length_minutes!r}"
        )

    # Load the audio file
    audio = AudioSegment.from_file(audio_path, format="m4a")
    total_length_ms = len(audio)

    binary_chunks = []
    for start_ms in range(0, total_length_ms, chunk_length_ms):
        # Clamp the final chunk to the end of the recording.
        end_ms = min(start_ms + chunk_length_ms, total_length_ms)
        chunk = audio[start_ms:end_ms]

        # Encode the chunk to MP3 in memory rather than via a temp file.
        buffer = io.BytesIO()
        chunk.export(buffer, format="mp3")
        binary_chunks.append(buffer.getvalue())

    return binary_chunks
# Example usage: transcribe "31.m4a" chunk by chunk and write the combined,
# time-corrected transcript to "31.json".
audio_path = "31.m4a"
model = llm.get_model("gemini-2.0-flash")
schema = llm.schema_dsl(
    "timestamp str: mm:ss,text,speaker: SPK_0 or SPK_1,language: two letter code",
    multi=True,
)
prompt = "transcribe"
chunk_length_minutes = 5

# Offset arithmetic below is done in whole milliseconds; this value is the
# same for every chunk, so compute it once.
chunk_length_ms = int(chunk_length_minutes * 60 * 1000)


def _offset_timestamp(timestamp, offset_seconds):
    """
    Shift a chunk-relative timestamp by ``offset_seconds``.

    Args:
        timestamp: Colon-separated base-60 fields as returned by the model,
            e.g. "mm:ss". Extra leading fields such as "hh:mm:ss" are folded
            in as well, so a model that ignores the requested format does
            not abort the run.
        offset_seconds: Whole seconds to add (start of the chunk within the
            full recording).

    Returns:
        The shifted time formatted as "mm:ss"; minutes are not wrapped at 60.

    Raises:
        ValueError: If any field is not an integer.
    """
    seconds = 0
    for field in timestamp.strip().split(":"):
        seconds = seconds * 60 + int(field)
    minutes, seconds = divmod(seconds + offset_seconds, 60)
    return f"{minutes:02d}:{seconds:02d}"


chunks = split_audio_into_chunks(audio_path, chunk_length_minutes=chunk_length_minutes)
output_data = []
for i, chunk in enumerate(chunks):
    # Start of this chunk within the full recording, in whole seconds.
    chunk_start_seconds = (i * chunk_length_ms) // 1000
    try:
        attachment = llm.Attachment(type="audio/mp3", content=chunk)
        response = model.prompt(prompt, attachments=[attachment], schema=schema)
        # Parse JSON for the current chunk
        response_json = json.loads(response.text())["items"]
        # The model reports times relative to the chunk it was given;
        # rebase them onto the full recording.
        for item in response_json:
            item["timestamp"] = _offset_timestamp(item["timestamp"], chunk_start_seconds)
    except Exception as e:
        # Top-level boundary: stop at the first failing chunk but still
        # write out everything transcribed so far.
        print(f"Error: {e}")
        break
    # Only reached when the whole chunk was processed successfully.
    output_data.extend(response_json)

# Finally, write out to JSON
with open("31.json", "w", encoding="utf-8") as file:
    json.dump(output_data, file, indent=2, ensure_ascii=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment