Skip to content

Instantly share code, notes, and snippets.

@cometothed4rkside
Created June 2, 2025 15:27
Show Gist options
  • Select an option

  • Save cometothed4rkside/8cd391bd3dbca15fc1699635bca32bed to your computer and use it in GitHub Desktop.

Select an option

Save cometothed4rkside/8cd391bd3dbca15fc1699635bca32bed to your computer and use it in GitHub Desktop.
Youtube to data
from yt_dlp import YoutubeDL
import time
def get_video_urls(channel_url):
    """Return the video URLs listed on the page at ``channel_url``.

    The listing is read with yt-dlp in flat-extraction mode
    (``extract_flat``) and ``download=False``, so nothing is downloaded.

    Args:
        channel_url: URL passed to ``YoutubeDL.extract_info``.

    Returns:
        A list of URL strings. The list is empty when ``channel_url`` is
        empty, when extraction raises, or when the result has no usable
        entries.
    """
    # Nothing to look up; avoid creating the extractor at all.
    if not channel_url:
        return []

    ydl_opts = {
        'extract_flat': True,
        'quiet': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        try:
            result = ydl.extract_info(channel_url, download=False)
            entries = (result or {}).get('entries') or []
            # An entry that is None or has no 'url' would otherwise raise
            # here and make the except branch discard the whole listing.
            return [
                entry['url']
                for entry in entries
                if entry and entry.get('url')
            ]
        except Exception as e:
            # Best-effort: report the failure and let the caller see "no videos".
            print(f"Kanal video listesi alınamadı: {str(e)}")
            return []
def download_audio(url, index, total, output_dir="downloaded_mp3s"):
    """Download one video's audio with yt-dlp and convert it to MP3.

    Args:
        url: Video URL handed to ``YoutubeDL.download``.
        index: Position of this video in the batch; used only in the
            progress messages.
        total: Number of videos in the batch; used only in the progress
            messages.
        output_dir: Folder the resulting file is written to. The default
            matches the default ``input_folder`` of
            ``transcribe_audio_files`` so the two steps work together
            without moving files by hand.

    Returns:
        True if the download and conversion finished without raising,
        False otherwise (the error is printed).
    """
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }],
        'outtmpl': '%(title)s.%(ext)s',
        # 'home' is the base directory yt-dlp resolves 'outtmpl' against.
        'paths': {'home': output_dir},
    }
    with YoutubeDL(ydl_opts) as ydl:
        try:
            print(f"\n[{index}/{total}] Video indiriliyor...")
            ydl.download([url])
            print(f"[{index}/{total}] MP3 dönüşümü tamamlandı!")
            return True
        except Exception as e:
            # Best-effort batch: one failed video must not stop the rest.
            print(f"Hata: {str(e)}")
            return False
def main(channel_url="https://www.youtube.com/@NeBuTantana/videos"):
    """List the videos at ``channel_url`` and download each one as MP3.

    Prints progress and a final summary of how many downloads succeeded.

    Args:
        channel_url: Page whose videos are enumerated. Defaults to the
            channel this script was originally hard-coded for.
    """
    print(f"Kanal: {channel_url}")
    print("Video listesi alınıyor...")

    video_urls = get_video_urls(channel_url)
    if not video_urls:
        print("Video bulunamadı!")
        return

    total_videos = len(video_urls)
    print(f"Toplam {total_videos} video bulundu")
    print("İndirme başlıyor...")

    success_count = 0
    for index, url in enumerate(video_urls, 1):
        if download_audio(url, index, total_videos):
            success_count += 1

    print(f"\nToplam {total_videos} videodan {success_count} tanesi başarıyla indirildi!")


if __name__ == "__main__":
    import sys

    # Optional first CLI argument overrides the default channel URL;
    # with no argument this is exactly main().
    main(*sys.argv[1:2])
from faster_whisper import WhisperModel
from pathlib import Path
def transcribe_audio_files(
    input_folder="downloaded_mp3s",
    output_file="transcripts.txt",
    model_size="large-v3",
    device="cuda",
    compute_type="float16",
):
    """Transcribe every ``*.mp3`` in a folder into one text file.

    For each file a header (file name, detected language and its
    probability) is written, followed by one line per segment in the form
    ``[start -> end] text``. A failure on one file is printed and recorded
    in the output file, and processing continues with the next file.

    Args:
        input_folder: Folder searched (non-recursively) for ``*.mp3`` files.
        output_file: Path of the transcript file; it is overwritten.
        model_size: Model name passed to ``WhisperModel``.
        device: Device passed to ``WhisperModel``.
        compute_type: Compute type passed to ``WhisperModel``.

    Returns:
        None. When no MP3 file is found the function returns before the
        model is loaded and without creating ``output_file``.
    """
    # Look for input first: loading the model is the expensive step and is
    # pointless when there is nothing to transcribe.
    input_dir = Path(input_folder)
    # Sorted so the transcript order does not depend on directory listing order.
    mp3_files = sorted(input_dir.glob("*.mp3"))

    if not mp3_files:
        print("MP3 dosyası bulunamadı!")
        return

    print(f"Toplam {len(mp3_files)} MP3 dosyası bulundu")

    # Load the model with the requested device settings.
    print("Model yükleniyor...")
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # Create the transcript file.
    with open(output_file, "w", encoding="utf-8") as f:
        for i, mp3_file in enumerate(mp3_files, 1):
            print(f"\n[{i}/{len(mp3_files)}] İşleniyor: {mp3_file.name}")

            try:
                # Start transcription.
                segments, info = model.transcribe(
                    str(mp3_file),
                    language="tr",
                    beam_size=5,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=500),
                )

                # Write the file name and language header.
                f.write(f"\n{'='*50}\n")
                f.write(f"Dosya: {mp3_file.name}\n")
                f.write(f"Dil: {info.language} (Olasılık: {info.language_probability:.2f})\n")
                f.write(f"{'='*50}\n\n")

                # Write the segments in order.
                # NOTE(review): faster-whisper documents `segments` as a lazy
                # generator, so decoding errors would surface in this loop,
                # after the header is already written — confirm.
                for segment in segments:
                    f.write(f"[{segment.start:.1f}s -> {segment.end:.1f}s] {segment.text}\n")

                print(f"✓ Tamamlandı: {mp3_file.name}")

            except Exception as e:
                # Best-effort batch: record the failure and move on.
                print(f"Hata - {mp3_file.name}: {str(e)}")
                f.write(f"\nHATA - {mp3_file.name}: {str(e)}\n")

            # Push finished work to disk so an interrupted run keeps it.
            f.flush()

    print(f"\nTüm transcriptler '{output_file}' dosyasına kaydedildi!")


if __name__ == "__main__":
    transcribe_audio_files()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment