Created
June 2, 2025 15:27
-
-
Save cometothed4rkside/8cd391bd3dbca15fc1699635bca32bed to your computer and use it in GitHub Desktop.
Youtube to data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from yt_dlp import YoutubeDL | |
| import time | |
def get_video_urls(channel_url):
    """Return the video URLs listed on a channel/playlist page.

    The page is queried with ``extract_flat`` so only the listing is fetched;
    no per-video extraction or download happens here.

    Args:
        channel_url: URL of the channel tab or playlist to enumerate.

    Returns:
        A list of URL strings, in listing order. An empty list is returned
        when extraction fails (the error is printed), when nothing is
        returned, or when the result has no usable entries.
    """
    ydl_opts = {
        'extract_flat': True,
        'quiet': True,
    }

    with YoutubeDL(ydl_opts) as ydl:
        try:
            result = ydl.extract_info(channel_url, download=False)
            if not result:
                return []

            urls = []
            for video in result.get('entries') or []:
                # Skip entries that carry no URL (e.g. a None placeholder)
                # instead of letting one bad entry discard the whole listing.
                if not video:
                    continue
                url = video.get('url')
                if url:
                    urls.append(url)
            return urls
        except Exception as e:
            # Best-effort boundary: report the failure and return "nothing found".
            print(f"Kanal video listesi alınamadı: {str(e)}")
            return []
def download_audio(url, index, total, output_dir=None):
    """Download one video's audio track and convert it to MP3.

    Args:
        url: URL of the video to download.
        index: 1-based position of this video, used only in progress messages.
        total: Total number of videos, used only in progress messages.
        output_dir: Optional directory for the resulting files. When ``None``
            (the default) no path option is set and files are written
            relative to the current working directory, exactly as before.
            Otherwise it is passed to yt-dlp as the ``'home'`` entry of its
            ``paths`` option.

    Returns:
        ``True`` if the download call finished without raising, ``False`` if
        any exception occurred (the error is printed).
    """
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }],
        'outtmpl': '%(title)s.%(ext)s',
    }
    if output_dir is not None:
        # Lets callers drop the MP3s straight into the folder a later
        # processing step reads from, without changing the file name template.
        ydl_opts['paths'] = {'home': str(output_dir)}

    with YoutubeDL(ydl_opts) as ydl:
        try:
            print(f"\n[{index}/{total}] Video indiriliyor...")
            ydl.download([url])
            print(f"[{index}/{total}] MP3 dönüşümü tamamlandı!")
            return True
        except Exception as e:
            # Best-effort: one failed video must not stop the whole batch.
            print(f"Hata: {str(e)}")
            return False
def main(channel_url="https://www.youtube.com/@NeBuTantana/videos"):
    """List a channel's videos and download each one as MP3.

    Args:
        channel_url: Channel (or playlist) page to process. Defaults to the
            channel this script was originally written for, so calling
            ``main()`` with no arguments behaves as before.

    Prints progress and a final summary of how many downloads succeeded.
    Returns ``None``.
    """
    print(f"Kanal: {channel_url}")
    print("Video listesi alınıyor...")

    video_urls = get_video_urls(channel_url)
    if not video_urls:
        print("Video bulunamadı!")
        return

    total_videos = len(video_urls)
    print(f"Toplam {total_videos} video bulundu")
    print("İndirme başlıyor...")

    success_count = 0
    for index, url in enumerate(video_urls, 1):
        if download_audio(url, index, total_videos):
            success_count += 1

    print(f"\nToplam {total_videos} videodan {success_count} tanesi başarıyla indirildi!")
# Run the downloader only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
| from faster_whisper import WhisperModel | |
| from pathlib import Path | |
def transcribe_audio_files(input_folder="downloaded_mp3s", output_file="transcripts.txt",
                           *, model_size="large-v3", device="cuda",
                           compute_type="float16", language="tr"):
    """Transcribe every ``*.mp3`` in a folder into a single text file.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.mp3`` files.
        output_file: Path of the transcript file; it is overwritten.
        model_size: Model name passed to ``WhisperModel``.
        device: Device passed to ``WhisperModel`` (default ``"cuda"``).
        compute_type: Compute type passed to ``WhisperModel``.
        language: Language code passed to ``model.transcribe``.

    Returns:
        ``None``. When no MP3 file is found, a message is printed and the
        function returns before the model is loaded or the output file is
        created.

    For each file a header (name, detected language and probability) is
    written, followed by one line per segment with its start/end times.
    A failure on one file is printed and recorded in the output, and
    processing continues with the next file.
    """
    # Look for input first so the model is not loaded when there is
    # nothing to transcribe.
    input_dir = Path(input_folder)
    # Sorted so the transcript order is stable across runs and platforms.
    mp3_files = sorted(input_dir.glob("*.mp3"))

    if not mp3_files:
        print("MP3 dosyası bulunamadı!")
        return

    total = len(mp3_files)
    print(f"Toplam {total} MP3 dosyası bulundu")

    # Model loading (device / precision are configurable via keyword args).
    print("Model yükleniyor...")
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    separator = '=' * 50

    # Create the transcript file.
    with open(output_file, "w", encoding="utf-8") as f:
        for i, mp3_file in enumerate(mp3_files, 1):
            print(f"\n[{i}/{total}] İşleniyor: {mp3_file.name}")

            try:
                # Start the transcription.
                segments, info = model.transcribe(
                    str(mp3_file),
                    language=language,
                    beam_size=5,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=500),
                )

                # Write the file name and detection info as a header.
                f.write(f"\n{separator}\n")
                f.write(f"Dosya: {mp3_file.name}\n")
                f.write(f"Dil: {info.language} (Olasılık: {info.language_probability:.2f})\n")
                f.write(f"{separator}\n\n")

                # Write the segments in order. This loop stays inside the
                # try so an error raised while iterating is recorded too.
                for segment in segments:
                    f.write(f"[{segment.start:.1f}s -> {segment.end:.1f}s] {segment.text}\n")

                print(f"✓ Tamamlandı: {mp3_file.name}")

            except Exception as e:
                # Best-effort: log the failure and move on to the next file.
                print(f"Hata - {mp3_file.name}: {str(e)}")
                f.write(f"\nHATA - {mp3_file.name}: {str(e)}\n")

    print(f"\nTüm transcriptler '{output_file}' dosyasına kaydedildi!")
# Run the transcriber with its default arguments only when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    transcribe_audio_files()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment