study-notes-api / src /audio /downloader.py
ALI7ADEL's picture
Upload 51 files
ed147e2 verified
"""
YouTube video downloader and audio extraction module.
Uses yt-dlp for robust YouTube video handling.
"""
import re
from pathlib import Path
from typing import Dict, Optional
import yt_dlp
from src.utils.logger import setup_logger
from src.utils.config import settings
logger = setup_logger(__name__)
class YouTubeDownloader:
"""Handles YouTube video downloading and audio extraction."""
def __init__(self, output_dir: Optional[Path] = None):
"""
Initialize the YouTube downloader.
Args:
output_dir: Directory to save downloaded audio files
"""
self.output_dir = output_dir or settings.temp_dir
self.output_dir.mkdir(parents=True, exist_ok=True)
@staticmethod
def is_valid_youtube_url(url: str) -> bool:
"""
Validate if the URL is a valid YouTube link.
Args:
url: YouTube URL to validate
Returns:
True if valid YouTube URL, False otherwise
"""
youtube_regex = (
r'(https?://)?(www\.)?'
r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
)
match = re.match(youtube_regex, url)
return bool(match)
def get_video_info(self, url: str) -> Dict[str, any]:
"""
Get video information without downloading.
Args:
url: YouTube video URL
Returns:
Dictionary containing video metadata
Raises:
ValueError: If URL is invalid or video is unavailable
"""
if not self.is_valid_youtube_url(url):
raise ValueError(f"Invalid YouTube URL: {url}")
ydl_opts = {
'quiet': True,
'no_warnings': True,
}
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=False)
return {
'title': info.get('title', 'Unknown'),
'duration': info.get('duration', 0),
'uploader': info.get('uploader', 'Unknown'),
'description': info.get('description', ''),
'thumbnail': info.get('thumbnail', ''),
'upload_date': info.get('upload_date', ''),
}
except Exception as e:
logger.error(f"Failed to get video info: {e}")
raise ValueError(f"Could not access video: {str(e)}")
def download_audio(self, url: str, video_id: Optional[str] = None) -> Path:
"""
Download YouTube video and extract audio.
Args:
url: YouTube video URL
video_id: Optional custom identifier for the output file
Returns:
Path to the downloaded audio file
Raises:
ValueError: If URL is invalid or download fails
RuntimeError: If video exceeds maximum duration
"""
if not self.is_valid_youtube_url(url):
raise ValueError(f"Invalid YouTube URL: {url}")
# Get video info to check duration
info = self.get_video_info(url)
duration = info['duration']
if duration > settings.max_video_duration:
raise RuntimeError(
f"Video duration ({duration}s) exceeds maximum allowed "
f"({settings.max_video_duration}s)"
)
# Generate output filename
if video_id:
output_template = str(self.output_dir / f"{video_id}.%(ext)s")
else:
output_template = str(self.output_dir / "%(id)s.%(ext)s")
# yt-dlp options for audio extraction
ydl_opts = {
'format': 'bestaudio/best',
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'wav',
'preferredquality': '192',
}],
'outtmpl': output_template,
'quiet': False,
'no_warnings': False,
'extract_flat': False,
}
try:
logger.info(f"Downloading audio from: {url}")
logger.info(f"Video title: {info['title']}")
logger.info(f"Duration: {duration}s ({duration/60:.1f} minutes)")
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
result = ydl.extract_info(url, download=True)
# Get the output filename
if video_id:
audio_file = self.output_dir / f"{video_id}.wav"
else:
audio_file = self.output_dir / f"{result['id']}.wav"
if not audio_file.exists():
raise RuntimeError("Audio file was not created")
logger.info(f"Audio downloaded successfully: {audio_file}")
return audio_file
except Exception as e:
logger.error(f"Failed to download audio: {e}")
raise ValueError(f"Download failed: {str(e)}")
def cleanup(self, file_path: Path) -> None:
"""
Remove downloaded audio file.
Args:
file_path: Path to the file to remove
"""
try:
if file_path.exists():
file_path.unlink()
logger.info(f"Cleaned up file: {file_path}")
except Exception as e:
logger.warning(f"Failed to cleanup file {file_path}: {e}")