- Introduced a new test suite for the PlaylistService covering various functionalities, including creation, retrieval, updating, and deletion of playlists.
- Added tests for handling sounds within playlists, ensuring correct behavior when adding/removing sounds and managing current playlists.
- Refactored socket service tests for improved readability by adjusting function signatures.
- Cleaned up unnecessary whitespace in sound normalizer and sound scanner tests for consistency.
- Enhanced audio utility tests to ensure accurate hash and size calculations, including edge cases for nonexistent files.
- Removed redundant blank lines in cookie utility tests for cleaner code.
521 lines
18 KiB
Python
521 lines
18 KiB
Python
"""Extraction service for audio extraction from external services using yt-dlp."""
|
|
|
|
import asyncio
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import TypedDict
|
|
|
|
import yt_dlp
|
|
from sqlmodel.ext.asyncio.session import AsyncSession
|
|
|
|
from app.core.config import settings
|
|
from app.core.logging import get_logger
|
|
from app.models.extraction import Extraction
|
|
from app.models.sound import Sound
|
|
from app.repositories.extraction import ExtractionRepository
|
|
from app.repositories.sound import SoundRepository
|
|
from app.services.playlist import PlaylistService
|
|
from app.services.sound_normalizer import SoundNormalizerService
|
|
from app.utils.audio import get_audio_duration, get_file_hash, get_file_size
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class ExtractionInfo(TypedDict):
|
|
"""Type definition for extraction information."""
|
|
|
|
id: int
|
|
url: str
|
|
service: str | None
|
|
service_id: str | None
|
|
title: str | None
|
|
status: str
|
|
error: str | None
|
|
sound_id: int | None
|
|
|
|
|
|
class ExtractionService:
|
|
"""Service for extracting audio from external services using yt-dlp."""
|
|
|
|
def __init__(self, session: AsyncSession) -> None:
    """Bind the service to a database session and prepare working directories.

    Args:
        session: Async session handed to every repository and to the
            playlist service created here.
    """
    self.session = session

    # All collaborators are built on the same session.
    self.extraction_repo = ExtractionRepository(session)
    self.sound_repo = SoundRepository(session)
    self.playlist_service = PlaylistService(session)

    # Create the temp / audio / thumbnail directories eagerly.
    self._ensure_directories()
|
|
|
|
def _ensure_directories(self) -> None:
    """Create the temp, extracted-audio and thumbnail directories if missing."""
    required_dirs = (
        settings.EXTRACTION_TEMP_DIR,
        "sounds/originals/extracted",
        settings.EXTRACTION_THUMBNAILS_DIR,
    )

    for target in required_dirs:
        # parents=True / exist_ok=True make this safe to call repeatedly.
        Path(target).mkdir(parents=True, exist_ok=True)
        logger.debug("Ensured directory exists: %s", target)
|
|
|
|
async def create_extraction(self, url: str, user_id: int) -> ExtractionInfo:
    """Store a new pending extraction job and return its snapshot.

    Args:
        url: URL of the media to extract.
        user_id: Identifier of the user requesting the extraction.

    Returns:
        The stored extraction's fields as an ``ExtractionInfo`` dict.

    Raises:
        Exception: Any error raised while creating the record is logged and
            re-raised unchanged.
    """
    logger.info("Creating extraction for URL: %s (user: %d)", url, user_id)

    # service / service_id / title are left empty on purpose: they are
    # detected later, during processing, so creation stays fast.
    pending_fields = dict(
        url=url,
        user_id=user_id,
        service=None,
        service_id=None,
        title=None,
        status="pending",
    )

    try:
        record = await self.extraction_repo.create(pending_fields)
        logger.info("Created extraction with ID: %d", record.id)

        return ExtractionInfo(
            id=record.id or 0,  # Should never be None for created extraction
            url=record.url,
            service=record.service,
            service_id=record.service_id,
            title=record.title,
            status=record.status,
            error=record.error,
            sound_id=record.sound_id,
        )
    except Exception:
        logger.exception("Failed to create extraction for URL: %s", url)
        raise
|
|
|
|
async def _detect_service_info(self, url: str) -> dict[str, str | None] | None:
    """Look up service name, media id and title for a URL via yt-dlp.

    Args:
        url: URL to inspect.

    Returns:
        A dict with ``service``, ``service_id`` and ``title`` keys, or
        ``None`` when yt-dlp returns nothing or raises (the error is logged).
    """
    # Metadata lookup only; nothing is downloaded.
    probe_options = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": False,
    }

    def _probe() -> dict | None:
        with yt_dlp.YoutubeDL(probe_options) as ydl:
            return ydl.extract_info(url, download=False)

    try:
        # yt-dlp is blocking, so it runs in a worker thread.
        metadata = await asyncio.to_thread(_probe)

        if metadata:
            return {
                "service": metadata.get("extractor", ""),
                "service_id": str(metadata.get("id", "")),
                "title": metadata.get("title"),
            }
        return None

    except Exception:
        logger.exception("Failed to detect service info for URL: %s", url)
        return None
|
|
|
|
async def process_extraction(self, extraction_id: int) -> ExtractionInfo:
    """Run a pending extraction from start to finish and report the outcome.

    The job is marked "processing", service info is detected when missing
    (a duplicate for the same service/id is rejected), media is downloaded,
    files are moved, a Sound record is created, normalized and added to the
    user's main playlist, and the job is finally marked "completed".

    Args:
        extraction_id: Identifier of the extraction to process.

    Returns:
        An ``ExtractionInfo`` dict with status "completed" or "failed". Any
        exception raised once processing has started is logged, stored on
        the extraction and reported through the ``error`` field instead of
        being propagated.

    Raises:
        ValueError: If the extraction does not exist or is not pending.
            Errors raised while recording a failure are not caught here.
    """
    job = await self.extraction_repo.get_by_id(extraction_id)
    if not job:
        raise ValueError(f"Extraction {extraction_id} not found")
    if job.status != "pending":
        raise ValueError(f"Extraction {extraction_id} is not pending")

    # Copy the needed values off the record early to avoid session
    # detachment issues later on.
    owner_id = job.user_id
    source_url = job.url
    service = job.service
    service_id = job.service_id
    title = job.title

    def _snapshot(
        status: str, error: str | None, linked_sound_id: int | None
    ) -> ExtractionInfo:
        # Reads service / service_id / title as they are at call time, so
        # values detected during processing are reflected in the result.
        return {
            "id": extraction_id,
            "url": source_url,
            "service": service,
            "service_id": service_id,
            "title": title,
            "status": status,
            "error": error,
            "sound_id": linked_sound_id,
        }

    logger.info("Processing extraction %d: %s", extraction_id, source_url)

    try:
        await self.extraction_repo.update(job, {"status": "processing"})

        # Detect service info only when the record does not carry it yet.
        if not (service and service_id):
            logger.info("Detecting service info for extraction %d", extraction_id)
            detected = await self._detect_service_info(source_url)
            if not detected:
                raise ValueError("Unable to detect service information from URL")

            found_service = detected["service"]
            found_service_id = detected["service_id"]

            # Refuse to extract the same service item twice.
            duplicate = await self.extraction_repo.get_by_service_and_id(
                found_service, found_service_id
            )
            if duplicate and duplicate.id != extraction_id:
                error_msg = (
                    f"Extraction already exists for {found_service}:{found_service_id}"
                )
                logger.warning(error_msg)
                raise ValueError(error_msg)

            resolved_title = detected.get("title") or title
            await self.extraction_repo.update(
                job,
                {
                    "service": found_service,
                    "service_id": found_service_id,
                    "title": resolved_title,
                },
            )

            # Carry the detected values through the rest of the pipeline.
            service = found_service
            service_id = found_service_id
            title = resolved_title

        audio_file, thumbnail_file = await self._extract_media(
            extraction_id, source_url
        )

        # Only the audio path is used below; the thumbnail is just moved.
        final_audio_path, _final_thumbnail_path = (
            await self._move_files_to_final_location(
                audio_file,
                thumbnail_file,
                title,
                service,
                service_id,
            )
        )

        sound = await self._create_sound_record(
            final_audio_path,
            title,
            service,
            service_id,
        )

        # Keep the id in a local early to avoid session detachment issues.
        sound_id = sound.id

        await self._normalize_sound(sound)
        await self._add_to_main_playlist(sound, owner_id)

        await self.extraction_repo.update(
            job,
            {
                "status": "completed",
                "sound_id": sound_id,
                "error": None,
            },
        )
        logger.info("Successfully processed extraction %d", extraction_id)

        return _snapshot("completed", None, sound_id)

    except Exception as e:
        error_msg = str(e)
        logger.exception(
            "Failed to process extraction %d: %s", extraction_id, error_msg
        )

        # Persist the failure so the job is not left in "processing".
        await self.extraction_repo.update(
            job,
            {
                "status": "failed",
                "error": error_msg,
            },
        )

        return _snapshot("failed", error_msg, None)
|
|
|
|
async def _extract_media(
    self, extraction_id: int, extraction_url: str
) -> tuple[Path, Path | None]:
    """Download audio (and a thumbnail, when available) with yt-dlp.

    Args:
        extraction_id: Used to build a per-job filename prefix in the temp
            directory.
        extraction_url: URL handed to yt-dlp.

    Returns:
        ``(audio_file, thumbnail_file)``; the thumbnail is ``None`` when no
        .webp/.jpg/.png file with the job prefix was found.

    Raises:
        RuntimeError: If the download fails or no audio file with the
            configured format is found; the original error is chained.
    """
    staging_dir = Path(settings.EXTRACTION_TEMP_DIR)

    # Every file produced for this job shares this prefix.
    file_prefix = f"extraction_{extraction_id}_"
    audio_format = settings.EXTRACTION_AUDIO_FORMAT
    audio_bitrate = settings.EXTRACTION_AUDIO_BITRATE

    download_options = {
        "format": "bestaudio/best",
        "outtmpl": str(staging_dir / f"{file_prefix}%(title)s.%(ext)s"),
        "extractaudio": True,
        "audioformat": audio_format,
        "audioquality": audio_bitrate,
        "writethumbnail": True,
        "writeinfojson": False,
        "writeautomaticsub": False,
        "writesubtitles": False,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": audio_format,
                # The bitrate setting may carry a trailing "k"; it is
                # removed for this option.
                "preferredquality": audio_bitrate.rstrip("k"),
            },
        ],
    }

    def _run_download() -> None:
        with yt_dlp.YoutubeDL(download_options) as ydl:
            ydl.download([extraction_url])

    try:
        # The download is blocking, so it runs in a worker thread.
        await asyncio.to_thread(_run_download)

        # Locate what yt-dlp wrote for this job.
        audio_matches = list(staging_dir.glob(f"{file_prefix}*.{audio_format}"))
        thumbnail_matches = [
            match
            for image_ext in ("webp", "jpg", "png")
            for match in staging_dir.glob(f"{file_prefix}*.{image_ext}")
        ]

        if not audio_matches:
            raise RuntimeError("No audio file was created during extraction")

        audio_file = audio_matches[0]
        thumbnail_file = thumbnail_matches[0] if thumbnail_matches else None

        logger.info(
            "Extracted audio: %s, thumbnail: %s",
            audio_file,
            thumbnail_file or "None",
        )

        return audio_file, thumbnail_file

    except Exception as e:
        logger.exception("yt-dlp extraction failed for %s", extraction_url)
        raise RuntimeError(f"Audio extraction failed: {e}") from e
|
|
|
|
async def _move_files_to_final_location(
    self,
    audio_file: Path,
    thumbnail_file: Path | None,
    title: str | None,
    service: str | None,
    service_id: str | None,
) -> tuple[Path, Path | None]:
    """Move the downloaded audio and thumbnail out of the temp directory.

    Args:
        audio_file: Downloaded audio file to move.
        thumbnail_file: Downloaded thumbnail, or ``None`` if there is none.
        title: Preferred base name for the files.
        service: Service name, used for the fallback base name.
        service_id: Service media id, used for the fallback base name.

    Returns:
        ``(final_audio_path, final_thumbnail_path)``; the second item is
        ``None`` when no thumbnail was given.
    """
    # Fall back to "<service>_<service_id>" when there is no title.
    fallback_name = f"{service or 'unknown'}_{service_id or 'unknown'}"
    base_name = self._sanitize_filename(title or fallback_name)

    audio_target = self._ensure_unique_filename(
        Path("sounds/originals/extracted")
        / f"{base_name}.{settings.EXTRACTION_AUDIO_FORMAT}"
    )
    shutil.move(str(audio_file), str(audio_target))
    logger.info("Moved audio file to: %s", audio_target)

    if not thumbnail_file:
        return audio_target, None

    # The thumbnail keeps whatever extension it was downloaded with.
    thumbnail_target = self._ensure_unique_filename(
        Path(settings.EXTRACTION_THUMBNAILS_DIR)
        / f"{base_name}{thumbnail_file.suffix}"
    )
    shutil.move(str(thumbnail_file), str(thumbnail_target))
    logger.info("Moved thumbnail file to: %s", thumbnail_target)

    return audio_target, thumbnail_target
|
|
|
|
def _sanitize_filename(self, filename: str) -> str:
|
|
"""Sanitize filename for filesystem."""
|
|
# Remove or replace problematic characters
|
|
invalid_chars = '<>:"/\\|?*'
|
|
for char in invalid_chars:
|
|
filename = filename.replace(char, "_")
|
|
|
|
# Limit length and remove leading/trailing spaces
|
|
filename = filename.strip()[:100]
|
|
|
|
return filename or "untitled"
|
|
|
|
def _ensure_unique_filename(self, filepath: Path) -> Path:
|
|
"""Ensure filename is unique by adding counter if needed."""
|
|
if not filepath.exists():
|
|
return filepath
|
|
|
|
stem = filepath.stem
|
|
suffix = filepath.suffix
|
|
parent = filepath.parent
|
|
counter = 1
|
|
|
|
while True:
|
|
new_path = parent / f"{stem}_{counter}{suffix}"
|
|
if not new_path.exists():
|
|
return new_path
|
|
counter += 1
|
|
|
|
async def _create_sound_record(
    self,
    audio_path: Path,
    title: str | None,
    service: str | None,
    service_id: str | None,
) -> Sound:
    """Create the Sound record describing an extracted audio file.

    Args:
        audio_path: Final location of the audio file.
        title: Name for the sound; when empty, "<service>_<service_id>" is
            used with "unknown" for missing parts.
        service: Service name, used for the fallback name.
        service_id: Service media id, used for the fallback name.

    Returns:
        The Sound returned by the sound repository.
    """
    # File metadata is read before the record is built.
    duration = get_audio_duration(audio_path)
    size = get_file_size(audio_path)
    file_hash = get_file_hash(audio_path)

    display_name = title or f"{service or 'unknown'}_{service_id or 'unknown'}"

    sound = await self.sound_repo.create(
        {
            "type": "EXT",
            "name": display_name,
            "filename": audio_path.name,
            "duration": duration,
            "size": size,
            "hash": file_hash,
            "is_deletable": True,  # Extracted sounds can be deleted
            "is_music": True,  # Assume extracted content is music
            "is_normalized": False,
            "play_count": 0,
        }
    )
    logger.info("Created sound record with ID: %d", sound.id)

    return sound
|
|
|
|
async def _normalize_sound(self, sound: Sound) -> None:
    """Normalize an extracted sound on a best-effort basis.

    Failures, whether reported in the normalizer result or raised as
    exceptions, are logged and never propagated.

    Args:
        sound: The sound to normalize.
    """
    try:
        outcome = await SoundNormalizerService(self.session).normalize_sound(sound)

        if outcome["status"] != "error":
            logger.info("Successfully normalized sound %d", sound.id)
        else:
            logger.warning(
                "Failed to normalize sound %d: %s",
                sound.id,
                outcome.get("error"),
            )

    except Exception as e:
        # Don't fail the extraction if normalization fails
        logger.exception("Error normalizing sound %d: %s", sound.id, e)
|
|
|
|
async def _add_to_main_playlist(self, sound: Sound, user_id: int) -> None:
    """Add a sound to the user's main playlist on a best-effort basis.

    Any exception is logged and swallowed.

    Args:
        sound: The sound to add.
        user_id: Identifier of the user, passed to the playlist service.
    """
    try:
        await self.playlist_service.add_sound_to_main_playlist(sound.id, user_id)
        logger.info(
            "Added sound %d to main playlist for user %d", sound.id, user_id
        )
    except Exception:
        # Don't fail the extraction if playlist addition fails
        logger.exception(
            "Error adding sound %d to main playlist for user %d", sound.id, user_id
        )
|
|
|
|
async def get_extraction_by_id(self, extraction_id: int) -> ExtractionInfo | None:
    """Fetch one extraction and return it as an ``ExtractionInfo`` dict.

    Args:
        extraction_id: Identifier of the extraction to look up.

    Returns:
        The extraction's fields, or ``None`` when the repository finds no
        such extraction.
    """
    record = await self.extraction_repo.get_by_id(extraction_id)

    if record:
        return ExtractionInfo(
            id=record.id or 0,  # Should never be None for existing extraction
            url=record.url,
            service=record.service,
            service_id=record.service_id,
            title=record.title,
            status=record.status,
            error=record.error,
            sound_id=record.sound_id,
        )
    return None
|
|
|
|
async def get_user_extractions(self, user_id: int) -> list[ExtractionInfo]:
    """Return every extraction the repository holds for a user.

    Args:
        user_id: Identifier of the user.

    Returns:
        One ``ExtractionInfo`` dict per extraction, in repository order.
    """
    rows = await self.extraction_repo.get_by_user(user_id)

    return [
        ExtractionInfo(
            id=row.id or 0,  # Should never be None for existing extraction
            url=row.url,
            service=row.service,
            service_id=row.service_id,
            title=row.title,
            status=row.status,
            error=row.error,
            sound_id=row.sound_id,
        )
        for row in rows
    ]
|
|
|
|
async def get_pending_extractions(self) -> list[ExtractionInfo]:
    """Return every extraction the repository reports as pending.

    Returns:
        One ``ExtractionInfo`` dict per extraction, in repository order.
    """
    rows = await self.extraction_repo.get_pending_extractions()

    return [
        ExtractionInfo(
            id=row.id or 0,  # Should never be None for existing extraction
            url=row.url,
            service=row.service,
            service_id=row.service_id,
            title=row.title,
            status=row.status,
            error=row.error,
            sound_id=row.sound_id,
        )
        for row in rows
    ]
|