feat: Implement background extraction processor with concurrency control
- Added `ExtractionProcessor` class to handle extraction queue processing in the background.
- Implemented methods for starting, stopping, and queuing extractions with concurrency limits.
- Integrated logging for monitoring the processor's status and actions.
- Created tests for the extraction processor to ensure functionality and error handling.

test: Add unit tests for extraction API endpoints
- Created tests for successful extraction creation, authentication checks, and processor status retrieval.
- Ensured proper responses for authenticated and unauthenticated requests.

test: Implement unit tests for extraction repository
- Added tests for creating, retrieving, and updating extractions in the repository.
- Mocked database interactions to validate repository behavior without actual database access.

test: Add comprehensive tests for extraction service
- Developed tests for extraction creation, service detection, and sound record creation.
- Included tests for handling duplicate extractions and invalid URLs.

test: Add unit tests for extraction background processor
- Created tests for the `ExtractionProcessor` class to validate its behavior under various conditions.
- Ensured proper handling of extraction queuing, processing, and completion callbacks.

fix: Update OAuth service tests to use AsyncMock
- Modified OAuth provider tests to use `AsyncMock` for mocking asynchronous HTTP requests.
This commit is contained in:
517
app/services/extraction.py
Normal file
517
app/services/extraction.py
Normal file
@@ -0,0 +1,517 @@
|
||||
"""Extraction service for audio extraction from external services using yt-dlp."""
|
||||
|
||||
import asyncio
import shutil
from pathlib import Path
from typing import TypedDict

import yt_dlp
from sqlmodel.ext.asyncio.session import AsyncSession

from app.core.config import settings
from app.core.logging import get_logger
from app.models.extraction import Extraction
from app.models.sound import Sound
from app.repositories.extraction import ExtractionRepository
from app.repositories.sound import SoundRepository
from app.services.sound_normalizer import SoundNormalizerService
from app.utils.audio import get_audio_duration, get_file_hash, get_file_size
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ExtractionInfo(TypedDict):
|
||||
"""Type definition for extraction information."""
|
||||
|
||||
id: int
|
||||
url: str
|
||||
service: str
|
||||
service_id: str
|
||||
title: str | None
|
||||
status: str
|
||||
error: str | None
|
||||
sound_id: int | None
|
||||
|
||||
|
||||
class ExtractionService:
|
||||
"""Service for extracting audio from external services using yt-dlp."""
|
||||
|
||||
def __init__(self, session: AsyncSession) -> None:
    """Bind the service to a database session and prepare its working folders.

    Args:
        session: Async session handed to both repositories and kept on
            the instance.
    """
    self.session = session
    self.extraction_repo = ExtractionRepository(session)
    self.sound_repo = SoundRepository(session)

    # Later steps write into these folders, so create them up front.
    self._ensure_directories()
|
||||
|
||||
def _ensure_directories(self) -> None:
    """Create the temp, extracted-audio and thumbnail folders if they are missing."""
    required = (
        settings.EXTRACTION_TEMP_DIR,
        "sounds/originals/extracted",
        settings.EXTRACTION_THUMBNAILS_DIR,
    )

    for location in required:
        # parents/exist_ok make this safe to repeat on every instantiation.
        Path(location).mkdir(parents=True, exist_ok=True)
        logger.debug("Ensured directory exists: %s", location)
|
||||
|
||||
async def create_extraction(self, url: str, user_id: int) -> ExtractionInfo:
    """Register a new pending extraction job for ``url``.

    The URL is probed with yt-dlp to find its service and service-side ID,
    the pair is checked against existing extractions, and a new record with
    status ``"pending"`` is stored.

    Args:
        url: Address of the media to extract.
        user_id: ID of the user requesting the extraction.

    Returns:
        The stored extraction as an ``ExtractionInfo`` dict.

    Raises:
        ValueError: If no service information can be detected for ``url``,
            or an extraction already exists for the same service and
            service ID.
    """
    logger.info("Creating extraction for URL: %s (user: %d)", url, user_id)

    try:
        # The yt-dlp probe performs blocking network I/O. Run it in a worker
        # thread so the event loop is not stalled while it waits.
        service_info = await asyncio.to_thread(self._detect_service_info, url)

        if not service_info:
            raise ValueError("Unable to detect service information from URL")

        service = service_info["service"]
        service_id = service_info["service_id"]
        title = service_info.get("title")

        logger.info(
            "Detected service: %s, service_id: %s, title: %s",
            service,
            service_id,
            title,
        )

        # Refuse duplicates of the same media on the same service.
        existing = await self.extraction_repo.get_by_service_and_id(
            service, service_id
        )
        if existing:
            error_msg = f"Extraction already exists for {service}:{service_id}"
            logger.warning(error_msg)
            raise ValueError(error_msg)

        # Create the extraction record.
        extraction_data = {
            "url": url,
            "user_id": user_id,
            "service": service,
            "service_id": service_id,
            "title": title,
            "status": "pending",
        }

        extraction = await self.extraction_repo.create(extraction_data)
        logger.info("Created extraction with ID: %d", extraction.id)

        return {
            "id": extraction.id or 0,  # Should never be None for created extraction
            "url": extraction.url,
            "service": extraction.service,
            "service_id": extraction.service_id,
            "title": extraction.title,
            "status": extraction.status,
            "error": extraction.error,
            "sound_id": extraction.sound_id,
        }

    except Exception:
        # Log with traceback, then let the caller decide how to respond.
        logger.exception("Failed to create extraction for URL: %s", url)
        raise
|
||||
|
||||
def _detect_service_info(self, url: str) -> dict | None:
    """Probe ``url`` with yt-dlp, without downloading, and describe its source.

    Args:
        url: Address of the media to inspect.

    Returns:
        A dict with the keys ``service``, ``service_id``, ``title``,
        ``duration``, ``uploader`` and ``description``, or ``None`` when
        yt-dlp returns no info or any exception is raised (the exception
        is logged).
    """
    # Extractor names mapped to our service names; an extractor that is not
    # listed is used as the service name unchanged.
    known_services = {
        "youtube": "youtube",
        "dailymotion": "dailymotion",
        "vimeo": "vimeo",
        "soundcloud": "soundcloud",
        "twitter": "twitter",
        "tiktok": "tiktok",
        "instagram": "instagram",
    }
    # Metadata lookup only: keep yt-dlp silent.
    probe_options = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": False,
    }

    try:
        with yt_dlp.YoutubeDL(probe_options) as ydl:
            metadata = ydl.extract_info(url, download=False)

            if not metadata:
                return None

            extractor_name = metadata.get("extractor", "").lower()

            return {
                "service": known_services.get(extractor_name, extractor_name),
                "service_id": str(metadata.get("id", "")),
                "title": metadata.get("title"),
                "duration": metadata.get("duration"),
                "uploader": metadata.get("uploader"),
                "description": metadata.get("description"),
            }

    except Exception:
        logger.exception("Failed to detect service info for URL: %s", url)
        return None
|
||||
|
||||
async def process_extraction(self, extraction_id: int) -> ExtractionInfo:
    """Run a pending extraction job from download through to a stored sound.

    The job is marked ``"processing"``, the media is extracted, the files
    are moved to their final folders, a sound record is created and
    normalized, and the job is marked ``"completed"``. Any exception raised
    after the initial checks is logged and recorded on the job as
    ``"failed"`` instead of being propagated.

    Args:
        extraction_id: ID of the extraction to process.

    Returns:
        An ``ExtractionInfo`` dict whose status is ``"completed"`` or
        ``"failed"``.

    Raises:
        ValueError: If the extraction does not exist or is not pending.
    """
    extraction = await self.extraction_repo.get_by_id(extraction_id)
    if not extraction:
        raise ValueError(f"Extraction {extraction_id} not found")
    if extraction.status != "pending":
        raise ValueError(f"Extraction {extraction_id} is not pending")

    # Copy the needed fields now to avoid session detachment issues later.
    user_id = extraction.user_id
    source_url = extraction.url
    title = extraction.title
    service = extraction.service
    service_id = extraction.service_id

    def build_info(
        status: str, error: str | None, sound_id: int | None
    ) -> ExtractionInfo:
        """Assemble the result dict from the copied fields."""
        return {
            "id": extraction_id,
            "url": source_url,
            "service": service,
            "service_id": service_id,
            "title": title,
            "status": status,
            "error": error,
            "sound_id": sound_id,
        }

    logger.info("Processing extraction %d: %s", extraction_id, source_url)

    try:
        await self.extraction_repo.update(extraction, {"status": "processing"})

        # Download the media into the temp folder.
        audio_file, thumbnail_file = await self._extract_media(
            extraction_id, source_url
        )

        # Move the files to their final folders; only the audio path is
        # needed afterwards.
        final_audio_path, _ = await self._move_files_to_final_location(
            audio_file,
            thumbnail_file,
            title,
            service,
            service_id,
        )

        sound = await self._create_sound_record(
            final_audio_path,
            title,
            service,
            service_id,
        )

        # Read the ID before further work to avoid session detachment issues.
        sound_id = sound.id

        await self._normalize_sound(sound)
        await self._add_to_main_playlist(sound, user_id)

        await self.extraction_repo.update(
            extraction,
            {
                "status": "completed",
                "sound_id": sound_id,
                "error": None,
            },
        )

        logger.info("Successfully processed extraction %d", extraction_id)

        return build_info("completed", None, sound_id)

    except Exception as e:
        error_msg = str(e)
        logger.exception(
            "Failed to process extraction %d: %s", extraction_id, error_msg
        )

        # Record the failure on the job rather than raising it.
        await self.extraction_repo.update(
            extraction,
            {
                "status": "failed",
                "error": error_msg,
            },
        )

        return build_info("failed", error_msg, None)
|
||||
|
||||
async def _extract_media(
    self, extraction_id: int, extraction_url: str
) -> tuple[Path, Path | None]:
    """Download the audio (and thumbnail, if any) for an extraction.

    Files are written to ``settings.EXTRACTION_TEMP_DIR`` with the prefix
    ``extraction_<id>_`` so they can be found again after yt-dlp finishes.

    Args:
        extraction_id: ID of the extraction, used in the temp file names.
        extraction_url: Address of the media to download.

    Returns:
        ``(audio_file, thumbnail_file)``; ``thumbnail_file`` is ``None``
        when no ``.webp``, ``.jpg`` or ``.png`` file was produced.

    Raises:
        RuntimeError: If yt-dlp fails or no audio file is found afterwards;
            the original exception is chained as the cause.
    """
    temp_dir = Path(settings.EXTRACTION_TEMP_DIR)

    # Unique prefix per extraction so the glob below finds only its files.
    file_prefix = f"extraction_{extraction_id}_"
    output_template = str(temp_dir / f"{file_prefix}%(title)s.%(ext)s")

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": output_template,
        "extractaudio": True,
        "audioformat": settings.EXTRACTION_AUDIO_FORMAT,
        "audioquality": settings.EXTRACTION_AUDIO_BITRATE,
        "writethumbnail": True,
        "writeinfojson": False,
        "writeautomaticsub": False,
        "writesubtitles": False,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": settings.EXTRACTION_AUDIO_FORMAT,
                "preferredquality": settings.EXTRACTION_AUDIO_BITRATE.rstrip("k"),
            },
        ],
    }

    def _download() -> None:
        """Run the blocking yt-dlp download and post-processing."""
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([extraction_url])

    try:
        # Download and transcoding block for a long time. Run them in a
        # worker thread so the event loop stays free for other work.
        await asyncio.to_thread(_download)

        # Find the extracted files.
        audio_files = list(
            temp_dir.glob(f"{file_prefix}*.{settings.EXTRACTION_AUDIO_FORMAT}")
        )
        thumbnail_files = [
            candidate
            for extension in ("webp", "jpg", "png")
            for candidate in temp_dir.glob(f"{file_prefix}*.{extension}")
        ]

        if not audio_files:
            raise RuntimeError("No audio file was created during extraction")

        audio_file = audio_files[0]
        thumbnail_file = thumbnail_files[0] if thumbnail_files else None

        logger.info(
            "Extracted audio: %s, thumbnail: %s",
            audio_file,
            thumbnail_file or "None",
        )

        return audio_file, thumbnail_file

    except Exception as e:
        logger.exception("yt-dlp extraction failed for %s", extraction_url)
        raise RuntimeError(f"Audio extraction failed: {e}") from e
|
||||
|
||||
async def _move_files_to_final_location(
    self,
    audio_file: Path,
    thumbnail_file: Path | None,
    title: str | None,
    service: str,
    service_id: str,
) -> tuple[Path, Path | None]:
    """Move the extracted files out of the temp folder under a clean name.

    The base name is the sanitized ``title``, or ``<service>_<service_id>``
    when no title is available. A numeric suffix is added if the target
    name is already taken.

    Args:
        audio_file: Extracted audio file to move.
        thumbnail_file: Extracted thumbnail to move, or ``None``.
        title: Title of the media, if known.
        service: Service name, used for the fallback file name.
        service_id: Service-side ID, used for the fallback file name.

    Returns:
        ``(final_audio_path, final_thumbnail_path)``; the thumbnail path is
        ``None`` when no thumbnail was given.
    """
    base_name = self._sanitize_filename(title or f"{service}_{service_id}")

    # Audio goes to the extracted-originals folder.
    audio_target = self._ensure_unique_filename(
        Path("sounds/originals/extracted")
        / f"{base_name}.{settings.EXTRACTION_AUDIO_FORMAT}"
    )
    shutil.move(str(audio_file), str(audio_target))
    logger.info("Moved audio file to: %s", audio_target)

    if not thumbnail_file:
        return audio_target, None

    # The thumbnail keeps its extension and goes to the thumbnails folder.
    thumbnail_target = self._ensure_unique_filename(
        Path(settings.EXTRACTION_THUMBNAILS_DIR)
        / f"{base_name}{thumbnail_file.suffix}"
    )
    shutil.move(str(thumbnail_file), str(thumbnail_target))
    logger.info("Moved thumbnail file to: %s", thumbnail_target)

    return audio_target, thumbnail_target
|
||||
|
||||
def _sanitize_filename(self, filename: str) -> str:
|
||||
"""Sanitize filename for filesystem."""
|
||||
# Remove or replace problematic characters
|
||||
invalid_chars = '<>:"/\\|?*'
|
||||
for char in invalid_chars:
|
||||
filename = filename.replace(char, "_")
|
||||
|
||||
# Limit length and remove leading/trailing spaces
|
||||
filename = filename.strip()[:100]
|
||||
|
||||
return filename or "untitled"
|
||||
|
||||
def _ensure_unique_filename(self, filepath: Path) -> Path:
|
||||
"""Ensure filename is unique by adding counter if needed."""
|
||||
if not filepath.exists():
|
||||
return filepath
|
||||
|
||||
stem = filepath.stem
|
||||
suffix = filepath.suffix
|
||||
parent = filepath.parent
|
||||
counter = 1
|
||||
|
||||
while True:
|
||||
new_path = parent / f"{stem}_{counter}{suffix}"
|
||||
if not new_path.exists():
|
||||
return new_path
|
||||
counter += 1
|
||||
|
||||
async def _create_sound_record(
    self, audio_path: Path, title: str | None, service: str, service_id: str
) -> Sound:
    """Store a ``Sound`` row describing the extracted audio file.

    Args:
        audio_path: Final location of the audio file.
        title: Title of the media; ``<service>_<service_id>`` is used as
            the sound name when it is missing.
        service: Service name, used for the fallback name.
        service_id: Service-side ID, used for the fallback name.

    Returns:
        The sound returned by the sound repository.
    """
    # Duration, size and hash are read from the file itself.
    payload = {
        "type": "EXT",
        "name": title or f"{service}_{service_id}",
        "filename": audio_path.name,
        "duration": get_audio_duration(audio_path),
        "size": get_file_size(audio_path),
        "hash": get_file_hash(audio_path),
        "is_deletable": True,  # Extracted sounds can be deleted
        "is_music": True,  # Assume extracted content is music
        "is_normalized": False,
        "play_count": 0,
    }

    created = await self.sound_repo.create(payload)
    logger.info("Created sound record with ID: %d", created.id)

    return created
|
||||
|
||||
async def _normalize_sound(self, sound: Sound) -> None:
    """Normalize ``sound`` on a best-effort basis.

    Failures are logged and never raised, so a normalization problem does
    not fail the extraction.

    Args:
        sound: The sound to normalize.
    """
    try:
        outcome = await SoundNormalizerService(self.session).normalize_sound(sound)

        if outcome["status"] != "error":
            logger.info("Successfully normalized sound %d", sound.id)
        else:
            logger.warning(
                "Failed to normalize sound %d: %s",
                sound.id,
                outcome.get("error"),
            )

    except Exception as e:
        # Deliberately swallowed: log and carry on.
        logger.exception("Error normalizing sound %d: %s", sound.id, e)
|
||||
|
||||
async def _add_to_main_playlist(self, sound: Sound, user_id: int) -> None:
    """Placeholder for adding ``sound`` to the user's main playlist.

    Nothing is added yet; the method only logs what it would do. Errors are
    logged and never raised, so they do not fail the extraction.

    Args:
        sound: The sound that would be added.
        user_id: ID of the user who owns the playlist.
    """
    try:
        # TODO: implement once the playlist logic exists.
        logger.info(
            "Would add sound %d to main playlist for user %d",
            sound.id,
            user_id,
        )
    except Exception as e:
        # Deliberately swallowed: log and carry on.
        logger.exception(
            "Error adding sound %d to main playlist for user %d: %s",
            sound.id,
            user_id,
            e,
        )
|
||||
|
||||
async def get_extraction_by_id(self, extraction_id: int) -> ExtractionInfo | None:
    """Look up one extraction and return it as a plain dict.

    Args:
        extraction_id: ID of the extraction to fetch.

    Returns:
        The extraction as an ``ExtractionInfo`` dict, or ``None`` when the
        repository finds no such extraction.
    """
    record = await self.extraction_repo.get_by_id(extraction_id)

    if record:
        return {
            "id": record.id or 0,  # Should never be None for existing extraction
            "url": record.url,
            "service": record.service,
            "service_id": record.service_id,
            "title": record.title,
            "status": record.status,
            "error": record.error,
            "sound_id": record.sound_id,
        }

    return None
|
||||
|
||||
async def get_user_extractions(self, user_id: int) -> list[ExtractionInfo]:
    """Return every extraction the repository lists for a user.

    Args:
        user_id: ID of the user whose extractions are wanted.

    Returns:
        One ``ExtractionInfo`` dict per extraction, in repository order.
    """
    records = await self.extraction_repo.get_by_user(user_id)

    summaries: list[ExtractionInfo] = []
    for record in records:
        summaries.append(
            {
                "id": record.id or 0,  # Should never be None for existing extraction
                "url": record.url,
                "service": record.service,
                "service_id": record.service_id,
                "title": record.title,
                "status": record.status,
                "error": record.error,
                "sound_id": record.sound_id,
            }
        )

    return summaries
|
||||
|
||||
async def get_pending_extractions(self) -> list[ExtractionInfo]:
    """Return every extraction the repository reports as pending.

    Returns:
        One ``ExtractionInfo`` dict per extraction, in repository order.
    """
    records = await self.extraction_repo.get_pending_extractions()

    summaries: list[ExtractionInfo] = []
    for record in records:
        summaries.append(
            {
                "id": record.id or 0,  # Should never be None for existing extraction
                "url": record.url,
                "service": record.service,
                "service_id": record.service_id,
                "title": record.title,
                "status": record.status,
                "error": record.error,
                "sound_id": record.sound_id,
            }
        )

    return summaries
|
||||
Reference in New Issue
Block a user