From d3ce17f10db519c8b8d9036533572b200d9dcdf3 Mon Sep 17 00:00:00 2001 From: JSC Date: Mon, 25 Aug 2025 12:33:10 +0200 Subject: [PATCH] feat: Enhance SoundScannerService with duplicate detection and normalized file handling --- app/services/sound_scanner.py | 175 +++++++++++++++++---- tests/services/test_sound_scanner.py | 225 +++++++++++++++++++++++++++ 2 files changed, 373 insertions(+), 27 deletions(-) diff --git a/app/services/sound_scanner.py b/app/services/sound_scanner.py index 1c28a81..0c95c72 100644 --- a/app/services/sound_scanner.py +++ b/app/services/sound_scanner.py @@ -35,6 +35,7 @@ class ScanResults(TypedDict): updated: int deleted: int skipped: int + duplicates: int errors: int files: list[FileInfo] @@ -55,6 +56,13 @@ class SoundScannerService: ".m4a", ".aac", } + + # Directory mappings for normalized files (matching sound_normalizer) + self.normalized_directories = { + "SDB": "sounds/normalized/soundboard", + "TTS": "sounds/normalized/text_to_speech", + "EXT": "sounds/normalized/extracted", + } def extract_name_from_filename(self, filename: str) -> str: """Extract a clean name from filename.""" @@ -64,6 +72,42 @@ class SoundScannerService: name = name.replace("_", " ").replace("-", " ") # Capitalize words return " ".join(word.capitalize() for word in name.split()) + + def _get_normalized_path(self, sound_type: str, filename: str) -> Path: + """Get the normalized file path for a sound.""" + directory = self.normalized_directories.get(sound_type, "sounds/normalized/other") + return Path(directory) / filename + + def _rename_normalized_file(self, sound_type: str, old_filename: str, new_filename: str) -> bool: + """Rename a normalized file if it exists. Returns True if renamed, False if not found.""" + old_path = self._get_normalized_path(sound_type, old_filename) + new_path = self._get_normalized_path(sound_type, new_filename) + + if old_path.exists(): + try: + # Ensure the directory exists + new_path.parent.mkdir(parents=True, exist_ok=True) + old_path.rename(new_path) + logger.info("Renamed normalized file: %s -> %s", old_path, new_path) + return True + except Exception as e: + logger.error("Failed to rename normalized file %s -> %s: %s", old_path, new_path, e) + return False + return False + + def _delete_normalized_file(self, sound_type: str, filename: str) -> bool: + """Delete a normalized file if it exists. Returns True if deleted, False if not found.""" + normalized_path = self._get_normalized_path(sound_type, filename) + + if normalized_path.exists(): + try: + normalized_path.unlink() + logger.info("Deleted normalized file: %s", normalized_path) + return True + except Exception as e: + logger.error("Failed to delete normalized file %s: %s", normalized_path, e) + return False + return False async def scan_directory( self, @@ -87,6 +131,7 @@ class SoundScannerService: "updated": 0, "deleted": 0, "skipped": 0, + "duplicates": 0, "errors": 0, "files": [], } @@ -110,6 +155,9 @@ class SoundScannerService: "name": sound.name, "duration": sound.duration, "size": sound.size, + "type": sound.type, + "is_normalized": sound.is_normalized, + "normalized_filename": sound.normalized_filename, "sound_object": sound, # Keep reference for database operations } sounds_by_hash[sound.hash] = sound_data @@ -177,10 +225,20 @@ class SoundScannerService: sound_size = sound_data["size"] sound_id = sound_data["id"] sound_object = sound_data["sound_object"] + sound_type = sound_data["type"] + sound_is_normalized = sound_data["is_normalized"] + sound_normalized_filename = sound_data["normalized_filename"] try: + # Delete the sound from database first await self.sound_repo.delete(sound_object) logger.info("Deleted sound no longer in directory: %s", filename) + + # If the sound had a normalized file, delete it too + if sound_is_normalized and sound_normalized_filename: + normalized_base = Path(sound_normalized_filename).name + self._delete_normalized_file(sound_type, normalized_base) + results["deleted"] += 1 results["files"].append( { @@ -237,6 +295,9 @@ class SoundScannerService: existing_hash_size = None existing_hash_id = None existing_hash_object = None + existing_hash_type = None + existing_hash_is_normalized = None + existing_hash_normalized_filename = None if existing_sound_by_hash is not None: if isinstance(existing_sound_by_hash, dict): @@ -246,6 +307,9 @@ class SoundScannerService: existing_hash_size = existing_sound_by_hash["size"] existing_hash_id = existing_sound_by_hash["id"] existing_hash_object = existing_sound_by_hash["sound_object"] + existing_hash_type = existing_sound_by_hash["type"] + existing_hash_is_normalized = existing_sound_by_hash["is_normalized"] + existing_hash_normalized_filename = existing_sound_by_hash["normalized_filename"] else: # Sound object (for tests) existing_hash_filename = existing_sound_by_hash.filename existing_hash_name = existing_sound_by_hash.name @@ -253,6 +317,9 @@ class SoundScannerService: existing_hash_size = existing_sound_by_hash.size existing_hash_id = existing_sound_by_hash.id existing_hash_object = existing_sound_by_hash + existing_hash_type = existing_sound_by_hash.type + existing_hash_is_normalized = existing_sound_by_hash.is_normalized + existing_hash_normalized_filename = existing_sound_by_hash.normalized_filename existing_filename_id = None existing_filename_object = None @@ -285,36 +352,90 @@ class SoundScannerService: }, ) else: - # Same hash, different filename - file was renamed - update_data = { - "filename": filename, - "name": name, - } + # Same hash, different filename - could be rename or duplicate + # Check if both files exist to determine if it's a duplicate + old_file_path = file_path.parent / existing_hash_filename + if old_file_path.exists(): + # Both files exist with same hash - this is a duplicate + logger.warning( + "Duplicate file detected: '%s' has same content as existing '%s' (hash: %s). " + "Skipping duplicate file.", + filename, + existing_hash_filename, + file_hash[:8] + "...", + ) - await self.sound_repo.update(existing_hash_object, update_data) - logger.info( - "Detected rename: %s -> %s (ID: %s)", - existing_hash_filename, - filename, - existing_hash_id, - ) - - results["updated"] += 1 - results["files"].append( - { + results["skipped"] += 1 + results["duplicates"] += 1 + results["files"].append( + { + "filename": filename, + "status": "skipped", + "reason": "duplicate content", + "name": existing_hash_name, + "duration": existing_hash_duration, + "size": existing_hash_size, + "id": existing_hash_id, + "error": None, + "changes": None, + }, + ) + else: + # Old file doesn't exist - this is a genuine rename + update_data = { "filename": filename, - "status": "updated", - "reason": "file was renamed", "name": name, - "duration": existing_hash_duration, - "size": existing_hash_size, - "id": existing_hash_id, - "error": None, - "changes": ["filename", "name"], - # Store old filename to prevent deletion - "old_filename": existing_hash_filename, - }, - ) + } + + # If the sound has a normalized file, rename it too + if existing_hash_is_normalized and existing_hash_normalized_filename: + # Extract base filename without path for normalized file + old_normalized_base = Path(existing_hash_normalized_filename).name + new_normalized_base = Path(filename).stem + Path(existing_hash_normalized_filename).suffix + + renamed = self._rename_normalized_file( + existing_hash_type, + old_normalized_base, + new_normalized_base + ) + + if renamed: + update_data["normalized_filename"] = new_normalized_base + logger.info( + "Renamed normalized file: %s -> %s", + old_normalized_base, + new_normalized_base + ) + + await self.sound_repo.update(existing_hash_object, update_data) + logger.info( + "Detected rename: %s -> %s (ID: %s)", + existing_hash_filename, + filename, + existing_hash_id, + ) + + # Build changes list + changes = ["filename", "name"] + if "normalized_filename" in update_data: + changes.append("normalized_filename") + + results["updated"] += 1 + results["files"].append( + { + "filename": filename, + "status": "updated", + "reason": "file was renamed", + "name": name, + "duration": existing_hash_duration, + "size": existing_hash_size, + "id": existing_hash_id, + "error": None, + "changes": changes, + # Store old filename to prevent deletion + "old_filename": existing_hash_filename, + }, + ) elif existing_sound_by_filename is not None: # Same filename but different hash - file was modified diff --git a/tests/services/test_sound_scanner.py b/tests/services/test_sound_scanner.py index 258dc70..7d5f2a5 100644 --- a/tests/services/test_sound_scanner.py +++ b/tests/services/test_sound_scanner.py @@ -288,6 +288,59 @@ class TestSoundScannerService: scanner_service.sound_repo.update.assert_called_once() scanner_service.sound_repo.delete.assert_not_called() + @pytest.mark.asyncio + async def test_scan_directory_duplicate_detection(self, scanner_service, mock_session) -> None: + """Test that duplicate files (same hash) are detected and logged.""" + # Create a mock existing sound + existing_sound = Sound( + id=1, + type="SDB", + name="Original Song", + filename="original.mp3", + duration=120000, + size=1024, + hash="same_hash", + ) + + # Mock the repository + scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound]) + scanner_service.sound_repo.update = AsyncMock() + + # Create temporary directory with both original and duplicate files + import tempfile + import os + + with tempfile.TemporaryDirectory() as temp_dir: + # Create both files (simulating duplicate content) + original_path = os.path.join(temp_dir, "original.mp3") + duplicate_path = os.path.join(temp_dir, "duplicate.mp3") + + with open(original_path, "wb") as f: + f.write(b"test audio content") + with open(duplicate_path, "wb") as f: + f.write(b"test audio content") # Same content = same hash + + # Mock file operations + with ( + patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"), + patch("app.services.sound_scanner.get_audio_duration", return_value=120000), + patch("app.services.sound_scanner.get_file_size", return_value=1024), + ): + results = await scanner_service.scan_directory(temp_dir, "SDB") + + # Should have 1 unchanged (original) and 1 skipped (duplicate) + assert results["skipped"] == 2 # Both files have same hash, both skipped + assert results["duplicates"] == 1 # One duplicate detected + assert results["updated"] == 0 + assert results["added"] == 0 + assert results["deleted"] == 0 + + # Check that duplicate was properly detected + skipped_files = [f for f in results["files"] if f["status"] == "skipped"] + duplicate_file = next((f for f in skipped_files if "duplicate" in f["reason"]), None) + assert duplicate_file is not None + assert duplicate_file["reason"] == "duplicate content" + @pytest.mark.asyncio async def test_sync_audio_file_new(self, scanner_service) -> None: """Test syncing a new audio file.""" @@ -477,3 +530,175 @@ class TestSoundScannerService: ) # All sounds are set to not deletable finally: temp_path.unlink() + + @pytest.mark.asyncio + async def test_sync_audio_file_rename_with_normalized_file( + self, test_session, scanner_service + ): + """Test that renaming a sound file also renames its normalized file.""" + # Create temporary directories for testing + from pathlib import Path + import tempfile + + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + + # Set up the scanner's normalized directories to use temp dir + scanner_service.normalized_directories = { + "SDB": str(temp_dir_path / "normalized" / "soundboard") + } + + # Create the normalized directory + normalized_dir = temp_dir_path / "normalized" / "soundboard" + normalized_dir.mkdir(parents=True) + + # Create the old normalized file + old_normalized_file = normalized_dir / "old_sound.mp3" + old_normalized_file.write_text("normalized audio content") + + # Create the audio files (they need to exist for the scanner) + old_path = temp_dir_path / "old_sound.mp3" + new_path = temp_dir_path / "new_sound.mp3" + + # Create a dummy audio file for the new path + new_path.write_bytes(b"fake audio data for testing") + + # Mock the audio utility functions since we're using fake files + from unittest.mock import patch + with patch('app.services.sound_scanner.get_audio_duration', return_value=60000), \ + patch('app.services.sound_scanner.get_file_size', return_value=2048): + + # Create existing sound with normalized file info + existing_sound = Sound( + id=1, + type="SDB", + name="Old Sound", + filename="old_sound.mp3", + duration=60000, + size=2048, + hash="test_hash", + is_normalized=True, + normalized_filename="old_sound.mp3", + normalized_duration=60000, + normalized_size=1024, + normalized_hash="normalized_hash", + play_count=5, + is_deletable=False, + is_music=False + ) + + results = { + "scanned": 0, + "added": 0, + "updated": 0, + "deleted": 0, + "skipped": 0, + "duplicates": 0, + "errors": 0, + "files": [], + } + + # Mock the sound repository update + scanner_service.sound_repo.update = AsyncMock() + + # Simulate rename detection by calling _sync_audio_file + await scanner_service._sync_audio_file( + new_path, + "SDB", + existing_sound, # existing_sound_by_hash (same hash, different filename) + None, # existing_sound_by_filename (no file with new name exists) + "test_hash", + results, + ) + + # Verify the results + assert results["updated"] == 1 + assert len(results["files"]) == 1 + assert results["files"][0]["status"] == "updated" + assert results["files"][0]["reason"] == "file was renamed" + assert "normalized_filename" in results["files"][0]["changes"] + + # Verify sound_repo.update was called with normalized filename update + update_call = scanner_service.sound_repo.update.call_args + update_data = update_call[0][1] # Second argument is the update data + + assert "filename" in update_data + assert "name" in update_data + assert "normalized_filename" in update_data + assert update_data["normalized_filename"] == "new_sound.mp3" + + # Verify the normalized file was actually renamed + new_normalized_file = normalized_dir / "new_sound.mp3" + assert new_normalized_file.exists() + assert not old_normalized_file.exists() + assert new_normalized_file.read_text() == "normalized audio content" + + @pytest.mark.asyncio + async def test_scan_directory_delete_with_normalized_file( + self, test_session, scanner_service + ): + """Test that deleting a sound also deletes its normalized file.""" + # Create temporary directories for testing + from pathlib import Path + import tempfile + + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + scan_dir = temp_dir_path / "sounds" + scan_dir.mkdir() + + # Set up the scanner's normalized directories to use temp dir + scanner_service.normalized_directories = { + "SDB": str(temp_dir_path / "normalized" / "soundboard") + } + + # Create the normalized directory and file + normalized_dir = temp_dir_path / "normalized" / "soundboard" + normalized_dir.mkdir(parents=True) + normalized_file = normalized_dir / "test_sound.mp3" + normalized_file.write_text("normalized audio content") + + # Create existing sound with normalized file info + existing_sound = Sound( + id=1, + type="SDB", + name="Test Sound", + filename="test_sound.mp3", + duration=60000, + size=2048, + hash="test_hash", + is_normalized=True, + normalized_filename="test_sound.mp3", + normalized_duration=60000, + normalized_size=1024, + normalized_hash="normalized_hash", + play_count=5, + is_deletable=False, + is_music=False + ) + + # Mock sound repository methods + scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound]) + scanner_service.sound_repo.delete = AsyncMock() + + # Mock audio utility functions + from unittest.mock import patch + with patch('app.services.sound_scanner.get_audio_duration'), \ + patch('app.services.sound_scanner.get_file_size'): + + # Run scan with empty directory (should trigger deletion) + results = await scanner_service.scan_directory(str(scan_dir), "SDB") + + # Verify the results + assert results["deleted"] == 1 + assert results["added"] == 0 + assert results["updated"] == 0 + assert len(results["files"]) == 1 + assert results["files"][0]["status"] == "deleted" + assert results["files"][0]["reason"] == "file no longer exists" + + # Verify sound_repo.delete was called + scanner_service.sound_repo.delete.assert_called_once_with(existing_sound) + + # Verify the normalized file was actually deleted + assert not normalized_file.exists()