feat: Enhance SoundScannerService with duplicate detection and normalized file handling

2025-08-25 12:33:10 +02:00
parent da66516bb3
commit d3ce17f10d
2 changed files with 373 additions and 27 deletions
--- a/app/services/sound_scanner.py
+++ b/app/services/sound_scanner.py
@@ -35,6 +35,7 @@ class ScanResults(TypedDict):
    updated: int
    deleted: int
    skipped: int
+    duplicates: int
    errors: int
    files: list[FileInfo]

@@ -55,6 +56,13 @@ class SoundScannerService:
            ".m4a",
            ".aac",
        }
+        
+        # Directory mappings for normalized files (matching sound_normalizer)
+        self.normalized_directories = {
+            "SDB": "sounds/normalized/soundboard",
+            "TTS": "sounds/normalized/text_to_speech",
+            "EXT": "sounds/normalized/extracted",
+        }

    def extract_name_from_filename(self, filename: str) -> str:
        """Extract a clean name from filename."""
@@ -64,6 +72,42 @@ class SoundScannerService:
        name = name.replace("_", " ").replace("-", " ")
        # Capitalize words
        return " ".join(word.capitalize() for word in name.split())
+    
+    def _get_normalized_path(self, sound_type: str, filename: str) -> Path:
+        """Get the normalized file path for a sound."""
+        directory = self.normalized_directories.get(sound_type, "sounds/normalized/other")
+        return Path(directory) / filename
+    
+    def _rename_normalized_file(self, sound_type: str, old_filename: str, new_filename: str) -> bool:
+        """Rename a normalized file if it exists. Returns True if renamed, False if not found."""
+        old_path = self._get_normalized_path(sound_type, old_filename)
+        new_path = self._get_normalized_path(sound_type, new_filename)
+        
+        if old_path.exists():
+            try:
+                # Ensure the directory exists
+                new_path.parent.mkdir(parents=True, exist_ok=True)
+                old_path.rename(new_path)
+                logger.info("Renamed normalized file: %s -> %s", old_path, new_path)
+                return True
+            except Exception as e:
+                logger.error("Failed to rename normalized file %s -> %s: %s", old_path, new_path, e)
+                return False
+        return False
+    
+    def _delete_normalized_file(self, sound_type: str, filename: str) -> bool:
+        """Delete a normalized file if it exists. Returns True if deleted, False if not found."""
+        normalized_path = self._get_normalized_path(sound_type, filename)
+        
+        if normalized_path.exists():
+            try:
+                normalized_path.unlink()
+                logger.info("Deleted normalized file: %s", normalized_path)
+                return True
+            except Exception as e:
+                logger.error("Failed to delete normalized file %s: %s", normalized_path, e)
+                return False
+        return False

    async def scan_directory(
        self,
@@ -87,6 +131,7 @@ class SoundScannerService:
            "updated": 0,
            "deleted": 0,
            "skipped": 0,
+            "duplicates": 0,
            "errors": 0,
            "files": [],
        }
@@ -110,6 +155,9 @@ class SoundScannerService:
                "name": sound.name,
                "duration": sound.duration,
                "size": sound.size,
+                "type": sound.type,
+                "is_normalized": sound.is_normalized,
+                "normalized_filename": sound.normalized_filename,
                "sound_object": sound,  # Keep reference for database operations
            }
            sounds_by_hash[sound.hash] = sound_data
@@ -177,10 +225,20 @@ class SoundScannerService:
                sound_size = sound_data["size"]
                sound_id = sound_data["id"]
                sound_object = sound_data["sound_object"]
+                sound_type = sound_data["type"]
+                sound_is_normalized = sound_data["is_normalized"]
+                sound_normalized_filename = sound_data["normalized_filename"]

                try:
+                    # Delete the sound from database first
                    await self.sound_repo.delete(sound_object)
                    logger.info("Deleted sound no longer in directory: %s", filename)
+                    
+                    # If the sound had a normalized file, delete it too
+                    if sound_is_normalized and sound_normalized_filename:
+                        normalized_base = Path(sound_normalized_filename).name
+                        self._delete_normalized_file(sound_type, normalized_base)
+                    
                    results["deleted"] += 1
                    results["files"].append(
                        {
@@ -237,6 +295,9 @@ class SoundScannerService:
        existing_hash_size = None
        existing_hash_id = None
        existing_hash_object = None
+        existing_hash_type = None
+        existing_hash_is_normalized = None
+        existing_hash_normalized_filename = None

        if existing_sound_by_hash is not None:
            if isinstance(existing_sound_by_hash, dict):
@@ -246,6 +307,9 @@ class SoundScannerService:
                existing_hash_size = existing_sound_by_hash["size"]
                existing_hash_id = existing_sound_by_hash["id"]
                existing_hash_object = existing_sound_by_hash["sound_object"]
+                existing_hash_type = existing_sound_by_hash["type"]
+                existing_hash_is_normalized = existing_sound_by_hash["is_normalized"]
+                existing_hash_normalized_filename = existing_sound_by_hash["normalized_filename"]
            else:  # Sound object (for tests)
                existing_hash_filename = existing_sound_by_hash.filename
                existing_hash_name = existing_sound_by_hash.name
@@ -253,6 +317,9 @@ class SoundScannerService:
                existing_hash_size = existing_sound_by_hash.size
                existing_hash_id = existing_sound_by_hash.id
                existing_hash_object = existing_sound_by_hash
+                existing_hash_type = existing_sound_by_hash.type
+                existing_hash_is_normalized = existing_sound_by_hash.is_normalized
+                existing_hash_normalized_filename = existing_sound_by_hash.normalized_filename

        existing_filename_id = None
        existing_filename_object = None
@@ -285,36 +352,90 @@ class SoundScannerService:
                    },
                )
            else:
-                # Same hash, different filename - file was renamed
-                update_data = {
-                    "filename": filename,
-                    "name": name,
-                }
+                # Same hash, different filename - could be rename or duplicate
+                # Check if both files exist to determine if it's a duplicate
+                old_file_path = file_path.parent / existing_hash_filename
+                if old_file_path.exists():
+                    # Both files exist with same hash - this is a duplicate
+                    logger.warning(
+                        "Duplicate file detected: '%s' has same content as existing '%s' (hash: %s). "
+                        "Skipping duplicate file.",
+                        filename,
+                        existing_hash_filename,
+                        file_hash[:8] + "...",
+                    )

-                await self.sound_repo.update(existing_hash_object, update_data)
-                logger.info(
-                    "Detected rename: %s -> %s (ID: %s)",
-                    existing_hash_filename,
-                    filename,
-                    existing_hash_id,
-                )
-
-                results["updated"] += 1
-                results["files"].append(
-                    {
+                    results["skipped"] += 1
+                    results["duplicates"] += 1
+                    results["files"].append(
+                        {
+                            "filename": filename,
+                            "status": "skipped",
+                            "reason": "duplicate content",
+                            "name": existing_hash_name,
+                            "duration": existing_hash_duration,
+                            "size": existing_hash_size,
+                            "id": existing_hash_id,
+                            "error": None,
+                            "changes": None,
+                        },
+                    )
+                else:
+                    # Old file doesn't exist - this is a genuine rename
+                    update_data = {
                        "filename": filename,
-                        "status": "updated",
-                        "reason": "file was renamed",
                        "name": name,
-                        "duration": existing_hash_duration,
-                        "size": existing_hash_size,
-                        "id": existing_hash_id,
-                        "error": None,
-                        "changes": ["filename", "name"],
-                        # Store old filename to prevent deletion
-                        "old_filename": existing_hash_filename,
-                    },
-                )
+                    }
+                    
+                    # If the sound has a normalized file, rename it too
+                    if existing_hash_is_normalized and existing_hash_normalized_filename:
+                        # Extract base filename without path for normalized file
+                        old_normalized_base = Path(existing_hash_normalized_filename).name
+                        new_normalized_base = Path(filename).stem + Path(existing_hash_normalized_filename).suffix
+                        
+                        renamed = self._rename_normalized_file(
+                            existing_hash_type,
+                            old_normalized_base,
+                            new_normalized_base
+                        )
+                        
+                        if renamed:
+                            update_data["normalized_filename"] = new_normalized_base
+                            logger.info(
+                                "Renamed normalized file: %s -> %s", 
+                                old_normalized_base, 
+                                new_normalized_base
+                            )
+
+                    await self.sound_repo.update(existing_hash_object, update_data)
+                    logger.info(
+                        "Detected rename: %s -> %s (ID: %s)",
+                        existing_hash_filename,
+                        filename,
+                        existing_hash_id,
+                    )
+
+                    # Build changes list
+                    changes = ["filename", "name"]
+                    if "normalized_filename" in update_data:
+                        changes.append("normalized_filename")
+
+                    results["updated"] += 1
+                    results["files"].append(
+                        {
+                            "filename": filename,
+                            "status": "updated",
+                            "reason": "file was renamed",
+                            "name": name,
+                            "duration": existing_hash_duration,
+                            "size": existing_hash_size,
+                            "id": existing_hash_id,
+                            "error": None,
+                            "changes": changes,
+                            # Store old filename to prevent deletion
+                            "old_filename": existing_hash_filename,
+                        },
+                    )

        elif existing_sound_by_filename is not None:
            # Same filename but different hash - file was modified
--- a/tests/services/test_sound_scanner.py
+++ b/tests/services/test_sound_scanner.py
@@ -288,6 +288,59 @@ class TestSoundScannerService:
                scanner_service.sound_repo.update.assert_called_once()
                scanner_service.sound_repo.delete.assert_not_called()

+    @pytest.mark.asyncio
+    async def test_scan_directory_duplicate_detection(self, scanner_service, mock_session) -> None:
+        """Test that duplicate files (same hash) are detected and logged."""
+        # Create a mock existing sound
+        existing_sound = Sound(
+            id=1,
+            type="SDB",
+            name="Original Song",
+            filename="original.mp3",
+            duration=120000,
+            size=1024,
+            hash="same_hash",
+        )
+        
+        # Mock the repository
+        scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound])
+        scanner_service.sound_repo.update = AsyncMock()
+        
+        # Create temporary directory with both original and duplicate files
+        import tempfile
+        import os
+        
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Create both files (simulating duplicate content)
+            original_path = os.path.join(temp_dir, "original.mp3")
+            duplicate_path = os.path.join(temp_dir, "duplicate.mp3")
+            
+            with open(original_path, "wb") as f:
+                f.write(b"test audio content")
+            with open(duplicate_path, "wb") as f:
+                f.write(b"test audio content")  # Same content = same hash
+            
+            # Mock file operations
+            with (
+                patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"),
+                patch("app.services.sound_scanner.get_audio_duration", return_value=120000),
+                patch("app.services.sound_scanner.get_file_size", return_value=1024),
+            ):
+                results = await scanner_service.scan_directory(temp_dir, "SDB")
+                
+                # Should have 1 unchanged (original) and 1 skipped (duplicate)
+                assert results["skipped"] == 2  # Both files have same hash, both skipped
+                assert results["duplicates"] == 1  # One duplicate detected
+                assert results["updated"] == 0
+                assert results["added"] == 0
+                assert results["deleted"] == 0
+                
+                # Check that duplicate was properly detected
+                skipped_files = [f for f in results["files"] if f["status"] == "skipped"]
+                duplicate_file = next((f for f in skipped_files if "duplicate" in f["reason"]), None)
+                assert duplicate_file is not None
+                assert duplicate_file["reason"] == "duplicate content"
+
    @pytest.mark.asyncio
    async def test_sync_audio_file_new(self, scanner_service) -> None:
        """Test syncing a new audio file."""
@@ -477,3 +530,175 @@ class TestSoundScannerService:
                )  # All sounds are set to not deletable
            finally:
                temp_path.unlink()
+
+    @pytest.mark.asyncio
+    async def test_sync_audio_file_rename_with_normalized_file(
+        self, test_session, scanner_service
+    ):
+        """Test that renaming a sound file also renames its normalized file."""
+        # Create temporary directories for testing
+        from pathlib import Path
+        import tempfile
+        
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_dir_path = Path(temp_dir)
+            
+            # Set up the scanner's normalized directories to use temp dir
+            scanner_service.normalized_directories = {
+                "SDB": str(temp_dir_path / "normalized" / "soundboard")
+            }
+            
+            # Create the normalized directory
+            normalized_dir = temp_dir_path / "normalized" / "soundboard"
+            normalized_dir.mkdir(parents=True)
+            
+            # Create the old normalized file
+            old_normalized_file = normalized_dir / "old_sound.mp3"
+            old_normalized_file.write_text("normalized audio content")
+            
+            # Create the audio files (they need to exist for the scanner)
+            old_path = temp_dir_path / "old_sound.mp3"
+            new_path = temp_dir_path / "new_sound.mp3"
+            
+            # Create a dummy audio file for the new path
+            new_path.write_bytes(b"fake audio data for testing")
+            
+            # Mock the audio utility functions since we're using fake files
+            from unittest.mock import patch
+            with patch('app.services.sound_scanner.get_audio_duration', return_value=60000), \
+                 patch('app.services.sound_scanner.get_file_size', return_value=2048):
+                
+                # Create existing sound with normalized file info
+                existing_sound = Sound(
+                    id=1,
+                    type="SDB",
+                    name="Old Sound",
+                    filename="old_sound.mp3",
+                    duration=60000,
+                    size=2048,
+                    hash="test_hash",
+                    is_normalized=True,
+                    normalized_filename="old_sound.mp3",
+                    normalized_duration=60000,
+                    normalized_size=1024,
+                    normalized_hash="normalized_hash",
+                    play_count=5,
+                    is_deletable=False,
+                    is_music=False
+                )
+                
+                results = {
+                    "scanned": 0,
+                    "added": 0,
+                    "updated": 0,
+                    "deleted": 0,
+                    "skipped": 0,
+                    "duplicates": 0,
+                    "errors": 0,
+                    "files": [],
+                }
+                
+                # Mock the sound repository update
+                scanner_service.sound_repo.update = AsyncMock()
+                
+                # Simulate rename detection by calling _sync_audio_file
+                await scanner_service._sync_audio_file(
+                    new_path,
+                    "SDB",
+                    existing_sound,  # existing_sound_by_hash (same hash, different filename)
+                    None,  # existing_sound_by_filename (no file with new name exists)
+                    "test_hash",
+                    results,
+                )
+                
+                # Verify the results
+                assert results["updated"] == 1
+                assert len(results["files"]) == 1
+                assert results["files"][0]["status"] == "updated"
+                assert results["files"][0]["reason"] == "file was renamed"
+                assert "normalized_filename" in results["files"][0]["changes"]
+                
+                # Verify sound_repo.update was called with normalized filename update
+                update_call = scanner_service.sound_repo.update.call_args
+                update_data = update_call[0][1]  # Second argument is the update data
+                
+                assert "filename" in update_data
+                assert "name" in update_data
+                assert "normalized_filename" in update_data
+                assert update_data["normalized_filename"] == "new_sound.mp3"
+                
+                # Verify the normalized file was actually renamed
+                new_normalized_file = normalized_dir / "new_sound.mp3"
+                assert new_normalized_file.exists()
+                assert not old_normalized_file.exists()
+                assert new_normalized_file.read_text() == "normalized audio content"
+
+    @pytest.mark.asyncio
+    async def test_scan_directory_delete_with_normalized_file(
+        self, test_session, scanner_service
+    ):
+        """Test that deleting a sound also deletes its normalized file."""
+        # Create temporary directories for testing
+        from pathlib import Path
+        import tempfile
+        
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_dir_path = Path(temp_dir)
+            scan_dir = temp_dir_path / "sounds"
+            scan_dir.mkdir()
+            
+            # Set up the scanner's normalized directories to use temp dir
+            scanner_service.normalized_directories = {
+                "SDB": str(temp_dir_path / "normalized" / "soundboard")
+            }
+            
+            # Create the normalized directory and file
+            normalized_dir = temp_dir_path / "normalized" / "soundboard"
+            normalized_dir.mkdir(parents=True)
+            normalized_file = normalized_dir / "test_sound.mp3"
+            normalized_file.write_text("normalized audio content")
+            
+            # Create existing sound with normalized file info
+            existing_sound = Sound(
+                id=1,
+                type="SDB",
+                name="Test Sound",
+                filename="test_sound.mp3",
+                duration=60000,
+                size=2048,
+                hash="test_hash",
+                is_normalized=True,
+                normalized_filename="test_sound.mp3",
+                normalized_duration=60000,
+                normalized_size=1024,
+                normalized_hash="normalized_hash",
+                play_count=5,
+                is_deletable=False,
+                is_music=False
+            )
+            
+            # Mock sound repository methods
+            scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound])
+            scanner_service.sound_repo.delete = AsyncMock()
+            
+            # Mock audio utility functions
+            from unittest.mock import patch
+            with patch('app.services.sound_scanner.get_audio_duration'), \
+                 patch('app.services.sound_scanner.get_file_size'):
+                
+                # Run scan with empty directory (should trigger deletion)
+                results = await scanner_service.scan_directory(str(scan_dir), "SDB")
+                
+                # Verify the results
+                assert results["deleted"] == 1
+                assert results["added"] == 0
+                assert results["updated"] == 0
+                assert len(results["files"]) == 1
+                assert results["files"][0]["status"] == "deleted"
+                assert results["files"][0]["reason"] == "file no longer exists"
+                
+                # Verify sound_repo.delete was called
+                scanner_service.sound_repo.delete.assert_called_once_with(existing_sound)
+                
+                # Verify the normalized file was actually deleted
+                assert not normalized_file.exists()