feat: Implement hash-first identification strategy in audio file syncing and enhance tests for renamed files

2025-08-25 11:56:07 +02:00
parent d81a54207c
commit da66516bb3
3 changed files with 316 additions and 68 deletions
--- a/app/models/sound.py
+++ b/app/models/sound.py
@@ -34,7 +34,11 @@ class Sound(BaseModel, table=True):
    __table_args__ = (UniqueConstraint("hash", name="uq_sound_hash"),)

    # relationships
-    playlist_sounds: list["PlaylistSound"] = Relationship(back_populates="sound")
+    playlist_sounds: list["PlaylistSound"] = Relationship(
+        back_populates="sound", cascade_delete=True
+    )
    extractions: list["Extraction"] = Relationship(back_populates="sound")
-    play_history: list["SoundPlayed"] = Relationship(back_populates="sound")
+    play_history: list["SoundPlayed"] = Relationship(
+        back_populates="sound", cascade_delete=True
+    )
    favorites: list["Favorite"] = Relationship(back_populates="sound")
--- a/app/services/sound_scanner.py
+++ b/app/services/sound_scanner.py
@@ -95,7 +95,25 @@ class SoundScannerService:

        # Get all existing sounds of this type from database
        existing_sounds = await self.sound_repo.get_by_type(sound_type)
-        sounds_by_filename = {sound.filename: sound for sound in existing_sounds}
+
+        # Create lookup dictionaries with immediate attribute access
+        # to avoid session detachment
+        sounds_by_hash = {}
+        sounds_by_filename = {}
+
+        for sound in existing_sounds:
+            # Capture all attributes immediately while session is valid
+            sound_data = {
+                "id": sound.id,
+                "hash": sound.hash,
+                "filename": sound.filename,
+                "name": sound.name,
+                "duration": sound.duration,
+                "size": sound.size,
+                "sound_object": sound,  # Keep reference for database operations
+            }
+            sounds_by_hash[sound.hash] = sound_data
+            sounds_by_filename[sound.filename] = sound_data

        # Get all audio files from directory
        audio_files = [
@@ -112,12 +130,27 @@ class SoundScannerService:
            processed_filenames.add(filename)

            try:
+                # Calculate hash first to enable hash-based lookup
+                file_hash = get_file_hash(file_path)
+                existing_sound_by_hash = sounds_by_hash.get(file_hash)
+                existing_sound_by_filename = sounds_by_filename.get(filename)
+
                await self._sync_audio_file(
                    file_path,
                    sound_type,
-                    sounds_by_filename.get(filename),
+                    existing_sound_by_hash,
+                    existing_sound_by_filename,
+                    file_hash,
                    results,
                )
+
+                # Check if this was a rename operation and mark old filename as processed
+                if results["files"] and results["files"][-1].get("old_filename"):
+                    old_filename = results["files"][-1]["old_filename"]
+                    processed_filenames.add(old_filename)
+                    logger.debug("Marked old filename as processed: %s", old_filename)
+                    # Remove temporary tracking field from results
+                    del results["files"][-1]["old_filename"]
            except Exception as e:
                logger.exception("Error processing file %s", file_path)
                results["errors"] += 1
@@ -136,10 +169,17 @@ class SoundScannerService:
                )

        # Delete sounds that no longer exist in directory
-        for filename, sound in sounds_by_filename.items():
+        for filename, sound_data in sounds_by_filename.items():
            if filename not in processed_filenames:
+                # Attributes already captured in sound_data dictionary
+                sound_name = sound_data["name"]
+                sound_duration = sound_data["duration"]
+                sound_size = sound_data["size"]
+                sound_id = sound_data["id"]
+                sound_object = sound_data["sound_object"]
+
                try:
-                    await self.sound_repo.delete(sound)
+                    await self.sound_repo.delete(sound_object)
                    logger.info("Deleted sound no longer in directory: %s", filename)
                    results["deleted"] += 1
                    results["files"].append(
@@ -147,10 +187,10 @@ class SoundScannerService:
                            "filename": filename,
                            "status": "deleted",
                            "reason": "file no longer exists",
-                            "name": sound.name,
-                            "duration": sound.duration,
-                            "size": sound.size,
-                            "id": sound.id,
+                            "name": sound_name,
+                            "duration": sound_duration,
+                            "size": sound_size,
+                            "id": sound_id,
                            "error": None,
                            "changes": None,
                        },
@@ -163,10 +203,10 @@ class SoundScannerService:
                            "filename": filename,
                            "status": "error",
                            "reason": "failed to delete",
-                            "name": sound.name,
-                            "duration": sound.duration,
-                            "size": sound.size,
-                            "id": sound.id,
+                            "name": sound_name,
+                            "duration": sound_duration,
+                            "size": sound_size,
+                            "id": sound_id,
                            "error": str(e),
                            "changes": None,
                        },
@@ -179,18 +219,136 @@ class SoundScannerService:
        self,
        file_path: Path,
        sound_type: str,
-        existing_sound: Sound | None,
+        existing_sound_by_hash: dict | Sound | None,
+        existing_sound_by_filename: dict | Sound | None,
+        file_hash: str,
        results: ScanResults,
    ) -> None:
-        """Sync a single audio file (add new or update existing)."""
+        """Sync a single audio file using hash-first identification strategy."""
        filename = file_path.name
-        file_hash = get_file_hash(file_path)
        duration = get_audio_duration(file_path)
        size = get_file_size(file_path)
        name = self.extract_name_from_filename(filename)

-        if existing_sound is None:
-            # Add new sound
+        # Extract attributes - handle both dict (normal) and Sound object (tests)
+        existing_hash_filename = None
+        existing_hash_name = None
+        existing_hash_duration = None
+        existing_hash_size = None
+        existing_hash_id = None
+        existing_hash_object = None
+
+        if existing_sound_by_hash is not None:
+            if isinstance(existing_sound_by_hash, dict):
+                existing_hash_filename = existing_sound_by_hash["filename"]
+                existing_hash_name = existing_sound_by_hash["name"]
+                existing_hash_duration = existing_sound_by_hash["duration"]
+                existing_hash_size = existing_sound_by_hash["size"]
+                existing_hash_id = existing_sound_by_hash["id"]
+                existing_hash_object = existing_sound_by_hash["sound_object"]
+            else:  # Sound object (for tests)
+                existing_hash_filename = existing_sound_by_hash.filename
+                existing_hash_name = existing_sound_by_hash.name
+                existing_hash_duration = existing_sound_by_hash.duration
+                existing_hash_size = existing_sound_by_hash.size
+                existing_hash_id = existing_sound_by_hash.id
+                existing_hash_object = existing_sound_by_hash
+
+        existing_filename_id = None
+        existing_filename_object = None
+        if existing_sound_by_filename is not None:
+            if isinstance(existing_sound_by_filename, dict):
+                existing_filename_id = existing_sound_by_filename["id"]
+                existing_filename_object = existing_sound_by_filename["sound_object"]
+            else:  # Sound object (for tests)
+                existing_filename_id = existing_sound_by_filename.id
+                existing_filename_object = existing_sound_by_filename
+
+        # Hash-first identification strategy
+        if existing_sound_by_hash is not None:
+            # Content exists in database (same hash)
+            if existing_hash_filename == filename:
+                # Same hash, same filename - file unchanged
+                logger.debug("Sound unchanged: %s", filename)
+                results["skipped"] += 1
+                results["files"].append(
+                    {
+                        "filename": filename,
+                        "status": "skipped",
+                        "reason": "file unchanged",
+                        "name": existing_hash_name,
+                        "duration": existing_hash_duration,
+                        "size": existing_hash_size,
+                        "id": existing_hash_id,
+                        "error": None,
+                        "changes": None,
+                    },
+                )
+            else:
+                # Same hash, different filename - file was renamed
+                update_data = {
+                    "filename": filename,
+                    "name": name,
+                }
+
+                await self.sound_repo.update(existing_hash_object, update_data)
+                logger.info(
+                    "Detected rename: %s -> %s (ID: %s)",
+                    existing_hash_filename,
+                    filename,
+                    existing_hash_id,
+                )
+
+                results["updated"] += 1
+                results["files"].append(
+                    {
+                        "filename": filename,
+                        "status": "updated",
+                        "reason": "file was renamed",
+                        "name": name,
+                        "duration": existing_hash_duration,
+                        "size": existing_hash_size,
+                        "id": existing_hash_id,
+                        "error": None,
+                        "changes": ["filename", "name"],
+                        # Store old filename to prevent deletion
+                        "old_filename": existing_hash_filename,
+                    },
+                )
+
+        elif existing_sound_by_filename is not None:
+            # Same filename but different hash - file was modified
+            update_data = {
+                "name": name,
+                "duration": duration,
+                "size": size,
+                "hash": file_hash,
+            }
+
+            await self.sound_repo.update(existing_filename_object, update_data)
+            logger.info(
+                "Updated modified sound: %s (ID: %s)",
+                name,
+                existing_filename_id,
+            )
+
+            results["updated"] += 1
+            results["files"].append(
+                {
+                    "filename": filename,
+                    "status": "updated",
+                    "reason": "file was modified",
+                    "name": name,
+                    "duration": duration,
+                    "size": size,
+                    "id": existing_filename_id,
+                    "error": None,
+                    "changes": ["hash", "duration", "size", "name"],
+                },
+            )
+
+        else:
+            # New file - neither hash nor filename exists
            sound_data = {
                "type": sound_type,
                "name": name,
@@ -222,51 +380,6 @@ class SoundScannerService:
                },
            )

-        elif existing_sound.hash != file_hash:
-            # Update existing sound (file was modified)
-            update_data = {
-                "name": name,
-                "duration": duration,
-                "size": size,
-                "hash": file_hash,
-            }
-
-            await self.sound_repo.update(existing_sound, update_data)
-            logger.info("Updated modified sound: %s (ID: %s)", name, existing_sound.id)
-
-            results["updated"] += 1
-            results["files"].append(
-                {
-                    "filename": filename,
-                    "status": "updated",
-                    "reason": "file was modified",
-                    "name": name,
-                    "duration": duration,
-                    "size": size,
-                    "id": existing_sound.id,
-                    "error": None,
-                    "changes": ["hash", "duration", "size", "name"],
-                },
-            )
-
-        else:
-            # File unchanged, skip
-            logger.debug("Sound unchanged: %s", filename)
-            results["skipped"] += 1
-            results["files"].append(
-                {
-                    "filename": filename,
-                    "status": "skipped",
-                    "reason": "file unchanged",
-                    "name": existing_sound.name,
-                    "duration": existing_sound.duration,
-                    "size": existing_sound.size,
-                    "id": existing_sound.id,
-                    "error": None,
-                    "changes": None,
-                },
-            )
-
    async def scan_soundboard_directory(self) -> ScanResults:
        """Sync the default soundboard directory."""
        soundboard_path = "sounds/originals/soundboard"
--- a/tests/services/test_sound_scanner.py
+++ b/tests/services/test_sound_scanner.py
@@ -152,10 +152,15 @@ class TestSoundScannerService:
                    "errors": 0,
                    "files": [],
                }
+                # Set the existing sound filename to match temp file for "unchanged" test
+                existing_sound.filename = temp_path.name
+                
                await scanner_service._sync_audio_file(
                    temp_path,
                    "SDB",
-                    existing_sound,
+                    existing_sound,  # existing_sound_by_hash (same hash)
+                    None,  # existing_sound_by_filename (no conflict)
+                    "same_hash",
                    results,
                )

@@ -168,6 +173,121 @@ class TestSoundScannerService:
            finally:
                temp_path.unlink()

+    @pytest.mark.asyncio
+    async def test_sync_audio_file_renamed(self, scanner_service) -> None:
+        """Test syncing file that was renamed (same hash, different filename)."""
+        # Existing sound with same hash but different filename
+        existing_sound = Sound(
+            id=1,
+            type="SDB",
+            name="Old Name",
+            filename="old_name.mp3",
+            duration=120000,
+            size=1024,
+            hash="same_hash",
+        )
+        
+        scanner_service.sound_repo.update = AsyncMock(return_value=existing_sound)
+
+        # Mock file operations to return same hash
+        with (
+            patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"),
+            patch("app.services.sound_scanner.get_audio_duration", return_value=120000),
+            patch("app.services.sound_scanner.get_file_size", return_value=1024),
+        ):
+            # Create a temporary file with different name
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+                temp_path = Path(f.name)
+
+            try:
+                results = {
+                    "scanned": 0,
+                    "added": 0,
+                    "updated": 0,
+                    "deleted": 0,
+                    "skipped": 0,
+                    "errors": 0,
+                    "files": [],
+                }
+                
+                await scanner_service._sync_audio_file(
+                    temp_path,
+                    "SDB",
+                    existing_sound,  # existing_sound_by_hash (same hash)
+                    None,  # existing_sound_by_filename (different filename)
+                    "same_hash",
+                    results,
+                )
+
+                # Should be marked as updated (renamed)
+                assert results["updated"] == 1
+                assert results["added"] == 0
+                assert results["skipped"] == 0
+                assert len(results["files"]) == 1
+                assert results["files"][0]["status"] == "updated"
+                assert results["files"][0]["reason"] == "file was renamed"
+                assert results["files"][0]["changes"] == ["filename", "name"]
+                
+                # Verify update was called with new filename
+                scanner_service.sound_repo.update.assert_called_once()
+                call_args = scanner_service.sound_repo.update.call_args[0][1]  # update_data
+                assert call_args["filename"] == temp_path.name
+                
+            finally:
+                temp_path.unlink()
+
+    @pytest.mark.asyncio
+    async def test_scan_directory_rename_no_delete(self, scanner_service, mock_session) -> None:
+        """Test that renamed files are not deleted (regression test)."""
+        # Create a mock existing sound that will be "renamed"
+        existing_sound = Sound(
+            id=1,
+            type="SDB",
+            name="Old Name",
+            filename="old_name.mp3",
+            duration=120000,
+            size=1024,
+            hash="same_hash",
+        )
+        
+        # Mock the repository to return the existing sound
+        scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound])
+        scanner_service.sound_repo.update = AsyncMock()
+        scanner_service.sound_repo.delete = AsyncMock()
+        
+        # Create temporary directory with renamed file
+        import tempfile
+        import os
+        
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Create the "renamed" file (same hash, different name)
+            new_file_path = os.path.join(temp_dir, "new_name.mp3")
+            with open(new_file_path, "wb") as f:
+                f.write(b"test audio content")  # This will produce consistent hash
+            
+            # Mock file operations to return same hash
+            with (
+                patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"),
+                patch("app.services.sound_scanner.get_audio_duration", return_value=120000),
+                patch("app.services.sound_scanner.get_file_size", return_value=1024),
+            ):
+                results = await scanner_service.scan_directory(temp_dir, "SDB")
+                
+                # Should have detected one renamed file
+                assert results["updated"] == 1
+                assert results["deleted"] == 0  # This is the key assertion - no deletion!
+                assert results["added"] == 0
+                assert len(results["files"]) == 1
+                
+                # Verify it was marked as renamed
+                file_result = results["files"][0]
+                assert file_result["status"] == "updated"
+                assert file_result["reason"] == "file was renamed"
+                
+                # Verify update was called but delete was NOT called
+                scanner_service.sound_repo.update.assert_called_once()
+                scanner_service.sound_repo.delete.assert_not_called()
+
    @pytest.mark.asyncio
    async def test_sync_audio_file_new(self, scanner_service) -> None:
        """Test syncing a new audio file."""
@@ -202,7 +322,14 @@ class TestSoundScannerService:
                    "errors": 0,
                    "files": [],
                }
-                await scanner_service._sync_audio_file(temp_path, "SDB", None, results)
+                await scanner_service._sync_audio_file(
+                    temp_path,
+                    "SDB",
+                    None,  # existing_sound_by_hash
+                    None,  # existing_sound_by_filename
+                    "test_hash",
+                    results,
+                )

                assert results["added"] == 1
                assert results["skipped"] == 0
@@ -262,7 +389,9 @@ class TestSoundScannerService:
                await scanner_service._sync_audio_file(
                    temp_path,
                    "SDB",
-                    existing_sound,
+                    None,  # existing_sound_by_hash (different hash)
+                    existing_sound,  # existing_sound_by_filename
+                    "new_hash",
                    results,
                )

@@ -325,7 +454,9 @@ class TestSoundScannerService:
                await scanner_service._sync_audio_file(
                    temp_path,
                    "CUSTOM",
-                    None,
+                    None,  # existing_sound_by_hash
+                    None,  # existing_sound_by_filename
+                    "custom_hash",
                    results,
                )