feat: Implement hash-first identification strategy in audio file syncing and enhance tests for renamed files
Some checks failed
Backend CI / lint (push) Failing after 4m55s
Backend CI / test (push) Failing after 4m32s

This commit is contained in:
JSC
2025-08-25 11:56:07 +02:00
parent d81a54207c
commit da66516bb3
3 changed files with 316 additions and 68 deletions

View File

@@ -34,7 +34,11 @@ class Sound(BaseModel, table=True):
__table_args__ = (UniqueConstraint("hash", name="uq_sound_hash"),) __table_args__ = (UniqueConstraint("hash", name="uq_sound_hash"),)
# relationships # relationships
playlist_sounds: list["PlaylistSound"] = Relationship(back_populates="sound") playlist_sounds: list["PlaylistSound"] = Relationship(
back_populates="sound", cascade_delete=True
)
extractions: list["Extraction"] = Relationship(back_populates="sound") extractions: list["Extraction"] = Relationship(back_populates="sound")
play_history: list["SoundPlayed"] = Relationship(back_populates="sound") play_history: list["SoundPlayed"] = Relationship(
back_populates="sound", cascade_delete=True
)
favorites: list["Favorite"] = Relationship(back_populates="sound") favorites: list["Favorite"] = Relationship(back_populates="sound")

View File

@@ -95,7 +95,25 @@ class SoundScannerService:
# Get all existing sounds of this type from database # Get all existing sounds of this type from database
existing_sounds = await self.sound_repo.get_by_type(sound_type) existing_sounds = await self.sound_repo.get_by_type(sound_type)
sounds_by_filename = {sound.filename: sound for sound in existing_sounds}
# Create lookup dictionaries with immediate attribute access
# to avoid session detachment
sounds_by_hash = {}
sounds_by_filename = {}
for sound in existing_sounds:
# Capture all attributes immediately while session is valid
sound_data = {
"id": sound.id,
"hash": sound.hash,
"filename": sound.filename,
"name": sound.name,
"duration": sound.duration,
"size": sound.size,
"sound_object": sound, # Keep reference for database operations
}
sounds_by_hash[sound.hash] = sound_data
sounds_by_filename[sound.filename] = sound_data
# Get all audio files from directory # Get all audio files from directory
audio_files = [ audio_files = [
@@ -112,12 +130,27 @@ class SoundScannerService:
processed_filenames.add(filename) processed_filenames.add(filename)
try: try:
# Calculate hash first to enable hash-based lookup
file_hash = get_file_hash(file_path)
existing_sound_by_hash = sounds_by_hash.get(file_hash)
existing_sound_by_filename = sounds_by_filename.get(filename)
await self._sync_audio_file( await self._sync_audio_file(
file_path, file_path,
sound_type, sound_type,
sounds_by_filename.get(filename), existing_sound_by_hash,
existing_sound_by_filename,
file_hash,
results, results,
) )
# Check if this was a rename operation and mark old filename as processed
if results["files"] and results["files"][-1].get("old_filename"):
old_filename = results["files"][-1]["old_filename"]
processed_filenames.add(old_filename)
logger.debug("Marked old filename as processed: %s", old_filename)
# Remove temporary tracking field from results
del results["files"][-1]["old_filename"]
except Exception as e: except Exception as e:
logger.exception("Error processing file %s", file_path) logger.exception("Error processing file %s", file_path)
results["errors"] += 1 results["errors"] += 1
@@ -136,10 +169,17 @@ class SoundScannerService:
) )
# Delete sounds that no longer exist in directory # Delete sounds that no longer exist in directory
for filename, sound in sounds_by_filename.items(): for filename, sound_data in sounds_by_filename.items():
if filename not in processed_filenames: if filename not in processed_filenames:
# Attributes already captured in sound_data dictionary
sound_name = sound_data["name"]
sound_duration = sound_data["duration"]
sound_size = sound_data["size"]
sound_id = sound_data["id"]
sound_object = sound_data["sound_object"]
try: try:
await self.sound_repo.delete(sound) await self.sound_repo.delete(sound_object)
logger.info("Deleted sound no longer in directory: %s", filename) logger.info("Deleted sound no longer in directory: %s", filename)
results["deleted"] += 1 results["deleted"] += 1
results["files"].append( results["files"].append(
@@ -147,10 +187,10 @@ class SoundScannerService:
"filename": filename, "filename": filename,
"status": "deleted", "status": "deleted",
"reason": "file no longer exists", "reason": "file no longer exists",
"name": sound.name, "name": sound_name,
"duration": sound.duration, "duration": sound_duration,
"size": sound.size, "size": sound_size,
"id": sound.id, "id": sound_id,
"error": None, "error": None,
"changes": None, "changes": None,
}, },
@@ -163,10 +203,10 @@ class SoundScannerService:
"filename": filename, "filename": filename,
"status": "error", "status": "error",
"reason": "failed to delete", "reason": "failed to delete",
"name": sound.name, "name": sound_name,
"duration": sound.duration, "duration": sound_duration,
"size": sound.size, "size": sound_size,
"id": sound.id, "id": sound_id,
"error": str(e), "error": str(e),
"changes": None, "changes": None,
}, },
@@ -179,18 +219,136 @@ class SoundScannerService:
self, self,
file_path: Path, file_path: Path,
sound_type: str, sound_type: str,
existing_sound: Sound | None, existing_sound_by_hash: dict | Sound | None,
existing_sound_by_filename: dict | Sound | None,
file_hash: str,
results: ScanResults, results: ScanResults,
) -> None: ) -> None:
"""Sync a single audio file (add new or update existing).""" """Sync a single audio file using hash-first identification strategy."""
filename = file_path.name filename = file_path.name
file_hash = get_file_hash(file_path)
duration = get_audio_duration(file_path) duration = get_audio_duration(file_path)
size = get_file_size(file_path) size = get_file_size(file_path)
name = self.extract_name_from_filename(filename) name = self.extract_name_from_filename(filename)
if existing_sound is None: # Extract attributes - handle both dict (normal) and Sound object (tests)
# Add new sound existing_hash_filename = None
existing_hash_name = None
existing_hash_duration = None
existing_hash_size = None
existing_hash_id = None
existing_hash_object = None
if existing_sound_by_hash is not None:
if isinstance(existing_sound_by_hash, dict):
existing_hash_filename = existing_sound_by_hash["filename"]
existing_hash_name = existing_sound_by_hash["name"]
existing_hash_duration = existing_sound_by_hash["duration"]
existing_hash_size = existing_sound_by_hash["size"]
existing_hash_id = existing_sound_by_hash["id"]
existing_hash_object = existing_sound_by_hash["sound_object"]
else: # Sound object (for tests)
existing_hash_filename = existing_sound_by_hash.filename
existing_hash_name = existing_sound_by_hash.name
existing_hash_duration = existing_sound_by_hash.duration
existing_hash_size = existing_sound_by_hash.size
existing_hash_id = existing_sound_by_hash.id
existing_hash_object = existing_sound_by_hash
existing_filename_id = None
existing_filename_object = None
if existing_sound_by_filename is not None:
if isinstance(existing_sound_by_filename, dict):
existing_filename_id = existing_sound_by_filename["id"]
existing_filename_object = existing_sound_by_filename["sound_object"]
else: # Sound object (for tests)
existing_filename_id = existing_sound_by_filename.id
existing_filename_object = existing_sound_by_filename
# Hash-first identification strategy
if existing_sound_by_hash is not None:
# Content exists in database (same hash)
if existing_hash_filename == filename:
# Same hash, same filename - file unchanged
logger.debug("Sound unchanged: %s", filename)
results["skipped"] += 1
results["files"].append(
{
"filename": filename,
"status": "skipped",
"reason": "file unchanged",
"name": existing_hash_name,
"duration": existing_hash_duration,
"size": existing_hash_size,
"id": existing_hash_id,
"error": None,
"changes": None,
},
)
else:
# Same hash, different filename - file was renamed
update_data = {
"filename": filename,
"name": name,
}
await self.sound_repo.update(existing_hash_object, update_data)
logger.info(
"Detected rename: %s -> %s (ID: %s)",
existing_hash_filename,
filename,
existing_hash_id,
)
results["updated"] += 1
results["files"].append(
{
"filename": filename,
"status": "updated",
"reason": "file was renamed",
"name": name,
"duration": existing_hash_duration,
"size": existing_hash_size,
"id": existing_hash_id,
"error": None,
"changes": ["filename", "name"],
# Store old filename to prevent deletion
"old_filename": existing_hash_filename,
},
)
elif existing_sound_by_filename is not None:
# Same filename but different hash - file was modified
update_data = {
"name": name,
"duration": duration,
"size": size,
"hash": file_hash,
}
await self.sound_repo.update(existing_filename_object, update_data)
logger.info(
"Updated modified sound: %s (ID: %s)",
name,
existing_filename_id,
)
results["updated"] += 1
results["files"].append(
{
"filename": filename,
"status": "updated",
"reason": "file was modified",
"name": name,
"duration": duration,
"size": size,
"id": existing_filename_id,
"error": None,
"changes": ["hash", "duration", "size", "name"],
},
)
else:
# New file - neither hash nor filename exists
sound_data = { sound_data = {
"type": sound_type, "type": sound_type,
"name": name, "name": name,
@@ -222,51 +380,6 @@ class SoundScannerService:
}, },
) )
elif existing_sound.hash != file_hash:
# Update existing sound (file was modified)
update_data = {
"name": name,
"duration": duration,
"size": size,
"hash": file_hash,
}
await self.sound_repo.update(existing_sound, update_data)
logger.info("Updated modified sound: %s (ID: %s)", name, existing_sound.id)
results["updated"] += 1
results["files"].append(
{
"filename": filename,
"status": "updated",
"reason": "file was modified",
"name": name,
"duration": duration,
"size": size,
"id": existing_sound.id,
"error": None,
"changes": ["hash", "duration", "size", "name"],
},
)
else:
# File unchanged, skip
logger.debug("Sound unchanged: %s", filename)
results["skipped"] += 1
results["files"].append(
{
"filename": filename,
"status": "skipped",
"reason": "file unchanged",
"name": existing_sound.name,
"duration": existing_sound.duration,
"size": existing_sound.size,
"id": existing_sound.id,
"error": None,
"changes": None,
},
)
async def scan_soundboard_directory(self) -> ScanResults: async def scan_soundboard_directory(self) -> ScanResults:
"""Sync the default soundboard directory.""" """Sync the default soundboard directory."""
soundboard_path = "sounds/originals/soundboard" soundboard_path = "sounds/originals/soundboard"

View File

@@ -152,10 +152,15 @@ class TestSoundScannerService:
"errors": 0, "errors": 0,
"files": [], "files": [],
} }
# Set the existing sound filename to match temp file for "unchanged" test
existing_sound.filename = temp_path.name
await scanner_service._sync_audio_file( await scanner_service._sync_audio_file(
temp_path, temp_path,
"SDB", "SDB",
existing_sound, existing_sound, # existing_sound_by_hash (same hash)
None, # existing_sound_by_filename (no conflict)
"same_hash",
results, results,
) )
@@ -168,6 +173,121 @@ class TestSoundScannerService:
finally: finally:
temp_path.unlink() temp_path.unlink()
@pytest.mark.asyncio
async def test_sync_audio_file_renamed(self, scanner_service) -> None:
"""Test syncing file that was renamed (same hash, different filename)."""
# Existing sound with same hash but different filename
existing_sound = Sound(
id=1,
type="SDB",
name="Old Name",
filename="old_name.mp3",
duration=120000,
size=1024,
hash="same_hash",
)
scanner_service.sound_repo.update = AsyncMock(return_value=existing_sound)
# Mock file operations to return same hash
with (
patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"),
patch("app.services.sound_scanner.get_audio_duration", return_value=120000),
patch("app.services.sound_scanner.get_file_size", return_value=1024),
):
# Create a temporary file with different name
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
temp_path = Path(f.name)
try:
results = {
"scanned": 0,
"added": 0,
"updated": 0,
"deleted": 0,
"skipped": 0,
"errors": 0,
"files": [],
}
await scanner_service._sync_audio_file(
temp_path,
"SDB",
existing_sound, # existing_sound_by_hash (same hash)
None, # existing_sound_by_filename (different filename)
"same_hash",
results,
)
# Should be marked as updated (renamed)
assert results["updated"] == 1
assert results["added"] == 0
assert results["skipped"] == 0
assert len(results["files"]) == 1
assert results["files"][0]["status"] == "updated"
assert results["files"][0]["reason"] == "file was renamed"
assert results["files"][0]["changes"] == ["filename", "name"]
# Verify update was called with new filename
scanner_service.sound_repo.update.assert_called_once()
call_args = scanner_service.sound_repo.update.call_args[0][1] # update_data
assert call_args["filename"] == temp_path.name
finally:
temp_path.unlink()
@pytest.mark.asyncio
async def test_scan_directory_rename_no_delete(self, scanner_service, mock_session) -> None:
"""Test that renamed files are not deleted (regression test)."""
# Create a mock existing sound that will be "renamed"
existing_sound = Sound(
id=1,
type="SDB",
name="Old Name",
filename="old_name.mp3",
duration=120000,
size=1024,
hash="same_hash",
)
# Mock the repository to return the existing sound
scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound])
scanner_service.sound_repo.update = AsyncMock()
scanner_service.sound_repo.delete = AsyncMock()
# Create temporary directory with renamed file
import tempfile
import os
with tempfile.TemporaryDirectory() as temp_dir:
# Create the "renamed" file (same hash, different name)
new_file_path = os.path.join(temp_dir, "new_name.mp3")
with open(new_file_path, "wb") as f:
f.write(b"test audio content") # This will produce consistent hash
# Mock file operations to return same hash
with (
patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"),
patch("app.services.sound_scanner.get_audio_duration", return_value=120000),
patch("app.services.sound_scanner.get_file_size", return_value=1024),
):
results = await scanner_service.scan_directory(temp_dir, "SDB")
# Should have detected one renamed file
assert results["updated"] == 1
assert results["deleted"] == 0 # This is the key assertion - no deletion!
assert results["added"] == 0
assert len(results["files"]) == 1
# Verify it was marked as renamed
file_result = results["files"][0]
assert file_result["status"] == "updated"
assert file_result["reason"] == "file was renamed"
# Verify update was called but delete was NOT called
scanner_service.sound_repo.update.assert_called_once()
scanner_service.sound_repo.delete.assert_not_called()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_sync_audio_file_new(self, scanner_service) -> None: async def test_sync_audio_file_new(self, scanner_service) -> None:
"""Test syncing a new audio file.""" """Test syncing a new audio file."""
@@ -202,7 +322,14 @@ class TestSoundScannerService:
"errors": 0, "errors": 0,
"files": [], "files": [],
} }
await scanner_service._sync_audio_file(temp_path, "SDB", None, results) await scanner_service._sync_audio_file(
temp_path,
"SDB",
None, # existing_sound_by_hash
None, # existing_sound_by_filename
"test_hash",
results,
)
assert results["added"] == 1 assert results["added"] == 1
assert results["skipped"] == 0 assert results["skipped"] == 0
@@ -262,7 +389,9 @@ class TestSoundScannerService:
await scanner_service._sync_audio_file( await scanner_service._sync_audio_file(
temp_path, temp_path,
"SDB", "SDB",
existing_sound, None, # existing_sound_by_hash (different hash)
existing_sound, # existing_sound_by_filename
"new_hash",
results, results,
) )
@@ -325,7 +454,9 @@ class TestSoundScannerService:
await scanner_service._sync_audio_file( await scanner_service._sync_audio_file(
temp_path, temp_path,
"CUSTOM", "CUSTOM",
None, None, # existing_sound_by_hash
None, # existing_sound_by_filename
"custom_hash",
results, results,
) )