feat: Enhance SoundScannerService with duplicate detection and normalized file handling
Some checks failed
Backend CI / lint (push) Failing after 4m52s
Backend CI / test (push) Failing after 4m39s

This commit is contained in:
JSC
2025-08-25 12:33:10 +02:00
parent da66516bb3
commit d3ce17f10d
2 changed files with 373 additions and 27 deletions

View File

@@ -35,6 +35,7 @@ class ScanResults(TypedDict):
updated: int
deleted: int
skipped: int
duplicates: int
errors: int
files: list[FileInfo]
@@ -55,6 +56,13 @@ class SoundScannerService:
".m4a",
".aac",
}
# Directory mappings for normalized files (matching sound_normalizer)
self.normalized_directories = {
"SDB": "sounds/normalized/soundboard",
"TTS": "sounds/normalized/text_to_speech",
"EXT": "sounds/normalized/extracted",
}
def extract_name_from_filename(self, filename: str) -> str:
"""Extract a clean name from filename."""
@@ -64,6 +72,42 @@ class SoundScannerService:
name = name.replace("_", " ").replace("-", " ")
# Capitalize words
return " ".join(word.capitalize() for word in name.split())
def _get_normalized_path(self, sound_type: str, filename: str) -> Path:
    """Build the path of the normalized copy of a sound file.

    The directory is looked up in ``self.normalized_directories`` by
    ``sound_type``; an unknown type falls back to
    ``sounds/normalized/other``.

    Args:
        sound_type: Key into ``self.normalized_directories``.
        filename: File name appended to the resolved directory.

    Returns:
        The resolved directory joined with ``filename``. Nothing is checked
        on disk; the file may or may not exist.
    """
    fallback_dir = "sounds/normalized/other"
    base_dir = self.normalized_directories.get(sound_type, fallback_dir)
    return Path(base_dir).joinpath(filename)
def _rename_normalized_file(
    self,
    sound_type: str,
    old_filename: str,
    new_filename: str,
) -> bool:
    """Rename a sound's normalized file, if that file exists.

    Both paths are resolved through ``self._get_normalized_path`` using the
    same ``sound_type``.

    Args:
        sound_type: Sound type used to resolve the normalized directory.
        old_filename: Current name of the normalized file.
        new_filename: Name the normalized file should get.

    Returns:
        True if the file was renamed; False if the old file does not exist
        or the rename failed with an OS-level error (which is logged).
    """
    old_path = self._get_normalized_path(sound_type, old_filename)
    new_path = self._get_normalized_path(sound_type, new_filename)

    # Nothing to rename; return before touching the filesystem so no
    # directory is created for a file that is not there.
    if not old_path.exists():
        return False

    try:
        # Ensure the destination directory exists before moving the file.
        new_path.parent.mkdir(parents=True, exist_ok=True)
        # NOTE(review): Path.rename silently replaces an existing destination
        # on POSIX - confirm that overwriting is acceptable here.
        old_path.rename(new_path)
    except OSError as e:
        # Only filesystem failures are treated as a failed rename;
        # programming errors propagate instead of being reported as one.
        logger.exception(
            "Failed to rename normalized file %s -> %s: %s",
            old_path,
            new_path,
            e,
        )
        return False
    else:
        logger.info("Renamed normalized file: %s -> %s", old_path, new_path)
        return True
def _delete_normalized_file(self, sound_type: str, filename: str) -> bool:
    """Delete a sound's normalized file, if that file exists.

    The path is resolved through ``self._get_normalized_path``.

    Args:
        sound_type: Sound type used to resolve the normalized directory.
        filename: Name of the normalized file to remove.

    Returns:
        True if the file was deleted; False if it does not exist or the
        deletion failed with an OS-level error (which is logged).
    """
    normalized_path = self._get_normalized_path(sound_type, filename)

    if not normalized_path.exists():
        return False

    try:
        normalized_path.unlink()
    except OSError as e:
        # Only filesystem failures are treated as a failed delete;
        # programming errors propagate instead of being reported as one.
        logger.exception(
            "Failed to delete normalized file %s: %s",
            normalized_path,
            e,
        )
        return False
    else:
        logger.info("Deleted normalized file: %s", normalized_path)
        return True
async def scan_directory(
self,
@@ -87,6 +131,7 @@ class SoundScannerService:
"updated": 0,
"deleted": 0,
"skipped": 0,
"duplicates": 0,
"errors": 0,
"files": [],
}
@@ -110,6 +155,9 @@ class SoundScannerService:
"name": sound.name,
"duration": sound.duration,
"size": sound.size,
"type": sound.type,
"is_normalized": sound.is_normalized,
"normalized_filename": sound.normalized_filename,
"sound_object": sound, # Keep reference for database operations
}
sounds_by_hash[sound.hash] = sound_data
@@ -177,10 +225,20 @@ class SoundScannerService:
sound_size = sound_data["size"]
sound_id = sound_data["id"]
sound_object = sound_data["sound_object"]
sound_type = sound_data["type"]
sound_is_normalized = sound_data["is_normalized"]
sound_normalized_filename = sound_data["normalized_filename"]
try:
# Delete the sound from database first
await self.sound_repo.delete(sound_object)
logger.info("Deleted sound no longer in directory: %s", filename)
# If the sound had a normalized file, delete it too
if sound_is_normalized and sound_normalized_filename:
normalized_base = Path(sound_normalized_filename).name
self._delete_normalized_file(sound_type, normalized_base)
results["deleted"] += 1
results["files"].append(
{
@@ -237,6 +295,9 @@ class SoundScannerService:
existing_hash_size = None
existing_hash_id = None
existing_hash_object = None
existing_hash_type = None
existing_hash_is_normalized = None
existing_hash_normalized_filename = None
if existing_sound_by_hash is not None:
if isinstance(existing_sound_by_hash, dict):
@@ -246,6 +307,9 @@ class SoundScannerService:
existing_hash_size = existing_sound_by_hash["size"]
existing_hash_id = existing_sound_by_hash["id"]
existing_hash_object = existing_sound_by_hash["sound_object"]
existing_hash_type = existing_sound_by_hash["type"]
existing_hash_is_normalized = existing_sound_by_hash["is_normalized"]
existing_hash_normalized_filename = existing_sound_by_hash["normalized_filename"]
else: # Sound object (for tests)
existing_hash_filename = existing_sound_by_hash.filename
existing_hash_name = existing_sound_by_hash.name
@@ -253,6 +317,9 @@ class SoundScannerService:
existing_hash_size = existing_sound_by_hash.size
existing_hash_id = existing_sound_by_hash.id
existing_hash_object = existing_sound_by_hash
existing_hash_type = existing_sound_by_hash.type
existing_hash_is_normalized = existing_sound_by_hash.is_normalized
existing_hash_normalized_filename = existing_sound_by_hash.normalized_filename
existing_filename_id = None
existing_filename_object = None
@@ -285,36 +352,90 @@ class SoundScannerService:
},
)
else:
# Same hash, different filename - file was renamed
update_data = {
"filename": filename,
"name": name,
}
# Same hash, different filename - could be rename or duplicate
# Check if both files exist to determine if it's a duplicate
old_file_path = file_path.parent / existing_hash_filename
if old_file_path.exists():
# Both files exist with same hash - this is a duplicate
logger.warning(
"Duplicate file detected: '%s' has same content as existing '%s' (hash: %s). "
"Skipping duplicate file.",
filename,
existing_hash_filename,
file_hash[:8] + "...",
)
await self.sound_repo.update(existing_hash_object, update_data)
logger.info(
"Detected rename: %s -> %s (ID: %s)",
existing_hash_filename,
filename,
existing_hash_id,
)
results["updated"] += 1
results["files"].append(
{
results["skipped"] += 1
results["duplicates"] += 1
results["files"].append(
{
"filename": filename,
"status": "skipped",
"reason": "duplicate content",
"name": existing_hash_name,
"duration": existing_hash_duration,
"size": existing_hash_size,
"id": existing_hash_id,
"error": None,
"changes": None,
},
)
else:
# Old file doesn't exist - this is a genuine rename
update_data = {
"filename": filename,
"status": "updated",
"reason": "file was renamed",
"name": name,
"duration": existing_hash_duration,
"size": existing_hash_size,
"id": existing_hash_id,
"error": None,
"changes": ["filename", "name"],
# Store old filename to prevent deletion
"old_filename": existing_hash_filename,
},
)
}
# If the sound has a normalized file, rename it too
if existing_hash_is_normalized and existing_hash_normalized_filename:
# Extract base filename without path for normalized file
old_normalized_base = Path(existing_hash_normalized_filename).name
new_normalized_base = Path(filename).stem + Path(existing_hash_normalized_filename).suffix
renamed = self._rename_normalized_file(
existing_hash_type,
old_normalized_base,
new_normalized_base
)
if renamed:
update_data["normalized_filename"] = new_normalized_base
logger.info(
"Renamed normalized file: %s -> %s",
old_normalized_base,
new_normalized_base
)
await self.sound_repo.update(existing_hash_object, update_data)
logger.info(
"Detected rename: %s -> %s (ID: %s)",
existing_hash_filename,
filename,
existing_hash_id,
)
# Build changes list
changes = ["filename", "name"]
if "normalized_filename" in update_data:
changes.append("normalized_filename")
results["updated"] += 1
results["files"].append(
{
"filename": filename,
"status": "updated",
"reason": "file was renamed",
"name": name,
"duration": existing_hash_duration,
"size": existing_hash_size,
"id": existing_hash_id,
"error": None,
"changes": changes,
# Store old filename to prevent deletion
"old_filename": existing_hash_filename,
},
)
elif existing_sound_by_filename is not None:
# Same filename but different hash - file was modified