feat: Enhance SoundScannerService with duplicate detection and normalized file handling
This commit is contained in:
@@ -35,6 +35,7 @@ class ScanResults(TypedDict):
|
||||
updated: int
|
||||
deleted: int
|
||||
skipped: int
|
||||
duplicates: int
|
||||
errors: int
|
||||
files: list[FileInfo]
|
||||
|
||||
@@ -55,6 +56,13 @@ class SoundScannerService:
|
||||
".m4a",
|
||||
".aac",
|
||||
}
|
||||
|
||||
# Directory mappings for normalized files (matching sound_normalizer)
|
||||
self.normalized_directories = {
|
||||
"SDB": "sounds/normalized/soundboard",
|
||||
"TTS": "sounds/normalized/text_to_speech",
|
||||
"EXT": "sounds/normalized/extracted",
|
||||
}
|
||||
|
||||
def extract_name_from_filename(self, filename: str) -> str:
|
||||
"""Extract a clean name from filename."""
|
||||
@@ -64,6 +72,42 @@ class SoundScannerService:
|
||||
name = name.replace("_", " ").replace("-", " ")
|
||||
# Capitalize words
|
||||
return " ".join(word.capitalize() for word in name.split())
|
||||
|
||||
def _get_normalized_path(self, sound_type: str, filename: str) -> Path:
|
||||
"""Get the normalized file path for a sound."""
|
||||
directory = self.normalized_directories.get(sound_type, "sounds/normalized/other")
|
||||
return Path(directory) / filename
|
||||
|
||||
def _rename_normalized_file(self, sound_type: str, old_filename: str, new_filename: str) -> bool:
|
||||
"""Rename a normalized file if it exists. Returns True if renamed, False if not found."""
|
||||
old_path = self._get_normalized_path(sound_type, old_filename)
|
||||
new_path = self._get_normalized_path(sound_type, new_filename)
|
||||
|
||||
if old_path.exists():
|
||||
try:
|
||||
# Ensure the directory exists
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
old_path.rename(new_path)
|
||||
logger.info("Renamed normalized file: %s -> %s", old_path, new_path)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error("Failed to rename normalized file %s -> %s: %s", old_path, new_path, e)
|
||||
return False
|
||||
return False
|
||||
|
||||
def _delete_normalized_file(self, sound_type: str, filename: str) -> bool:
|
||||
"""Delete a normalized file if it exists. Returns True if deleted, False if not found."""
|
||||
normalized_path = self._get_normalized_path(sound_type, filename)
|
||||
|
||||
if normalized_path.exists():
|
||||
try:
|
||||
normalized_path.unlink()
|
||||
logger.info("Deleted normalized file: %s", normalized_path)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error("Failed to delete normalized file %s: %s", normalized_path, e)
|
||||
return False
|
||||
return False
|
||||
|
||||
async def scan_directory(
|
||||
self,
|
||||
@@ -87,6 +131,7 @@ class SoundScannerService:
|
||||
"updated": 0,
|
||||
"deleted": 0,
|
||||
"skipped": 0,
|
||||
"duplicates": 0,
|
||||
"errors": 0,
|
||||
"files": [],
|
||||
}
|
||||
@@ -110,6 +155,9 @@ class SoundScannerService:
|
||||
"name": sound.name,
|
||||
"duration": sound.duration,
|
||||
"size": sound.size,
|
||||
"type": sound.type,
|
||||
"is_normalized": sound.is_normalized,
|
||||
"normalized_filename": sound.normalized_filename,
|
||||
"sound_object": sound, # Keep reference for database operations
|
||||
}
|
||||
sounds_by_hash[sound.hash] = sound_data
|
||||
@@ -177,10 +225,20 @@ class SoundScannerService:
|
||||
sound_size = sound_data["size"]
|
||||
sound_id = sound_data["id"]
|
||||
sound_object = sound_data["sound_object"]
|
||||
sound_type = sound_data["type"]
|
||||
sound_is_normalized = sound_data["is_normalized"]
|
||||
sound_normalized_filename = sound_data["normalized_filename"]
|
||||
|
||||
try:
|
||||
# Delete the sound from database first
|
||||
await self.sound_repo.delete(sound_object)
|
||||
logger.info("Deleted sound no longer in directory: %s", filename)
|
||||
|
||||
# If the sound had a normalized file, delete it too
|
||||
if sound_is_normalized and sound_normalized_filename:
|
||||
normalized_base = Path(sound_normalized_filename).name
|
||||
self._delete_normalized_file(sound_type, normalized_base)
|
||||
|
||||
results["deleted"] += 1
|
||||
results["files"].append(
|
||||
{
|
||||
@@ -237,6 +295,9 @@ class SoundScannerService:
|
||||
existing_hash_size = None
|
||||
existing_hash_id = None
|
||||
existing_hash_object = None
|
||||
existing_hash_type = None
|
||||
existing_hash_is_normalized = None
|
||||
existing_hash_normalized_filename = None
|
||||
|
||||
if existing_sound_by_hash is not None:
|
||||
if isinstance(existing_sound_by_hash, dict):
|
||||
@@ -246,6 +307,9 @@ class SoundScannerService:
|
||||
existing_hash_size = existing_sound_by_hash["size"]
|
||||
existing_hash_id = existing_sound_by_hash["id"]
|
||||
existing_hash_object = existing_sound_by_hash["sound_object"]
|
||||
existing_hash_type = existing_sound_by_hash["type"]
|
||||
existing_hash_is_normalized = existing_sound_by_hash["is_normalized"]
|
||||
existing_hash_normalized_filename = existing_sound_by_hash["normalized_filename"]
|
||||
else: # Sound object (for tests)
|
||||
existing_hash_filename = existing_sound_by_hash.filename
|
||||
existing_hash_name = existing_sound_by_hash.name
|
||||
@@ -253,6 +317,9 @@ class SoundScannerService:
|
||||
existing_hash_size = existing_sound_by_hash.size
|
||||
existing_hash_id = existing_sound_by_hash.id
|
||||
existing_hash_object = existing_sound_by_hash
|
||||
existing_hash_type = existing_sound_by_hash.type
|
||||
existing_hash_is_normalized = existing_sound_by_hash.is_normalized
|
||||
existing_hash_normalized_filename = existing_sound_by_hash.normalized_filename
|
||||
|
||||
existing_filename_id = None
|
||||
existing_filename_object = None
|
||||
@@ -285,36 +352,90 @@ class SoundScannerService:
|
||||
},
|
||||
)
|
||||
else:
|
||||
# Same hash, different filename - file was renamed
|
||||
update_data = {
|
||||
"filename": filename,
|
||||
"name": name,
|
||||
}
|
||||
# Same hash, different filename - could be rename or duplicate
|
||||
# Check if both files exist to determine if it's a duplicate
|
||||
old_file_path = file_path.parent / existing_hash_filename
|
||||
if old_file_path.exists():
|
||||
# Both files exist with same hash - this is a duplicate
|
||||
logger.warning(
|
||||
"Duplicate file detected: '%s' has same content as existing '%s' (hash: %s). "
|
||||
"Skipping duplicate file.",
|
||||
filename,
|
||||
existing_hash_filename,
|
||||
file_hash[:8] + "...",
|
||||
)
|
||||
|
||||
await self.sound_repo.update(existing_hash_object, update_data)
|
||||
logger.info(
|
||||
"Detected rename: %s -> %s (ID: %s)",
|
||||
existing_hash_filename,
|
||||
filename,
|
||||
existing_hash_id,
|
||||
)
|
||||
|
||||
results["updated"] += 1
|
||||
results["files"].append(
|
||||
{
|
||||
results["skipped"] += 1
|
||||
results["duplicates"] += 1
|
||||
results["files"].append(
|
||||
{
|
||||
"filename": filename,
|
||||
"status": "skipped",
|
||||
"reason": "duplicate content",
|
||||
"name": existing_hash_name,
|
||||
"duration": existing_hash_duration,
|
||||
"size": existing_hash_size,
|
||||
"id": existing_hash_id,
|
||||
"error": None,
|
||||
"changes": None,
|
||||
},
|
||||
)
|
||||
else:
|
||||
# Old file doesn't exist - this is a genuine rename
|
||||
update_data = {
|
||||
"filename": filename,
|
||||
"status": "updated",
|
||||
"reason": "file was renamed",
|
||||
"name": name,
|
||||
"duration": existing_hash_duration,
|
||||
"size": existing_hash_size,
|
||||
"id": existing_hash_id,
|
||||
"error": None,
|
||||
"changes": ["filename", "name"],
|
||||
# Store old filename to prevent deletion
|
||||
"old_filename": existing_hash_filename,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
# If the sound has a normalized file, rename it too
|
||||
if existing_hash_is_normalized and existing_hash_normalized_filename:
|
||||
# Extract base filename without path for normalized file
|
||||
old_normalized_base = Path(existing_hash_normalized_filename).name
|
||||
new_normalized_base = Path(filename).stem + Path(existing_hash_normalized_filename).suffix
|
||||
|
||||
renamed = self._rename_normalized_file(
|
||||
existing_hash_type,
|
||||
old_normalized_base,
|
||||
new_normalized_base
|
||||
)
|
||||
|
||||
if renamed:
|
||||
update_data["normalized_filename"] = new_normalized_base
|
||||
logger.info(
|
||||
"Renamed normalized file: %s -> %s",
|
||||
old_normalized_base,
|
||||
new_normalized_base
|
||||
)
|
||||
|
||||
await self.sound_repo.update(existing_hash_object, update_data)
|
||||
logger.info(
|
||||
"Detected rename: %s -> %s (ID: %s)",
|
||||
existing_hash_filename,
|
||||
filename,
|
||||
existing_hash_id,
|
||||
)
|
||||
|
||||
# Build changes list
|
||||
changes = ["filename", "name"]
|
||||
if "normalized_filename" in update_data:
|
||||
changes.append("normalized_filename")
|
||||
|
||||
results["updated"] += 1
|
||||
results["files"].append(
|
||||
{
|
||||
"filename": filename,
|
||||
"status": "updated",
|
||||
"reason": "file was renamed",
|
||||
"name": name,
|
||||
"duration": existing_hash_duration,
|
||||
"size": existing_hash_size,
|
||||
"id": existing_hash_id,
|
||||
"error": None,
|
||||
"changes": changes,
|
||||
# Store old filename to prevent deletion
|
||||
"old_filename": existing_hash_filename,
|
||||
},
|
||||
)
|
||||
|
||||
elif existing_sound_by_filename is not None:
|
||||
# Same filename but different hash - file was modified
|
||||
|
||||
@@ -288,6 +288,59 @@ class TestSoundScannerService:
|
||||
scanner_service.sound_repo.update.assert_called_once()
|
||||
scanner_service.sound_repo.delete.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scan_directory_duplicate_detection(self, scanner_service, mock_session) -> None:
|
||||
"""Test that duplicate files (same hash) are detected and logged."""
|
||||
# Create a mock existing sound
|
||||
existing_sound = Sound(
|
||||
id=1,
|
||||
type="SDB",
|
||||
name="Original Song",
|
||||
filename="original.mp3",
|
||||
duration=120000,
|
||||
size=1024,
|
||||
hash="same_hash",
|
||||
)
|
||||
|
||||
# Mock the repository
|
||||
scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound])
|
||||
scanner_service.sound_repo.update = AsyncMock()
|
||||
|
||||
# Create temporary directory with both original and duplicate files
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Create both files (simulating duplicate content)
|
||||
original_path = os.path.join(temp_dir, "original.mp3")
|
||||
duplicate_path = os.path.join(temp_dir, "duplicate.mp3")
|
||||
|
||||
with open(original_path, "wb") as f:
|
||||
f.write(b"test audio content")
|
||||
with open(duplicate_path, "wb") as f:
|
||||
f.write(b"test audio content") # Same content = same hash
|
||||
|
||||
# Mock file operations
|
||||
with (
|
||||
patch("app.services.sound_scanner.get_file_hash", return_value="same_hash"),
|
||||
patch("app.services.sound_scanner.get_audio_duration", return_value=120000),
|
||||
patch("app.services.sound_scanner.get_file_size", return_value=1024),
|
||||
):
|
||||
results = await scanner_service.scan_directory(temp_dir, "SDB")
|
||||
|
||||
# Should have 1 unchanged (original) and 1 skipped (duplicate)
|
||||
assert results["skipped"] == 2 # Both files have same hash, both skipped
|
||||
assert results["duplicates"] == 1 # One duplicate detected
|
||||
assert results["updated"] == 0
|
||||
assert results["added"] == 0
|
||||
assert results["deleted"] == 0
|
||||
|
||||
# Check that duplicate was properly detected
|
||||
skipped_files = [f for f in results["files"] if f["status"] == "skipped"]
|
||||
duplicate_file = next((f for f in skipped_files if "duplicate" in f["reason"]), None)
|
||||
assert duplicate_file is not None
|
||||
assert duplicate_file["reason"] == "duplicate content"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sync_audio_file_new(self, scanner_service) -> None:
|
||||
"""Test syncing a new audio file."""
|
||||
@@ -477,3 +530,175 @@ class TestSoundScannerService:
|
||||
) # All sounds are set to not deletable
|
||||
finally:
|
||||
temp_path.unlink()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sync_audio_file_rename_with_normalized_file(
|
||||
self, test_session, scanner_service
|
||||
):
|
||||
"""Test that renaming a sound file also renames its normalized file."""
|
||||
# Create temporary directories for testing
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_dir_path = Path(temp_dir)
|
||||
|
||||
# Set up the scanner's normalized directories to use temp dir
|
||||
scanner_service.normalized_directories = {
|
||||
"SDB": str(temp_dir_path / "normalized" / "soundboard")
|
||||
}
|
||||
|
||||
# Create the normalized directory
|
||||
normalized_dir = temp_dir_path / "normalized" / "soundboard"
|
||||
normalized_dir.mkdir(parents=True)
|
||||
|
||||
# Create the old normalized file
|
||||
old_normalized_file = normalized_dir / "old_sound.mp3"
|
||||
old_normalized_file.write_text("normalized audio content")
|
||||
|
||||
# Create the audio files (they need to exist for the scanner)
|
||||
old_path = temp_dir_path / "old_sound.mp3"
|
||||
new_path = temp_dir_path / "new_sound.mp3"
|
||||
|
||||
# Create a dummy audio file for the new path
|
||||
new_path.write_bytes(b"fake audio data for testing")
|
||||
|
||||
# Mock the audio utility functions since we're using fake files
|
||||
from unittest.mock import patch
|
||||
with patch('app.services.sound_scanner.get_audio_duration', return_value=60000), \
|
||||
patch('app.services.sound_scanner.get_file_size', return_value=2048):
|
||||
|
||||
# Create existing sound with normalized file info
|
||||
existing_sound = Sound(
|
||||
id=1,
|
||||
type="SDB",
|
||||
name="Old Sound",
|
||||
filename="old_sound.mp3",
|
||||
duration=60000,
|
||||
size=2048,
|
||||
hash="test_hash",
|
||||
is_normalized=True,
|
||||
normalized_filename="old_sound.mp3",
|
||||
normalized_duration=60000,
|
||||
normalized_size=1024,
|
||||
normalized_hash="normalized_hash",
|
||||
play_count=5,
|
||||
is_deletable=False,
|
||||
is_music=False
|
||||
)
|
||||
|
||||
results = {
|
||||
"scanned": 0,
|
||||
"added": 0,
|
||||
"updated": 0,
|
||||
"deleted": 0,
|
||||
"skipped": 0,
|
||||
"duplicates": 0,
|
||||
"errors": 0,
|
||||
"files": [],
|
||||
}
|
||||
|
||||
# Mock the sound repository update
|
||||
scanner_service.sound_repo.update = AsyncMock()
|
||||
|
||||
# Simulate rename detection by calling _sync_audio_file
|
||||
await scanner_service._sync_audio_file(
|
||||
new_path,
|
||||
"SDB",
|
||||
existing_sound, # existing_sound_by_hash (same hash, different filename)
|
||||
None, # existing_sound_by_filename (no file with new name exists)
|
||||
"test_hash",
|
||||
results,
|
||||
)
|
||||
|
||||
# Verify the results
|
||||
assert results["updated"] == 1
|
||||
assert len(results["files"]) == 1
|
||||
assert results["files"][0]["status"] == "updated"
|
||||
assert results["files"][0]["reason"] == "file was renamed"
|
||||
assert "normalized_filename" in results["files"][0]["changes"]
|
||||
|
||||
# Verify sound_repo.update was called with normalized filename update
|
||||
update_call = scanner_service.sound_repo.update.call_args
|
||||
update_data = update_call[0][1] # Second argument is the update data
|
||||
|
||||
assert "filename" in update_data
|
||||
assert "name" in update_data
|
||||
assert "normalized_filename" in update_data
|
||||
assert update_data["normalized_filename"] == "new_sound.mp3"
|
||||
|
||||
# Verify the normalized file was actually renamed
|
||||
new_normalized_file = normalized_dir / "new_sound.mp3"
|
||||
assert new_normalized_file.exists()
|
||||
assert not old_normalized_file.exists()
|
||||
assert new_normalized_file.read_text() == "normalized audio content"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scan_directory_delete_with_normalized_file(
|
||||
self, test_session, scanner_service
|
||||
):
|
||||
"""Test that deleting a sound also deletes its normalized file."""
|
||||
# Create temporary directories for testing
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_dir_path = Path(temp_dir)
|
||||
scan_dir = temp_dir_path / "sounds"
|
||||
scan_dir.mkdir()
|
||||
|
||||
# Set up the scanner's normalized directories to use temp dir
|
||||
scanner_service.normalized_directories = {
|
||||
"SDB": str(temp_dir_path / "normalized" / "soundboard")
|
||||
}
|
||||
|
||||
# Create the normalized directory and file
|
||||
normalized_dir = temp_dir_path / "normalized" / "soundboard"
|
||||
normalized_dir.mkdir(parents=True)
|
||||
normalized_file = normalized_dir / "test_sound.mp3"
|
||||
normalized_file.write_text("normalized audio content")
|
||||
|
||||
# Create existing sound with normalized file info
|
||||
existing_sound = Sound(
|
||||
id=1,
|
||||
type="SDB",
|
||||
name="Test Sound",
|
||||
filename="test_sound.mp3",
|
||||
duration=60000,
|
||||
size=2048,
|
||||
hash="test_hash",
|
||||
is_normalized=True,
|
||||
normalized_filename="test_sound.mp3",
|
||||
normalized_duration=60000,
|
||||
normalized_size=1024,
|
||||
normalized_hash="normalized_hash",
|
||||
play_count=5,
|
||||
is_deletable=False,
|
||||
is_music=False
|
||||
)
|
||||
|
||||
# Mock sound repository methods
|
||||
scanner_service.sound_repo.get_by_type = AsyncMock(return_value=[existing_sound])
|
||||
scanner_service.sound_repo.delete = AsyncMock()
|
||||
|
||||
# Mock audio utility functions
|
||||
from unittest.mock import patch
|
||||
with patch('app.services.sound_scanner.get_audio_duration'), \
|
||||
patch('app.services.sound_scanner.get_file_size'):
|
||||
|
||||
# Run scan with empty directory (should trigger deletion)
|
||||
results = await scanner_service.scan_directory(str(scan_dir), "SDB")
|
||||
|
||||
# Verify the results
|
||||
assert results["deleted"] == 1
|
||||
assert results["added"] == 0
|
||||
assert results["updated"] == 0
|
||||
assert len(results["files"]) == 1
|
||||
assert results["files"][0]["status"] == "deleted"
|
||||
assert results["files"][0]["reason"] == "file no longer exists"
|
||||
|
||||
# Verify sound_repo.delete was called
|
||||
scanner_service.sound_repo.delete.assert_called_once_with(existing_sound)
|
||||
|
||||
# Verify the normalized file was actually deleted
|
||||
assert not normalized_file.exists()
|
||||
|
||||
Reference in New Issue
Block a user