feat: Implement Text-to-Speech (TTS) functionality with API endpoints, models, and service integration

2025-09-20 23:10:47 +02:00
parent fb0e5e919c
commit 5e8d619736
11 changed files with 887 additions and 0 deletions
--- a/app/services/tts/providers/__init__.py
+++ b/app/services/tts/providers/__init__.py
@@ -0,0 +1,5 @@
+"""TTS providers package."""
+
+from .gtts import GTTSProvider
+
+__all__ = ["GTTSProvider"]
--- a/app/services/tts/providers/gtts.py
+++ b/app/services/tts/providers/gtts.py
@@ -0,0 +1,81 @@
+"""Google Text-to-Speech provider."""
+
+import asyncio
+import io
+from typing import Any
+
+from gtts import gTTS
+
+from ..base import TTSProvider
+
+
+class GTTSProvider(TTSProvider):
+    """Google Text-to-Speech provider implementation."""
+
+    @property
+    def name(self) -> str:
+        """Return the provider name."""
+        return "gtts"
+
+    @property
+    def file_extension(self) -> str:
+        """Return the default file extension for this provider."""
+        return "mp3"
+
+    async def generate_speech(self, text: str, **options: Any) -> bytes:
+        """Generate speech from text using Google TTS.
+
+        Args:
+            text: The text to convert to speech
+            **options: GTTS-specific options (lang, tld, slow)
+
+        Returns:
+            MP3 audio data as bytes
+        """
+        lang = options.get("lang", "en")
+        tld = options.get("tld", "com")
+        slow = options.get("slow", False)
+
+        # Run TTS generation in thread pool since gTTS is synchronous
+        def _generate():
+            tts = gTTS(text=text, lang=lang, tld=tld, slow=slow)
+            fp = io.BytesIO()
+            tts.write_to_fp(fp)
+            fp.seek(0)
+            return fp.read()
+
+        # Use asyncio.to_thread which is more reliable than run_in_executor
+        return await asyncio.to_thread(_generate)
+
+    def get_supported_languages(self) -> list[str]:
+        """Return list of supported language codes."""
+        # Common GTTS supported languages
+        return [
+            "af", "ar", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en",
+            "eo", "es", "et", "fi", "fr", "gu", "hi", "hr", "hu", "hy", "id", "is",
+            "it", "ja", "jw", "km", "kn", "ko", "la", "lv", "mk", "ml", "mr", "my",
+            "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sq", "sr", "su",
+            "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw"
+        ]
+
+    def get_option_schema(self) -> dict[str, Any]:
+        """Return schema for GTTS-specific options."""
+        return {
+            "lang": {
+                "type": "string",
+                "default": "en",
+                "description": "Language code",
+                "enum": self.get_supported_languages()
+            },
+            "tld": {
+                "type": "string",
+                "default": "com",
+                "description": "Top-level domain for Google TTS",
+                "enum": ["com", "co.uk", "com.au", "ca", "co.in", "ie", "co.za"]
+            },
+            "slow": {
+                "type": "boolean",
+                "default": False,
+                "description": "Speak slowly"
+            }
+        }