Spaces:

AlserFurma
/

LipSyncAI

Running

App Files Community

AlserFurma committed 20 days ago

Commit

80ff644

verified ·

1 Parent(s): 534e83d

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -76

app.py CHANGED Viewed

@@ -4,69 +4,109 @@ from PIL import Image
 import tempfile
 from gradio_client import Client, handle_file
 import torch
-from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
 import traceback
-# Загрузка моделей при старте
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 try:
-    # TTS модель для казахского (исправлено с rus на kaz)
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
     tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
-    print("TTS модель (kaz) загружена успешно!")
 except Exception as e:
-    raise RuntimeError(f"Ошибка загрузки TTS модели: {str(e)}")
-# Space для talking-head
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 def inference(image: Image.Image, text: str):
     error_msg = ""
     video_path = None
     audio_path = None
     img_path = None
     try:
-        # Валидация входных данных
         if image is None:
             raise ValueError("Загрузите изображение лектора!")
         if not text or not text.strip():
             raise ValueError("Введите текст лекции!")
         if len(text) > 500:
-            raise ValueError("Текст слишком длинный! Используйте до 500 символов.")
-        print(f"Генерация TTS для текста: '{text[:50]}...'")
-        # Шаг 1: Генерация аудио через TTS
-        torch.manual_seed(42)
-        inputs = tts_tokenizer(text, return_tensors="pt").to(device)
         with torch.no_grad():
             output = tts_model(**inputs)
-            waveform = output.waveform.squeeze().cpu().numpy()
-        if waveform.size == 0:
-            raise ValueError("TTS сгенерировал пустое аудио! Попробуйте другой текст.")
-        # Конвертация в int16 для WAV
         audio = (waveform * 32767).astype("int16")
         sampling_rate = tts_model.config.sampling_rate
-        # Сохранение аудио
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
-            wavfile.write(audio_file.name, sampling_rate, audio)
-            audio_path = audio_file.name
-        print(f"TTS аудио сохранено: {audio_path} (длина: {len(waveform)/sampling_rate:.1f} сек)")
-        # Шаг 2: Сохранение изображения
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as img_file:
-            # Конвертация в RGB если нужно
-            if image.mode != 'RGB':
-                image = image.convert('RGB')
-            image.save(img_file.name, format='PNG')
-            img_path = img_file.name
-        print(f"Изображение сохранено: {img_path}")
-        # Шаг 3: Вызов talking-head API
-        print(f"Подключение к {TALKING_HEAD_SPACE}...")
         client = Client(TALKING_HEAD_SPACE)
-        # Проверяем доступные API endpoints
-        print("Доступные API методы:", client.view_api())
-        # Вызов API с правильными параметрами
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
@@ -74,67 +114,63 @@ def inference(image: Image.Image, text: str):
             steps=10,
             api_name="/process_image_audio"
         )
-        print(f"Результат API: {type(result)}")
-        # Обработка результата
-        if isinstance(result, tuple) and len(result) > 0:
-            video_data = result[0]
-            if isinstance(video_data, dict) and 'video' in video_data:
-                video_path = video_data['video']
-            elif isinstance(video_data, dict) and 'path' in video_data:
-                video_path = video_data['path']
-            elif isinstance(video_data, str):
-                video_path = video_data
-            else:
-                video_path = video_data
         else:
-            video_path = result
-        print(f"Видео сгенерировано: {video_path}")
-        error_msg = "✅ Видео успешно сгенерировано!"
     except Exception as e:
         error_msg = f"❌ Ошибка: {str(e)}"
-        print(f"ОШИБКА: {error_msg}")
         traceback.print_exc()
     finally:
-        # Очистка временных файлов
-        if audio_path and os.path.exists(audio_path):
-            try:
-                os.remove(audio_path)
-                print(f"Удален временный файл: {audio_path}")
-            except:
-                pass
-        if img_path and os.path.exists(img_path):
-            try:
-                os.remove(img_path)
-                print(f"Удален временный файл: {img_path}")
-            except:
-                pass
     return video_path, error_msg
-# Интерфейс Gradio
-title = "🎓 Видео-лектор с TTS (Русский)"
-description = """Загрузите фото лектора и введите текст лекции. Система сгенерирует видео, где лектор "произносит" ваш текст!
-**Требования:**
-- Фото: фронтальное изображение лица
-- Текст: до 500 символов на русском языке"""
 iface = gr.Interface(
     fn=inference,
     inputs=[
-        gr.Image(type="pil", label="📸 Фото лектора"),
         gr.Textbox(
             lines=5,
-            placeholder="Введите текст лекции на русском языке (до 500 символов)...",
-            label="📝 Текст лекции"
         )
     ],
     outputs=[
-        gr.Video(label="🎬 Готовое видео"),
-        gr.Textbox(label="ℹ️ Статус", interactive=False)
     ],
     title=title,
     description=description,
-    cache_examples=False
 )
 if __name__ == "__main__":
-    iface.launch()

 import tempfile
 from gradio_client import Client, handle_file
 import torch
+from transformers import VitsModel, AutoTokenizer, pipeline
 import scipy.io.wavfile as wavfile
 import traceback
+# =========================
+# Загрузка моделей
+# =========================
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 try:
+    # TTS модель казахского языка
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
     tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
+    # Модель перевода ru -> kk
+    translator = pipeline(
+        "translation",
+        model="facebook/nllb-200-distilled-600M",
+        device=0 if device == "cuda" else -1
+    )
+    print("✅ Все модели успешно загружены!")
 except Exception as e:
+    raise RuntimeError(f"Ошибка загрузки моделей: {str(e)}")
+# =========================
+# Talking Head Space
+# =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
+# =========================
+# Основная функция
+# =========================
 def inference(image: Image.Image, text: str):
     error_msg = ""
     video_path = None
     audio_path = None
     img_path = None
     try:
+        # Проверки
         if image is None:
             raise ValueError("Загрузите изображение лектора!")
         if not text or not text.strip():
             raise ValueError("Введите текст лекции!")
         if len(text) > 500:
+            raise ValueError("Текст превышает 500 символов!")
+        print("Ввод (RU):", text)
+        # =========================
+        # Шаг 1 — Перевод
+        # =========================
+        translation = translator(
+            text,
+            src_lang="rus_Cyrl",
+            tgt_lang="kaz_Cyrl"
+        )
+        translated_text = translation[0]["translation_text"]
+        print("Перевод (KK):", translated_text)
+        # =========================
+        # Шаг 2 — Озвучка
+        # =========================
+        inputs = tts_tokenizer(translated_text, return_tensors="pt").to(device)
         with torch.no_grad():
             output = tts_model(**inputs)
+        waveform = output.waveform.squeeze().cpu().numpy()
         audio = (waveform * 32767).astype("int16")
         sampling_rate = tts_model.config.sampling_rate
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            wavfile.write(f.name, sampling_rate, audio)
+            audio_path = f.name
+        # =========================
+        # Шаг 3 — Сохранение изображения
+        # =========================
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+            image.save(f.name)
+            img_path = f.name
+        # =========================
+        # Шаг 4 — Генерация видео
+        # =========================
         client = Client(TALKING_HEAD_SPACE)
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
             steps=10,
             api_name="/process_image_audio"
         )
+        if isinstance(result, tuple):
+            video_path = result[0]
         else:
+            raise ValueError("Видео не получено!")
+        error_msg = "✅ Видео успешно создано!"
     except Exception as e:
         error_msg = f"❌ Ошибка: {str(e)}"
         traceback.print_exc()
     finally:
+        for p in [audio_path, img_path]:
+            if p and os.path.exists(p):
+                try:
+                    os.remove(p)
+                except:
+                    pass
     return video_path, error_msg
+# =========================
+# Gradio Интерфейс
+# =========================
+title = "Бейне Оқытушы"
+description = """
+Суретіңізді жүктеп, дәріс мәтінін орыс тілінде енгізіңіз.
+Жүйе автоматты түрде қазақ тіліне аударады және бейне жасайды!
+**Талаптар:**
+- Фото: бет анық көрінетін
+- Мәтін: орыс тілінде (500 таңбаға дейін)
+"""
 iface = gr.Interface(
     fn=inference,
     inputs=[
+        gr.Image(type="pil", label="📸 Фото дәріскер"),
         gr.Textbox(
             lines=5,
+            label="📝 Дәріс мәтіні (орыс тілінде)",
+            placeholder="500 таңбаға дейін..."
         )
     ],
     outputs=[
+        gr.Video(label="🎬 Дайын бейне"),
+        gr.Textbox(label="ℹ️ Мәртебе")
     ],
     title=title,
     description=description,
+    cache_examples=False,
+    flagging_mode="never"
 )
 if __name__ == "__main__":
+    iface.launch()