drrobot9 commited on
Commit
3845214
·
verified ·
1 Parent(s): 7b109a2

Initial commit

Browse files
Files changed (4) hide show
  1. Dockerfile +18 -0
  2. config.json +5 -0
  3. main.py +130 -0
  4. requirements.txt +8 -0
# Base image
FROM python:3.12-slim

# Set workdir
WORKDIR /app

# Install dependencies before copying the rest of the source so the (slow)
# pip layer is cached and only rebuilt when requirements.txt changes.
# python-multipart is required by FastAPI for UploadFile form parsing.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt python-multipart

# Copy code and config
COPY . .

# Expose port
EXPOSE 7860

# Run FastAPI
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eleven_api_key": "REPLACE_WITH_ELEVENLABS_API_KEY",
3
+ "eleven_voice_id": "ZthjuvLPty3kTMaNKVKb",
4
+ "llm_url": "https://remostart-super-test-verstion-three.hf.space/ask"
5
+ }
main.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import librosa
5
+ import requests
6
+ import soundfile as sf
7
+ from fastapi import FastAPI, UploadFile, File
8
+ from fastapi.responses import FileResponse
9
+ from transformers import (
10
+ Wav2Vec2Processor, Wav2Vec2ForCTC,
11
+ AutoFeatureExtractor, AutoModelForAudioClassification
12
+ )
13
+ from starlette.middleware.cors import CORSMiddleware
14
+
# Select GPU when available; all models below are moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Load config
# Runtime configuration is read once at import time from config.json in the
# working directory; a missing file aborts startup.
with open("config.json") as f:
    config = json.load(f)

# ElevenLabs credentials and the downstream LLM endpoint. All three keys are
# required — a KeyError here fails fast at import time rather than mid-request.
ELEVEN_API_KEY = config["eleven_api_key"]
VOICE_ID = config["eleven_voice_id"]
LLM_URL = config["llm_url"]
25
+
# STT Model
# Load the multilingual MMS CTC speech-to-text model once at import time.
# eval() disables dropout/batch-norm updates for inference.
print("Loading STT model...")
stt_processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all").to(DEVICE)
stt_model.eval()
print("STT loaded ")
34
+
35
+
36
def transcribe(audio_path):
    """Run speech-to-text on the audio file at *audio_path*.

    The audio is resampled to 16 kHz, passed through the MMS CTC model,
    and greedy-decoded. Returns the stripped transcript string.
    """
    waveform, _ = librosa.load(audio_path, sr=16000)
    features = stt_processor(
        waveform, sampling_rate=16000, return_tensors="pt", padding=True
    )
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = stt_model(features.input_values.to(DEVICE)).logits
    # Greedy CTC decoding: most likely token at every frame.
    token_ids = logits.argmax(dim=-1)
    text = stt_processor.batch_decode(token_ids)[0]
    return text.strip()
43
+
44
+
# Emotion Model
# Speech-emotion classifier (SUPERB emotion-recognition task head on HuBERT),
# loaded once at import time alongside the STT model.
print("Loading Emotion model...")
emotion_extractor = AutoFeatureExtractor.from_pretrained("superb/hubert-base-superb-er")
emotion_model = AutoModelForAudioClassification.from_pretrained(
    "superb/hubert-base-superb-er"
).to(DEVICE)
emotion_model.eval()
print("Emotion model loaded ")
54
+
55
+
56
def get_emotion(audio_path):
    """Classify the emotion of the audio file at *audio_path*.

    Returns the human-readable label string for the highest-scoring class
    of the HuBERT emotion-recognition model.
    """
    waveform, _ = librosa.load(audio_path, sr=16000)
    inputs = emotion_extractor(waveform, sampling_rate=16000, return_tensors="pt")
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = emotion_model(inputs["input_values"].to(DEVICE)).logits
    label_idx = torch.argmax(logits, dim=-1).item()
    return emotion_model.config.id2label[label_idx]
63
+
64
+
65
+
66
+ # LLM Call
67
+
68
def ask_llm(text):
    """Send *text* to the configured LLM endpoint and return its answer.

    POSTs ``{"query": text}`` to ``LLM_URL``. Returns the ``answer`` field of
    the JSON response when present; otherwise the stringified JSON body, or
    the raw response text if the body is not JSON at all.
    """
    response = requests.post(LLM_URL, json={"query": text}, timeout=200)
    try:
        body = response.json()
    except ValueError:
        # Endpoint returned non-JSON (e.g. an HTML error page). The original
        # bare `except` re-raised here on the fallback call; return the raw
        # text instead so the caller still gets something usable.
        return response.text
    try:
        return body["answer"]
    except (KeyError, TypeError):
        # Unexpected JSON shape — fall back to its string form, as before.
        return str(body)
75
+
76
+
77
+
78
+ # TTS
79
+
80
def tts_eleven(text, out_file="response.mp3"):
    """Synthesize *text* with the ElevenLabs API and write it as MP3.

    Parameters:
        text: the text to speak.
        out_file: output path for the MP3 (default ``response.mp3``).
    Returns the output file path.
    Raises RuntimeError (a subclass of Exception, so existing handlers still
    match) when the API responds with a non-200 status.
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "xi-api-key": ELEVEN_API_KEY,
        "Content-Type": "application/json"
    }
    payload = {"text": text, "model_id": "eleven_multilingual_v2"}

    # timeout prevents a stalled connection from hanging the request handler
    # forever (the original call had no timeout at all).
    resp = requests.post(url, json=payload, headers=headers, timeout=120)
    if resp.status_code != 200:
        raise RuntimeError(f"ElevenLabs TTS Error: {resp.text}")

    with open(out_file, "wb") as f:
        f.write(resp.content)
    return out_file
95
+
96
+
97
+
# FastAPI App

app = FastAPI(title="Voice AI API")

# Enable CORS for Hugging Face Spaces frontend
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers per the CORS spec — confirm whether credentials are
# actually needed, or pin the frontend origin explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
110
+
111
+
112
@app.post("/process-audio/")
async def process_audio(file: UploadFile = File(...)):
    """Full voice pipeline: upload -> STT -> emotion -> LLM -> TTS.

    Saves the uploaded audio to a temp file, transcribes it, classifies its
    emotion, asks the LLM for a reply, synthesizes the reply with ElevenLabs,
    and returns the resulting MP3.
    """
    # basename() strips any directory components from the client-supplied
    # filename — the original f"temp_{file.filename}" allowed path traversal
    # (e.g. a filename containing "../").
    safe_name = os.path.basename(file.filename or "upload")
    audio_path = f"temp_{safe_name}"
    with open(audio_path, "wb") as f:
        f.write(await file.read())

    try:
        transcript = transcribe(audio_path)
        # NOTE(review): `emotion` is computed but not used in the response —
        # confirm whether it should be returned to the client.
        emotion = get_emotion(audio_path)
        llm_out = ask_llm(transcript)
        tts_file = tts_eleven(llm_out)
    finally:
        # Remove the uploaded temp file so repeated requests don't fill the
        # disk (the original never deleted it).
        try:
            os.remove(audio_path)
        except OSError:
            pass

    # Return TTS file as downloadable mp3
    return FileResponse(tts_file, media_type="audio/mpeg", filename="response.mp3")
126
+
127
+
128
@app.get("/")
async def root():
    """Health-check endpoint that points callers at /process-audio/."""
    message = "Voice AI API is running. Use /process-audio/ endpoint to upload audio."
    return {"message": message}
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers
4
+ librosa
5
+ soundfile
6
+ requests
7
+ fastapi
8
+ uvicorn[standard]
+ python-multipart