EurekaPotato committed on
Commit
9d8ae5e
·
verified ·
1 Parent(s): 3469c65

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +14 -2
  2. README.md +27 -10
  3. handler.py +167 -63
  4. requirements.txt +17 -6
Dockerfile CHANGED
@@ -2,12 +2,24 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
 
 
 
 
 
 
 
5
  COPY requirements.txt .
6
- RUN pip install --no-cache-dir torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu
 
 
 
 
 
7
  RUN pip install -r requirements.txt
8
 
9
  COPY . .
10
 
11
  EXPOSE 7860
12
 
13
- CMD ["python", "handler.py"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # System dependencies for audio processing + git for torch.hub
6
+ RUN apt-get update && apt-get install -y \
7
+ libsndfile1 \
8
+ ffmpeg \
9
+ git \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
  COPY requirements.txt .
13
+
14
+ # Install CPU-only torch first (prevents CUDA downloads)
15
+ RUN pip install --no-cache-dir torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu \
16
+ --extra-index-url https://download.pytorch.org/whl/cpu
17
+
18
+ # Install other dependencies
19
  RUN pip install -r requirements.txt
20
 
21
  COPY . .
22
 
23
  EXPOSE 7860
24
 
25
+ CMD ["uvicorn", "handler:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,26 +1,43 @@
1
  ---
2
- title: Busy Module Text Features
3
- emoji: 💬
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: docker
7
  app_port: 7860
8
  pinned: false
9
  ---
10
 
11
- # Text Feature Extraction API
 
 
12
 
13
- Extracts 9 text features from conversation transcripts: explicit intent, response patterns, cognitive load, time pressure, deflection, sentiment (RoBERTa), coherence (Sentence Transformer), and latency.
14
 
15
  ## API
16
 
17
- **POST** `/extract-text-features`
18
  ```json
19
  {
20
- "transcript": "I'm driving right now, can't talk",
21
- "utterances": ["I'm driving right now", "can't talk"],
22
- "question": "How are you doing?"
23
  }
24
  ```
25
 
 
 
 
 
 
 
 
 
26
  **GET** `/health`
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Busy Module Audio Features
3
+ emoji: 🎀
4
+ colorFrom: indigo
5
+ colorTo: purple
6
  sdk: docker
7
  app_port: 7860
8
  pinned: false
9
  ---
10
 
11
+ # Busy Module Audio Features
12
+
13
+ ## Audio Feature Extraction API
14
 
15
+ Extracts 17 voice features from audio: SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
16
 
17
  ## API
18
 
19
+ **POST** `/extract-audio-features-base64`
20
  ```json
21
  {
22
+ "audio_base64": "<base64-encoded-wav>",
23
+ "transcript": "I'm driving right now"
 
24
  }
25
  ```
26
 
27
+ **POST** `/extract-audio-features` (multipart form)
28
+ - `audio`: audio file upload
29
+ - `transcript`: text transcript
30
+
31
+ **POST** `/extract-audio-features` (multipart form)
32
+ - `audio`: audio file upload
33
+ - `transcript`: text transcript
34
+
35
  **GET** `/health`
36
+
37
+ ## Authentication
38
+
39
+ This Space requires access to private models. You must add your Hugging Face token as a secret:
40
+ 1. Go to **Settings** -> **Variables and secrets**.
41
+ 2. Click **New secret**.
42
+ 3. Name: `HF_TOKEN`
43
+ 4. Value: Your Hugging Face Access Token (with read permissions).
handler.py CHANGED
@@ -1,53 +1,91 @@
1
  """
2
- Text Feature Extraction — Hugging Face Inference Endpoint Handler
3
 
4
- Extracts all 9 text features from conversation transcript:
5
- t0_explicit_free, t1_explicit_busy, t2_avg_resp_len, t3_short_ratio,
6
- t4_cognitive_load, t5_time_pressure, t6_deflection, t7_sentiment,
7
- t8_coherence, t9_latency
8
 
9
- Derived from: src/text_features.py
10
  """
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # ──────────────────────────────────────────────────────────────────────── #
13
  # Imports from standardized modules
14
  # ──────────────────────────────────────────────────────────────────────── #
15
  try:
16
- from text_features import TextFeatureExtractor
17
  except ImportError:
 
18
  import sys
19
  sys.path.append('.')
20
- from text_features import TextFeatureExtractor
21
 
22
  # Initialize global extractor
23
- print("[INFO] Initializing Global TextFeatureExtractor...")
24
- extractor = TextFeatureExtractor(use_intent_model=True)
 
 
 
 
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # ──────────────────────────────────────────────────────────────────────── #
28
- # FastAPI handler for deployment
29
  # ──────────────────────────────────────────────────────────────────────── #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- from fastapi import FastAPI, Request
32
- from fastapi.middleware.cors import CORSMiddleware
33
- from fastapi.responses import JSONResponse
34
- from pydantic import BaseModel
35
- from typing import Optional, List, Dict
36
- import traceback
37
 
38
  # ──────────────────────────────────────────────────────────────────────── #
39
- # Constants & Defaults
40
  # ──────────────────────────────────────────────────────────────────────── #
41
 
42
- DEFAULT_TEXT_FEATURES = {
43
- "t0_explicit_free": 0.0, "t1_explicit_busy": 0.0,
44
- "t2_avg_resp_len": 0.0, "t3_short_ratio": 0.0,
45
- "t4_cognitive_load": 0.0, "t5_time_pressure": 0.0,
46
- "t6_deflection": 0.0, "t7_sentiment": 0.0,
47
- "t8_coherence": 0.5, "t9_latency": 0.0,
48
- }
49
 
50
- app = FastAPI(title="Text Feature Extraction API", version="1.0.0")
51
  app.add_middleware(
52
  CORSMiddleware,
53
  allow_origins=["*"], allow_credentials=True,
@@ -57,65 +95,130 @@ app.add_middleware(
57
 
58
  @app.exception_handler(Exception)
59
  async def global_exception_handler(request: Request, exc: Exception):
 
60
  print(f"[GLOBAL ERROR] {request.url}: {exc}")
61
  traceback.print_exc()
62
  return JSONResponse(
63
  status_code=200,
64
- content={**DEFAULT_TEXT_FEATURES, "_error": str(exc), "_handler": "global"},
65
  )
66
 
67
- class TextRequest(BaseModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  transcript: str = ""
69
- # Optional list of extra utterances if available
70
- utterances: List[str] = []
71
- question: str = ""
72
- events: Optional[List[Dict]] = None
73
 
74
 
75
  @app.get("/")
76
  async def root():
77
  return {
78
- "service": "Text Feature Extraction API",
79
  "version": "1.0.0",
80
- "endpoints": ["/health", "/extract-text-features"],
81
  }
82
 
83
 
84
  @app.get("/health")
85
  async def health():
 
 
86
  return {
87
- "status": "healthy",
88
- "intent_model_loaded": extractor.use_intent_model,
89
- "sentiment_loaded": extractor.sentiment_model is not None,
90
  }
91
 
92
 
93
- @app.post("/extract-text-features")
94
- async def extract_text_features(data: TextRequest):
95
- """Extract all 9 text features from transcript."""
96
- # Prepare inputs for TextFeatureExtractor.extract_all
97
- # It expects: transcript_list, full_transcript, question, events
98
-
99
- transcript_list = data.utterances
100
- if not transcript_list and data.transcript:
101
- transcript_list = [data.transcript]
102
-
103
- features = extractor.extract_all(
104
- transcript_list=transcript_list,
105
- full_transcript=data.transcript,
106
- question=data.question,
107
- events=data.events,
108
- )
109
-
110
- # Sanitize inputs to ensure floats
111
- sanitized = {}
112
- for k, v in features.items():
113
- if isinstance(v, float):
114
- sanitized[k] = 0.0 if np.isnan(v) or np.isinf(v) else v
115
- else:
116
- sanitized[k] = v
117
-
118
- return sanitized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
 
121
  if __name__ == "__main__":
@@ -123,3 +226,4 @@ if __name__ == "__main__":
123
  import os
124
  port = int(os.environ.get("PORT", 7860))
125
  uvicorn.run(app, host="0.0.0.0", port=port)
 
 
1
  """
2
+ Audio Feature Extraction — Hugging Face Inference Endpoint Handler
3
 
4
+ Extracts all 17 voice features from uploaded audio:
5
+ v1_snr, v2_noise_* (5), v3_speech_rate, v4/v5_pitch, v6/v7_energy,
6
+ v8/v9/v10_pause, v11/v12/v13_emotion
 
7
 
8
+ Derived from: src/audio_features.py, src/emotion_features.py
9
  """
10
 
11
+ import io
12
+ import numpy as np
13
+ import librosa
14
+ from scipy import signal as scipy_signal
15
+ from typing import Dict
16
+ import torch
17
+ import torch.nn as nn
18
+ from torchvision import models
19
+ import warnings
20
+
21
+ warnings.filterwarnings("ignore")
22
+
23
+
24
  # ──────────────────────────────────────────────────────────────────────── #
25
  # Imports from standardized modules
26
  # ──────────────────────────────────────────────────────────────────────── #
27
  try:
28
+ from audio_features import AudioFeatureExtractor
29
  except ImportError:
30
+ # Fallback if running from a different context
31
  import sys
32
  sys.path.append('.')
33
+ from audio_features import AudioFeatureExtractor
34
 
35
  # Initialize global extractor
36
+ # We use a global instance to cache models (VAD, Emotion)
37
+ print("[INFO] Initializing Global AudioFeatureExtractor...")
38
+ extractor = AudioFeatureExtractor(
39
+ sample_rate=16000,
40
+ use_emotion=True,
41
+ emotion_models_dir="/app/models" # Absolute path in Docker container
42
+ )
43
 
44
+ # Ensure models are downloaded/ready
45
+ if extractor.use_emotion and extractor.emotion_extractor:
46
+ print("[INFO] Checking for emotion models...")
47
+ # Trigger download if needed/possible
48
+ try:
49
+ if len(extractor.emotion_extractor.models) == 0:
50
+ print("[INFO] Models not found, attempting download...")
51
+ extractor.emotion_extractor.download_models()
52
+ # Re-init manually to load them
53
+ extractor.emotion_extractor.__init__(models_dir=extractor.emotion_extractor.models_dir)
54
+ except Exception as e:
55
+ print(f"[WARN] Failed to download emotion models: {e}")
56
 
57
  # ──────────────────────────────────────────────────────────────────────── #
58
+ # Helper to handle NaN/Inf for JSON
59
  # ──────────────────────────────────────────────────────────────────────── #
60
+ def sanitize_features(features: Dict[str, float]) -> Dict[str, float]:
61
+ sanitized = {}
62
+ for key, val in features.items():
63
+ if isinstance(val, (float, np.floating)):
64
+ if np.isnan(val) or np.isinf(val):
65
+ sanitized[key] = 0.0
66
+ else:
67
+ sanitized[key] = float(val)
68
+ elif isinstance(val, (int, np.integer)):
69
+ sanitized[key] = int(val)
70
+ else:
71
+ sanitized[key] = val # keep string/other as is
72
+ return sanitized
73
+
74
 
 
 
 
 
 
 
75
 
76
  # ──────────────────────────────────────────────────────────────────────── #
77
+ # FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
78
  # ──────────────────────────────────────────────────────────────────────── #
79
 
80
+ from fastapi import FastAPI, File, UploadFile, Form, Request
81
+ from fastapi.middleware.cors import CORSMiddleware
82
+ from fastapi.responses import JSONResponse
83
+ from pydantic import BaseModel
84
+ from typing import Optional
85
+ import base64
86
+ import traceback
87
 
88
+ app = FastAPI(title="Audio Feature Extraction API", version="1.0.0")
89
  app.add_middleware(
90
  CORSMiddleware,
91
  allow_origins=["*"], allow_credentials=True,
 
95
 
96
  @app.exception_handler(Exception)
97
  async def global_exception_handler(request: Request, exc: Exception):
98
+ """Catch any unhandled exceptions and return defaults instead of 500."""
99
  print(f"[GLOBAL ERROR] {request.url}: {exc}")
100
  traceback.print_exc()
101
  return JSONResponse(
102
  status_code=200,
103
+ content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
104
  )
105
 
106
+ # Extractor is already initialized globally above
107
+
108
+ # ──────────────────────────────────────────────────────────────────────── #
109
+ # Constants & Defaults
110
+ # ──────────────────────────────────────────────────────────────────────── #
111
+
112
+ DEFAULT_AUDIO_FEATURES = {
113
+ "v1_snr": 0.0,
114
+ "v2_noise_traffic": 0.0,
115
+ "v2_noise_office": 0.0,
116
+ "v2_noise_crowd": 0.0,
117
+ "v2_noise_wind": 0.0,
118
+ "v2_noise_clean": 1.0,
119
+ "v3_speech_rate": 0.0,
120
+ "v4_pitch_mean": 0.0,
121
+ "v5_pitch_std": 0.0,
122
+ "v6_energy_mean": 0.0,
123
+ "v7_energy_std": 0.0,
124
+ "v8_pause_ratio": 0.0,
125
+ "v9_avg_pause_dur": 0.0,
126
+ "v10_mid_pause_cnt": 0.0,
127
+ "v11_emotion_stress": 0.0,
128
+ "v12_emotion_energy": 0.0,
129
+ "v13_emotion_valence": 0.0,
130
+ }
131
+
132
+ class AudioBase64Request(BaseModel):
133
+ audio_base64: str = ""
134
  transcript: str = ""
 
 
 
 
135
 
136
 
137
  @app.get("/")
138
  async def root():
139
  return {
140
+ "service": "Audio Feature Extraction API",
141
  "version": "1.0.0",
142
+ "endpoints": ["/health", "/extract-audio-features", "/extract-audio-features-base64"],
143
  }
144
 
145
 
146
  @app.get("/health")
147
  async def health():
148
+ vad_status = extractor.vad_model is not None
149
+ emotion_status = extractor.emotion_extractor is not None if extractor.use_emotion else False
150
  return {
151
+ "status": "healthy",
152
+ "vad_loaded": vad_status,
153
+ "emotion_loaded": emotion_status
154
  }
155
 
156
 
157
+ @app.post("/extract-audio-features")
158
+ async def extract_audio_features(audio: UploadFile = File(...), transcript: str = Form("")):
159
+ """Extract all 17 voice features from uploaded audio file."""
160
+ try:
161
+ audio_bytes = await audio.read()
162
+ # librosa.load returns (audio, sr)
163
+ y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
164
+
165
+ # AudioFeatureExtractor.extract_all expects numpy array and optional transcript
166
+ features = extractor.extract_all(y, transcript)
167
+
168
+ return sanitize_features(features)
169
+ except Exception as e:
170
+ print(f"[ERROR] extract_audio_features: {e}")
171
+ traceback.print_exc()
172
+ return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}
173
+
174
+
175
+ @app.post("/extract-audio-features-base64")
176
+ async def extract_audio_features_base64(data: AudioBase64Request):
177
+ """Extract features from base64-encoded audio (for Vercel serverless calls)."""
178
+ import soundfile as sf
179
+
180
+ audio_b64 = data.audio_base64
181
+ transcript = data.transcript
182
+
183
+ # Handle empty / missing audio — return default features
184
+ if not audio_b64 or len(audio_b64) < 100:
185
+ print("[INFO] Empty or too-short audio_base64, returning defaults")
186
+ return {**DEFAULT_AUDIO_FEATURES}
187
+
188
+ try:
189
+ # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
190
+ if "," in audio_b64[:80]:
191
+ audio_b64 = audio_b64.split(",", 1)[1]
192
+
193
+ audio_bytes = base64.b64decode(audio_b64)
194
+ print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")
195
+
196
+ # Try soundfile first, fall back to librosa
197
+ try:
198
+ y, sr = sf.read(io.BytesIO(audio_bytes))
199
+ except Exception as sf_err:
200
+ print(f"[WARN] soundfile failed ({sf_err}), trying librosa...")
201
+ y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
202
+
203
+ if hasattr(y, 'shape') and len(y.shape) > 1:
204
+ y = np.mean(y, axis=1)
205
+ y = np.asarray(y, dtype=np.float32)
206
+ if sr != 16000:
207
+ y = librosa.resample(y, orig_sr=sr, target_sr=16000)
208
+ y = y.astype(np.float32)
209
+
210
+ if len(y) < 100:
211
+ print("[WARN] Audio too short after decode, returning defaults")
212
+ return {**DEFAULT_AUDIO_FEATURES}
213
+
214
+ features = extractor.extract_all(y, transcript)
215
+ print(f"[OK] Extracted {len(features)} audio features")
216
+ return sanitize_features(features)
217
+ except Exception as e:
218
+ print(f"[ERROR] extract_audio_features_base64: {e}")
219
+ traceback.print_exc()
220
+ # Return defaults rather than 500
221
+ return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}
222
 
223
 
224
  if __name__ == "__main__":
 
226
  import os
227
  port = int(os.environ.get("PORT", 7860))
228
  uvicorn.run(app, host="0.0.0.0", port=port)
229
+
requirements.txt CHANGED
@@ -1,11 +1,22 @@
1
- # NLP
2
- transformers==4.35.0
3
- sentence-transformers==2.2.2
4
-
5
  numpy==1.24.3
6
- scikit-learn==1.3.2
 
 
 
 
 
 
 
 
 
7
 
8
  # API
9
  fastapi==0.95.2
10
  uvicorn==0.22.0
11
- pydantic==1.10.13
 
 
 
 
1
+ # Core audio processing
2
+ librosa==0.10.1
3
+ soundfile==0.12.1
 
4
  numpy==1.24.3
5
+ scipy==1.11.2
6
+
7
+ # ML - CPU-only versions (HF Spaces friendly)
8
+ # Torch for Silero VAD
9
+ --extra-index-url https://download.pytorch.org/whl/cpu
10
+ torch==2.1.0+cpu
11
+ torchaudio==2.1.0+cpu
12
+
13
+ # TensorFlow for Emotion Models
14
+ tensorflow-cpu==2.15.0
15
 
16
  # API
17
  fastapi==0.95.2
18
  uvicorn==0.22.0
19
+ python-multipart==0.0.6
20
+ huggingface_hub>=0.19.0
21
+ noisereduce>=3.0.0
22
+ scikit-image>=0.21.0