Spaces:

divAIne
/

busy-module-audio

Sleeping

App Files Files Community

EurekaPotato committed on Feb 19

Commit

9d8ae5e

verified ·

1 Parent(s): 3469c65

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

Dockerfile +14 -2
README.md +27 -10
handler.py +167 -63
requirements.txt +17 -6

Dockerfile CHANGED Viewed

@@ -2,12 +2,24 @@ FROM python:3.10-slim
 WORKDIR /app
 COPY requirements.txt .
-RUN pip install --no-cache-dir torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu
 RUN pip install -r requirements.txt
 COPY . .
 EXPOSE 7860
-CMD ["python", "handler.py"]

 WORKDIR /app
+# System dependencies for audio processing + git for torch.hub
+RUN apt-get update && apt-get install -y \
+    libsndfile1 \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
+# Install CPU-only torch first (prevents CUDA downloads)
+RUN pip install --no-cache-dir torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu \
+    --extra-index-url https://download.pytorch.org/whl/cpu
+# Install other dependencies
 RUN pip install -r requirements.txt
 COPY . .
 EXPOSE 7860
+CMD ["uvicorn", "handler:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,26 +1,43 @@
 ---
-title: Busy Module Text Features
-emoji: 💬
-colorFrom: blue
-colorTo: green
 sdk: docker
 app_port: 7860
 pinned: false
 ---
-# Text Feature Extraction API
-Extracts 9 text features from conversation transcripts: explicit intent, response patterns, cognitive load, time pressure, deflection, sentiment (RoBERTa), coherence (Sentence Transformer), and latency.
 ## API
-**POST** `/extract-text-features`
 ```json
 {
-  "transcript": "I'm driving right now, can't talk",
-  "utterances": ["I'm driving right now", "can't talk"],
-  "question": "How are you doing?"
 }
 ```
 **GET** `/health`

 ---
+title: Busy Module Audio Features
+emoji: 🎤
+colorFrom: indigo
+colorTo: purple
 sdk: docker
 app_port: 7860
 pinned: false
 ---
+# Busy Module Audio Features
+## Audio Feature Extraction API
+Extracts 17 voice features from audio: SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
 ## API
+**POST** `/extract-audio-features-base64`
 ```json
 {
+  "audio_base64": "<base64-encoded-wav>",
+  "transcript": "I'm driving right now"
 }
 ```
+**POST** `/extract-audio-features` (multipart form)
+- `audio`: audio file upload
+- `transcript`: text transcript (optional; defaults to an empty string)
 **GET** `/health`
+## Authentication
+This Space requires access to private models. You must add your Hugging Face token as a secret:
+1. Go to **Settings** -> **Variables and secrets**.
+2. Click **New secret**.
+3. Name: `HF_TOKEN`
+4. Value: Your Hugging Face Access Token (with read permissions).

handler.py CHANGED Viewed

@@ -1,53 +1,91 @@
 """
-Text Feature Extraction — Hugging Face Inference Endpoint Handler
-Extracts all 9 text features from conversation transcript:
-  t0_explicit_free, t1_explicit_busy, t2_avg_resp_len, t3_short_ratio,
-  t4_cognitive_load, t5_time_pressure, t6_deflection, t7_sentiment,
-  t8_coherence, t9_latency
-Derived from: src/text_features.py
 """
 # ──────────────────────────────────────────────────────────────────────── #
 # Imports from standardized modules
 # ──────────────────────────────────────────────────────────────────────── #
 try:
-    from text_features import TextFeatureExtractor
 except ImportError:
     import sys
     sys.path.append('.')
-    from text_features import TextFeatureExtractor
 # Initialize global extractor
-print("[INFO] Initializing Global TextFeatureExtractor...")
-extractor = TextFeatureExtractor(use_intent_model=True)
 # ──────────────────────────────────────────────────────────────────────── #
-# FastAPI handler for deployment
 # ──────────────────────────────────────────────────────────────────────── #
-from fastapi import FastAPI, Request
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-from typing import Optional, List, Dict
-import traceback
 # ──────────────────────────────────────────────────────────────────────── #
-# Constants & Defaults
 # ──────────────────────────────────────────────────────────────────────── #
-DEFAULT_TEXT_FEATURES = {
-    "t0_explicit_free": 0.0, "t1_explicit_busy": 0.0,
-    "t2_avg_resp_len": 0.0, "t3_short_ratio": 0.0,
-    "t4_cognitive_load": 0.0, "t5_time_pressure": 0.0,
-    "t6_deflection": 0.0, "t7_sentiment": 0.0,
-    "t8_coherence": 0.5, "t9_latency": 0.0,
-}
-app = FastAPI(title="Text Feature Extraction API", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"], allow_credentials=True,
@@ -57,65 +95,130 @@ app.add_middleware(
 @app.exception_handler(Exception)
 async def global_exception_handler(request: Request, exc: Exception):
     print(f"[GLOBAL ERROR] {request.url}: {exc}")
     traceback.print_exc()
     return JSONResponse(
         status_code=200,
-        content={**DEFAULT_TEXT_FEATURES, "_error": str(exc), "_handler": "global"},
     )
-class TextRequest(BaseModel):
     transcript: str = ""
-    # Optional list of extra utterances if available
-    utterances: List[str] = []
-    question: str = ""
-    events: Optional[List[Dict]] = None
 @app.get("/")
 async def root():
     return {
-        "service": "Text Feature Extraction API",
         "version": "1.0.0",
-        "endpoints": ["/health", "/extract-text-features"],
     }
 @app.get("/health")
 async def health():
     return {
-        "status": "healthy",
-        "intent_model_loaded": extractor.use_intent_model,
-        "sentiment_loaded": extractor.sentiment_model is not None,
     }
-@app.post("/extract-text-features")
-async def extract_text_features(data: TextRequest):
-    """Extract all 9 text features from transcript."""
-    # Prepare inputs for TextFeatureExtractor.extract_all
-    # It expects: transcript_list, full_transcript, question, events
-    transcript_list = data.utterances
-    if not transcript_list and data.transcript:
-        transcript_list = [data.transcript]
-    features = extractor.extract_all(
-        transcript_list=transcript_list,
-        full_transcript=data.transcript,
-        question=data.question,
-        events=data.events,
-    )
-    # Sanitize inputs to ensure floats
-    sanitized = {}
-    for k, v in features.items():
-        if isinstance(v, float):
-             sanitized[k] = 0.0 if np.isnan(v) or np.isinf(v) else v
-        else:
-             sanitized[k] = v
-    return sanitized
 if __name__ == "__main__":
@@ -123,3 +226,4 @@ if __name__ == "__main__":
     import os
     port = int(os.environ.get("PORT", 7860))
     uvicorn.run(app, host="0.0.0.0", port=port)

 """
+Audio Feature Extraction — Hugging Face Inference Endpoint Handler
+Extracts all 17 voice features from uploaded audio:
+  v1_snr, v2_noise_* (5), v3_speech_rate, v4/v5_pitch, v6/v7_energy,
+  v8/v9/v10_pause, v11/v12/v13_emotion
+Derived from: src/audio_features.py, src/emotion_features.py
 """
+import io
+import numpy as np
+import librosa
+from scipy import signal as scipy_signal
+from typing import Dict
+import torch
+import torch.nn as nn
+from torchvision import models
+import warnings
+warnings.filterwarnings("ignore")
 # ──────────────────────────────────────────────────────────────────────── #
 # Imports from standardized modules
 # ──────────────────────────────────────────────────────────────────────── #
 try:
+    from audio_features import AudioFeatureExtractor
 except ImportError:
+    # Fallback if running from a different context
     import sys
     sys.path.append('.')
+    from audio_features import AudioFeatureExtractor
 # Initialize global extractor
+# We use a global instance to cache models (VAD, Emotion)
+print("[INFO] Initializing Global AudioFeatureExtractor...")
+extractor = AudioFeatureExtractor(
+    sample_rate=16000,
+    use_emotion=True,
+    emotion_models_dir="/app/models" # Absolute path in Docker container
+)
+# Ensure models are downloaded/ready
+if extractor.use_emotion and extractor.emotion_extractor:
+    print("[INFO] Checking for emotion models...")
+    # Trigger download if needed/possible
+    try:
+        if len(extractor.emotion_extractor.models) == 0:
+             print("[INFO] Models not found, attempting download...")
+             extractor.emotion_extractor.download_models()
+             # Re-init manually to load them
+             extractor.emotion_extractor.__init__(models_dir=extractor.emotion_extractor.models_dir)
+    except Exception as e:
+        print(f"[WARN] Failed to download emotion models: {e}")
 # ──────────────────────────────────────────────────────────────────────── #
+# Helper to handle NaN/Inf for JSON
 # ──────────────────────────────────────────────────────────────────────── #
+def sanitize_features(features: Dict[str, float]) -> Dict[str, float]:
+    sanitized = {}
+    for key, val in features.items():
+        if isinstance(val, (float, np.floating)):
+            if np.isnan(val) or np.isinf(val):
+                sanitized[key] = 0.0
+            else:
+                sanitized[key] = float(val)
+        elif isinstance(val, (int, np.integer)):
+            sanitized[key] = int(val)
+        else:
+            sanitized[key] = val # keep string/other as is
+    return sanitized
 # ──────────────────────────────────────────────────────────────────────── #
+# FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
 # ──────────────────────────────────────────────────────────────────────── #
+from fastapi import FastAPI, File, UploadFile, Form, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from typing import Optional
+import base64
+import traceback
+app = FastAPI(title="Audio Feature Extraction API", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"], allow_credentials=True,
 @app.exception_handler(Exception)
 async def global_exception_handler(request: Request, exc: Exception):
+    """Catch any unhandled exceptions and return defaults instead of 500."""
     print(f"[GLOBAL ERROR] {request.url}: {exc}")
     traceback.print_exc()
     return JSONResponse(
         status_code=200,
+        content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
     )
+# Extractor is already initialized globally above
+# ──────────────────────────────────────────────────────────────────────── #
+# Constants & Defaults
+# ──────────────────────────────────────────────────────────────────────── #
+DEFAULT_AUDIO_FEATURES = {
+    "v1_snr": 0.0,
+    "v2_noise_traffic": 0.0,
+    "v2_noise_office": 0.0,
+    "v2_noise_crowd": 0.0,
+    "v2_noise_wind": 0.0,
+    "v2_noise_clean": 1.0,
+    "v3_speech_rate": 0.0,
+    "v4_pitch_mean": 0.0,
+    "v5_pitch_std": 0.0,
+    "v6_energy_mean": 0.0,
+    "v7_energy_std": 0.0,
+    "v8_pause_ratio": 0.0,
+    "v9_avg_pause_dur": 0.0,
+    "v10_mid_pause_cnt": 0.0,
+    "v11_emotion_stress": 0.0,
+    "v12_emotion_energy": 0.0,
+    "v13_emotion_valence": 0.0,
+}
+class AudioBase64Request(BaseModel):
+    audio_base64: str = ""
     transcript: str = ""
 @app.get("/")
 async def root():
     return {
+        "service": "Audio Feature Extraction API",
         "version": "1.0.0",
+        "endpoints": ["/health", "/extract-audio-features", "/extract-audio-features-base64"],
     }
 @app.get("/health")
 async def health():
+    vad_status = extractor.vad_model is not None
+    emotion_status = extractor.emotion_extractor is not None if extractor.use_emotion else False
     return {
+        "status": "healthy",
+        "vad_loaded": vad_status,
+        "emotion_loaded": emotion_status
     }
+@app.post("/extract-audio-features")
+async def extract_audio_features(audio: UploadFile = File(...), transcript: str = Form("")):
+    """Extract all 17 voice features from uploaded audio file."""
+    try:
+        audio_bytes = await audio.read()
+        # librosa.load returns (audio, sr)
+        y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
+        # AudioFeatureExtractor.extract_all expects numpy array and optional transcript
+        features = extractor.extract_all(y, transcript)
+        return sanitize_features(features)
+    except Exception as e:
+        print(f"[ERROR] extract_audio_features: {e}")
+        traceback.print_exc()
+        return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}
+@app.post("/extract-audio-features-base64")
+async def extract_audio_features_base64(data: AudioBase64Request):
+    """Extract features from base64-encoded audio (for Vercel serverless calls)."""
+    import soundfile as sf
+    audio_b64 = data.audio_base64
+    transcript = data.transcript
+    # Handle empty / missing audio — return default features
+    if not audio_b64 or len(audio_b64) < 100:
+        print("[INFO] Empty or too-short audio_base64, returning defaults")
+        return {**DEFAULT_AUDIO_FEATURES}
+    try:
+        # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
+        if "," in audio_b64[:80]:
+            audio_b64 = audio_b64.split(",", 1)[1]
+        audio_bytes = base64.b64decode(audio_b64)
+        print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")
+        # Try soundfile first, fall back to librosa
+        try:
+            y, sr = sf.read(io.BytesIO(audio_bytes))
+        except Exception as sf_err:
+            print(f"[WARN] soundfile failed ({sf_err}), trying librosa...")
+            y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
+        if hasattr(y, 'shape') and len(y.shape) > 1:
+            y = np.mean(y, axis=1)
+        y = np.asarray(y, dtype=np.float32)
+        if sr != 16000:
+            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+        y = y.astype(np.float32)
+        if len(y) < 100:
+            print("[WARN] Audio too short after decode, returning defaults")
+            return {**DEFAULT_AUDIO_FEATURES}
+        features = extractor.extract_all(y, transcript)
+        print(f"[OK] Extracted {len(features)} audio features")
+        return sanitize_features(features)
+    except Exception as e:
+        print(f"[ERROR] extract_audio_features_base64: {e}")
+        traceback.print_exc()
+        # Return defaults rather than 500
+        return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}
 if __name__ == "__main__":
     import os
     port = int(os.environ.get("PORT", 7860))
     uvicorn.run(app, host="0.0.0.0", port=port)

requirements.txt CHANGED Viewed

@@ -1,11 +1,22 @@
-# NLP
-transformers==4.35.0
-sentence-transformers==2.2.2
 numpy==1.24.3
-scikit-learn==1.3.2
 # API
 fastapi==0.95.2
 uvicorn==0.22.0
-pydantic==1.10.13

+# Core audio processing
+librosa==0.10.1
+soundfile==0.12.1
 numpy==1.24.3
+scipy==1.11.2
+# ML - CPU-only versions (HF Spaces friendly)
+# Torch for Silero VAD
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.1.0+cpu
+torchaudio==2.1.0+cpu
+# TensorFlow for Emotion Models
+tensorflow-cpu==2.15.0
 # API
 fastapi==0.95.2
 uvicorn==0.22.0
+python-multipart==0.0.6
+huggingface_hub>=0.19.0
+noisereduce>=3.0.0
+scikit-image>=0.21.0