File size: 6,160 Bytes
50e9833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc9fc84
50e9833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a3e94e
50e9833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e017c42
45da97f
e017c42
 
50e9833
 
 
 
cc9fc84
50e9833
 
 
 
cc9fc84
50e9833
 
cc9fc84
50e9833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import requests
import json
import logging
import scipy
from scipy.io import wavfile
from pydub import AudioSegment
import io
from io import BytesIO

# Load the audio classification model once, at import time, through
# TensorFlow Hub using the 'Audio_Multiple_v1' handle. The resulting object is
# shared as a module-level global by the functions below.
# NOTE(review): the way it is used in this file (it exposes class_map_path()
# and returns scores/embeddings/spectrogram) suggests a YAMNet-style model —
# confirm against the model's own documentation.
model = hub.load('Audio_Multiple_v1')

def class_names_from_csv(class_map_csv_text):
    """Return the class display names, in row order, from the class-map CSV.

    Despite its name, ``class_map_csv_text`` is handed to
    ``tf.io.gfile.GFile`` and is therefore opened as a file location rather
    than parsed as CSV text. Every row must provide a ``display_name``
    column; a missing column raises ``KeyError``.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as handle:
        return [record['display_name'] for record in csv.DictReader(handle)]

# Ask the loaded model where its class-map CSV lives; .numpy() converts the
# returned tensor into a plain Python value that can be opened as a file.
class_map_path = model.class_map_path().numpy()
# Display names indexed by position; process_audio_file indexes this list with
# the column index of the model's score matrix.
class_names = class_names_from_csv(class_map_path)

def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):
    """Resample ``waveform`` to ``desired_sample_rate`` when the rates differ.

    Args:
        original_sample_rate: Sample rate (Hz) of ``waveform``.
        waveform: Sequence/array of samples; its length along the first axis
            determines the output length.
        desired_sample_rate: Target sample rate in Hz (default 16000).

    Returns:
        A ``(desired_sample_rate, waveform)`` tuple. When the rates already
        match, ``waveform`` is returned unchanged (same object, same dtype);
        otherwise it is a new ``float32`` NumPy array produced by
        ``scipy.signal.resample``.
    """
    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` is loaded, so ``scipy.signal.resample``
    # could fail with AttributeError depending on the SciPy version and on
    # what other modules happened to import beforehand.
    from scipy.signal import resample

    # Keep the clip duration constant: new_length = old_length * new_rate / old_rate.
    desired_length = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate))
    waveform = np.array(resample(waveform, desired_length), dtype=np.float32)
    return desired_sample_rate, waveform

def convert_mp3_to_wav(mp3_data):
    """Decode in-memory MP3 bytes and return the same audio as WAV bytes.

    The conversion is done entirely in memory: ``mp3_data`` is wrapped in a
    ``BytesIO`` for pydub to read, and the WAV export is written to a second
    ``BytesIO`` whose full contents are returned.
    """
    source = io.BytesIO(mp3_data)
    segment = AudioSegment.from_file(source, format="mp3")

    target = io.BytesIO()
    segment.export(target, format='wav')
    # getvalue() returns the whole buffer regardless of the stream position.
    return target.getvalue()

# Labels that are never reported, even when they score above the threshold.
_EXCLUDED_CLASSES = frozenset([
    'Mechanisms', 'Domestic animals, pets', 'Animal', 'Silence', 'Alarm',
    'Wind chime', 'Water', 'Livestock, farm animals, working animals',
    'Wild animals', 'Bleat', 'Siren', 'Computer keyboard', 'Toot', 'Shatter',
    'Bird', 'Caw', 'Independent music', 'Tender music', 'Ocean',
    'House music', 'Middle Eastern music', 'Swing music', 'Soul music',
    'Shofar', 'Motor vehicle (road)', 'White noise', 'Pink noise',
    'Cacophony', 'Sidetone', 'Static', 'Outside, rural or natural',
    'Outside, urban or manmade', 'Inside, public space',
    'Inside, large room or hall', 'Inside, small room', 'Sound effect',
])

# Verbose model labels mapped to the shorter names reported to callers.
_CLASS_NAME_ALIASES = {
    'Child speech, kid speaking': 'Child speech',
    'Vehicle horn, car horn, honking': 'Vehicle horn',
    'Railroad car, train wagon': 'Train/wagon',
    'Rail transport': 'Train/wagon',
}


def process_audio_file(file_data, url, file_id):
    """Classify one WAV file and return the confidently detected classes.

    Args:
        file_data: Raw bytes of a WAV file.
        url: Source URL of the file; echoed in the result and in error logs.
        file_id: Caller-supplied identifier; echoed as ``normalfileID``.

    Returns:
        A dict ``{'url', 'answer', 'qcUser', 'normalfileID'}`` where
        ``answer`` is the alphabetically sorted list of class names that
        scored above 0.60 in at least one frame (after renaming and
        exclusion filtering), or ``None`` if anything failed. Failures are
        logged and never raised to the caller.
    """
    try:
        sample_rate, wav_data = wavfile.read(BytesIO(file_data))

        # Down-mix multi-channel audio to mono by averaging the channels.
        if wav_data.ndim > 1:
            wav_data = np.mean(wav_data, axis=1)
        sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

        # Scale by the int16 maximum.
        # NOTE(review): this presumes 16-bit PCM input — confirm for other
        # WAV sample formats.
        waveform = wav_data / tf.int16.max

        scores, _embeddings, _spectrogram = model(waveform)
        scores_np = scores.numpy()

        confidence_threshold = 0.60
        # A class qualifies when ANY frame scores strictly above the
        # threshold; reduce over the frame axis instead of looping in Python.
        confident_indices = np.flatnonzero((scores_np > confidence_threshold).any(axis=0))

        confident_classes = set()
        for index in confident_indices:
            class_name = class_names[index]
            # Rename first, then filter, so exclusion applies to the
            # reported (possibly shortened) name.
            class_name = _CLASS_NAME_ALIASES.get(class_name, class_name)
            if class_name not in _EXCLUDED_CLASSES:
                confident_classes.add(class_name)

        return {
            'url': url,
            'answer': sorted(confident_classes),
            "qcUser": None,
            "normalfileID": file_id,
        }

    except Exception as e:
        # Best-effort boundary: one bad file must not abort the whole batch.
        logging.error(f"Error processing {url}: {e}")
        return None

def get_audio_data(url, timeout=30):
    """Download ``url`` and return the response body as bytes.

    Args:
        url: Location of the audio file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host, stalling the whole batch.

    Returns:
        The raw response content.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

# def send_results_to_api(data, result_url):
#     headers = {"Content-Type": "application/json"}
#     try:
#         response = requests.post(result_url, json=data, headers=headers)
#         response.raise_for_status()  # Raise error for non-200 responses
#         return response.json()  # Return any JSON response from the API
#     except requests.exceptions.HTTPError as http_err:
#         logging.error(f"HTTP error occurred: {http_err}")
#         return {"error": f"HTTP error occurred: {http_err}"}
#     except requests.exceptions.RequestException as req_err:
#         logging.error(f"Request error occurred: {req_err}")
#         return {"error": f"Request error occurred: {req_err}"}
#     except ValueError as val_err:
#         logging.error(f"Error decoding JSON response: {val_err}")
#         return {"error": f"Error decoding JSON response: {val_err}"}

def process_audio(params):
    """Classify every audio URL listed in a JSON parameter string.

    Args:
        params: JSON text of the form
            ``{"urls": [...], "normalfileID": [...]}``. ``normalfileID`` is
            optional; missing IDs are treated as ``None``.

    Returns:
        On success, a JSON string ``{"solutions": [...]}`` with one entry per
        file that was processed successfully. If ``params`` is not valid JSON
        or is not a JSON object, a dict ``{"error": "..."}`` is returned
        instead.

    URLs that end in neither ``.mp3`` nor ``.wav`` (case-insensitive) are
    skipped with a warning and are not downloaded. Download and MP3 decoding
    errors are not caught here and propagate to the caller.
    """
    try:
        params = json.loads(params)
    except json.JSONDecodeError as e:
        return {"error": f"Invalid JSON input: {e.msg} at line {e.lineno} column {e.colno}"}

    # Valid JSON that is not an object (e.g. a list) has no .get().
    if not isinstance(params, dict):
        return {"error": "Invalid JSON input: expected a JSON object with a 'urls' list"}

    audio_files = params.get("urls") or []
    file_ids = list(params.get("normalfileID") or [])
    # zip() stops at the shorter input; pad the IDs so that a short or missing
    # ID list never silently drops URLs.
    file_ids.extend([None] * (len(audio_files) - len(file_ids)))
    # api = params.get("api", "")
    # job_id = params.get("job_id", "")

    solutions = []
    for audio_url, file_id in zip(audio_files, file_ids):
        lowered_url = audio_url.lower()
        if lowered_url.endswith(".mp3"):
            needs_conversion = True
        elif lowered_url.endswith(".wav"):
            needs_conversion = False
        else:
            # Decide before downloading: an unsupported file must neither be
            # fetched nor reuse the previous iteration's result.
            logging.warning("Skipping unsupported audio URL (expected .mp3 or .wav): %s", audio_url)
            continue

        audio_data = get_audio_data(audio_url)
        if needs_conversion:
            audio_data = convert_mp3_to_wav(audio_data)

        result = process_audio_file(audio_data, audio_url, file_id)
        if result:
            solutions.append(result)

    # result_url = f"{api}/{job_id}"
    # send_results_to_api(solutions, result_url)

    return json.dumps({"solutions": solutions})

import gradio as gr

# The example in the label must be valid JSON (double-quoted strings): the
# textbox content is parsed with json.loads, which rejects single quotes.
inputt = gr.Textbox(label='Parameters (JSON format) Eg. {"urls":["file1.mp3","file2.wav"]}')
outputs = gr.JSON()

# Single-function UI: the JSON text goes to process_audio and its return value
# is rendered by the JSON output component.
application = gr.Interface(fn=process_audio, inputs=inputt, outputs=outputs, title="Audio Classification with API Integration")
application.launch()