Spaces:
Running on CPU Upgrade

ziem-io committed on
Commit
9a2984d
·
1 Parent(s): b81d9a3

Update: Refactor code

Browse files
Files changed (2) hide show
  1. app.py +50 -37
  2. lib/bert_regressor_utils.py +26 -24
app.py CHANGED
@@ -115,72 +115,81 @@ def _translate_en(text: str, target_lang: str = "EN-GB"):
115
  result = deepl_client.translate_text(text, target_lang=target_lang)
116
  return result.text
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  ### Do actual prediction #########################################################
119
  @spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
120
  def predict(review_raw: str, do_cleanup: bool):
121
-
122
  review_raw = (review_raw or "").strip()
123
  is_translated = False
124
  html_info_out = ""
125
 
126
- # Abort if no text if given
127
  if not review_raw:
128
- # immer drei Outputs zurückgeben
129
  return "Please enter a review.", "", {}
130
-
131
- # Check for lang of text
132
  review_is_eng, review_lang_prob = _is_eng(review_raw)
133
 
134
- # Abort if text is not english
135
  if not review_is_eng:
136
  review_raw = _translate_en(review_raw)
137
- html_info_out += f"""<strong style='margin-bottom: 0.5em'>Your text has been automatically translated:</strong>
138
- <p>{review_raw}</p>
139
- """
 
140
  is_translated = True
141
-
 
142
  prediction_flavours = {}
143
  prediction_flavours_list = [0, 0, 0, 0, 0, 0, 0, 0]
144
 
145
- # Do actual predictions if is english and whisky note
146
  t_start_flavours = time.time()
147
 
148
- # Default-Werte, damit spätere Nutzung immer safe ist
 
149
  review_clean = review_raw
150
  cleanup_meta = []
151
- has_review = True # ohne Cleanup gehen wir davon aus: kompletten Text klassifizieren
152
 
153
- # Cleanup nur wenn Checkbox aktiv
154
  if do_cleanup:
155
- review_clean, cleanup_meta, has_review = cleanup_tasting_note(
156
  review_raw,
157
  model_cleanup,
158
  tokenizer_cleanup,
159
  device
160
  )
161
 
162
- html_info_out += (
163
- "<strong style='margin-bottom: 0.5em; display:block;'>"
164
- "BETA: Your text has been cleaned up:</strong>"
165
- "<p>"
166
- )
167
-
168
- for s in cleanup_meta:
169
- sent = html.escape(s.get("sentence", ""))
170
-
171
- if s.get("is_note"):
172
- html_info_out += f"{sent} "
173
- else:
174
- html_info_out += f"<span style='text-decoration: line-through; color: gray;'>{sent}</span> "
175
-
176
- html_info_out += "</p>"
177
-
178
- if not has_review:
179
  html_info_out += "<strong>No tasting notes detected.</strong>"
180
 
181
-
182
- # Nur vorhersagen, wenn wir Tasting Notes haben (oder Cleanup aus ist)
183
- if has_review:
184
  prediction_flavours = predict_flavours(
185
  review_clean,
186
  model_classify,
@@ -189,18 +198,21 @@ def predict(review_raw: str, do_cleanup: bool):
189
  )
190
  prediction_flavours_list = list(prediction_flavours.values())
191
 
 
192
  t_end_flavours = time.time()
193
 
 
194
  html_wheel_out = build_svg_with_values(prediction_flavours_list)
195
 
 
196
  json_out = {
197
  "result": dict(prediction_flavours.items()),
198
- "range": { "min": 0, "max": 4 },
199
  "review": {
200
  "raw": review_raw,
201
  "clean": review_clean,
202
  "clean_meta": cleanup_meta,
203
- "has_review": has_review,
204
  },
205
  "models": {
206
  "cleanup": MODEL_FILE_CLEANUP,
@@ -211,6 +223,7 @@ def predict(review_raw: str, do_cleanup: bool):
211
  "duration": round((t_end_flavours - t_start_flavours), 3),
212
  }
213
 
 
214
  return html_info_out, html_wheel_out, json_out
215
 
216
  ##################################################################################
 
115
  result = deepl_client.translate_text(text, target_lang=target_lang)
116
  return result.text
117
 
118
+ def _render_cleanup_html(cleanup_meta):
119
+ """
120
+ Renders cleanup_meta into HTML with struck-through non-note sentences.
121
+ """
122
+ html_info_out = "<strong style='display:block'>Your text has been cleaned up:</strong><p>"
123
+
124
+ for s in cleanup_meta:
125
+ sent = html.escape(s.get("sentence", ""))
126
+
127
+ if s.get("is_note"):
128
+ html_out += f"{sent} "
129
+ else:
130
+ html_out += (
131
+ f"<span style='text-decoration: line-through; color: gray;'>"
132
+ f"{sent}</span> "
133
+ )
134
+
135
+ html_info_out += "</p>"
136
+
137
+ return html_out
138
+
139
  ### Do actual prediction #########################################################
140
  @spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
141
  def predict(review_raw: str, do_cleanup: bool):
142
+ # Normalize input (handle None and trim whitespace)
143
  review_raw = (review_raw or "").strip()
144
  is_translated = False
145
  html_info_out = ""
146
 
147
+ # Abort early if no text is provided
148
  if not review_raw:
 
149
  return "Please enter a review.", "", {}
150
+
151
+ # Detect language of the input text
152
  review_is_eng, review_lang_prob = _is_eng(review_raw)
153
 
154
+ # Automatically translate non-English text
155
  if not review_is_eng:
156
  review_raw = _translate_en(review_raw)
157
+ html_info_out += (
158
+ "<strong style='display:block'>Your text has been automatically translated:</strong>"
159
+ f"<p>{html.escape(review_raw)}</p>"
160
+ )
161
  is_translated = True
162
+
163
+ # Initialize prediction outputs
164
  prediction_flavours = {}
165
  prediction_flavours_list = [0, 0, 0, 0, 0, 0, 0, 0]
166
 
167
+ # Start timing the model inference
168
  t_start_flavours = time.time()
169
 
170
+ # Default values to ensure all variables are always defined
171
+ # Without cleanup enabled, the full text is treated as a tasting note
172
  review_clean = review_raw
173
  cleanup_meta = []
174
+ review_status = "review_only"
175
 
176
+ # Apply cleanup only if the checkbox is enabled
177
  if do_cleanup:
178
+ review_clean, cleanup_meta, review_status = cleanup_tasting_note(
179
  review_raw,
180
  model_cleanup,
181
  tokenizer_cleanup,
182
  device
183
  )
184
 
185
+ # Render cleanup visualization only if the text was actually modified
186
+ if review_status != "review_only":
187
+ html_info_out += _render_cleanup_html(cleanup_meta)
188
+ elif review_status == "noise_only":
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  html_info_out += "<strong>No tasting notes detected.</strong>"
190
 
191
+ # Run flavour prediction only if review content is present
192
+ if (not do_cleanup) or (review_status in ("review_only", "mixed")):
 
193
  prediction_flavours = predict_flavours(
194
  review_clean,
195
  model_classify,
 
198
  )
199
  prediction_flavours_list = list(prediction_flavours.values())
200
 
201
+ # Stop timing inference
202
  t_end_flavours = time.time()
203
 
204
+ # Build the flavour wheel SVG
205
  html_wheel_out = build_svg_with_values(prediction_flavours_list)
206
 
207
+ # Prepare structured JSON output
208
  json_out = {
209
  "result": dict(prediction_flavours.items()),
210
+ "range": {"min": 0, "max": 4},
211
  "review": {
212
  "raw": review_raw,
213
  "clean": review_clean,
214
  "clean_meta": cleanup_meta,
215
+ "status": review_status
216
  },
217
  "models": {
218
  "cleanup": MODEL_FILE_CLEANUP,
 
223
  "duration": round((t_end_flavours - t_start_flavours), 3),
224
  }
225
 
226
+ # Return HTML info, flavour wheel, and JSON output
227
  return html_info_out, html_wheel_out, json_out
228
 
229
  ##################################################################################
lib/bert_regressor_utils.py CHANGED
@@ -271,45 +271,47 @@ def text_to_sentences(text):
271
  ###################################################################################
272
 
273
  def cleanup_tasting_note(text, model, tokenizer, device, threshold=0.5):
274
- # Initialize an empty list to store sentences that are identified as tasting notes
275
  good_sentences = []
276
  scored_sentences = []
277
 
 
 
 
278
  sentences = text_to_sentences(text)
279
-
280
- # Iterate through each sentence detected in the processed document
281
- for sentence in sentences:
282
 
283
- # Leere Schnipsel überspringen
284
  if not sentence:
285
  continue
286
-
287
- # AI Filter section (Your Guardrail model)
288
- # Predict if the current sentence is a review using the loaded model and tokenizer
289
  result = predict_is_review(sentence, model, tokenizer, device)
290
-
291
- # Extract the probability score from the result and round it to 3 decimal places
292
  score = round(result["probability"], 3)
 
293
 
294
- # If valid, append the clean sentence text to the list
295
  scored_sentences.append({
296
- 'is_note': score > threshold,
297
- 'score': score,
298
- 'sentence': sentence
299
- })
300
-
301
- # Check if the probability is greater than 50% (threshold for being a tasting note)
302
- if score > threshold:
303
- # If valid, append the clean sentence text to the list
304
  good_sentences.append(sentence)
 
 
 
305
 
306
  new_text = " ".join(good_sentences)
307
 
308
- # True, wenn mindestens ein Satz als Tasting Note erkannt wurde
309
- has_review = len(good_sentences) > 0
310
-
311
- # Join all valid sentences into a single string separated by spaces and return it
312
- return new_text, scored_sentences, has_review
 
 
 
 
 
 
313
 
314
  ###################################################################################
315
 
 
271
  ###################################################################################
272
 
273
  def cleanup_tasting_note(text, model, tokenizer, device, threshold=0.5):
 
274
  good_sentences = []
275
  scored_sentences = []
276
 
277
+ has_review = False
278
+ has_noise = False
279
+
280
  sentences = text_to_sentences(text)
 
 
 
281
 
282
+ for sentence in sentences:
283
  if not sentence:
284
  continue
285
+
 
 
286
  result = predict_is_review(sentence, model, tokenizer, device)
 
 
287
  score = round(result["probability"], 3)
288
+ is_note = score > threshold
289
 
 
290
  scored_sentences.append({
291
+ "is_note": is_note,
292
+ "score": score,
293
+ "sentence": sentence
294
+ })
295
+
296
+ if is_note:
 
 
297
  good_sentences.append(sentence)
298
+ has_review = True
299
+ else:
300
+ has_noise = True
301
 
302
  new_text = " ".join(good_sentences)
303
 
304
+ # Status bestimmen
305
+ if has_review and has_noise:
306
+ review_status = "mixed"
307
+ elif has_review:
308
+ review_status = "review_only"
309
+ elif has_noise:
310
+ review_status = "noise_only"
311
+ else:
312
+ review_status = "noise_only" # leerer Text → effektiv kein Review
313
+
314
+ return new_text, scored_sentences, review_status
315
 
316
  ###################################################################################
317