File size: 17,967 Bytes
60ff7f8
5f1346e
 
798edfa
75dfef8
4599119
8482391
506c0c1
29f1cd2
4599119
798edfa
 
cc810ad
 
 
 
 
b6d7b0f
cc810ad
 
 
 
 
 
 
 
 
 
b57f25a
 
 
 
a9cc0fe
b57f25a
 
 
 
 
7168f17
a9d08fa
7168f17
 
b57f25a
3d0fba7
333839c
 
 
 
 
 
 
 
 
 
 
 
4690187
709b3b0
333839c
3d0fba7
333839c
cc810ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f39d504
 
 
 
9d2e15e
f39d504
 
 
 
9d2e15e
 
728aa1f
 
 
 
 
 
 
 
 
 
 
a0661eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f1346e
798edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6bf613
798edfa
 
c6bf613
a0661eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798edfa
 
 
 
 
 
 
 
 
 
 
 
 
cc810ad
 
 
 
c6bf613
cc810ad
 
424902e
c6bf613
424902e
 
cc810ad
 
 
ec9588e
cc810ad
 
c6bf613
cc810ad
 
424902e
c6bf613
424902e
 
cc810ad
 
 
 
 
f280192
798edfa
cc810ad
 
de507f6
cc810ad
 
de507f6
798edfa
 
 
 
 
 
 
 
 
 
dbc3265
798edfa
 
a0661eb
 
 
 
 
 
 
 
798edfa
75dfef8
798edfa
75dfef8
 
 
 
 
 
 
 
 
 
60ff7f8
333839c
798edfa
f280192
 
798edfa
b6d7b0f
 
 
 
 
 
 
 
798edfa
cc810ad
b57f25a
75a4f60
333839c
 
 
 
b57f25a
333839c
 
 
 
 
 
 
708298b
 
 
 
92746fc
708298b
 
 
 
 
 
 
 
 
 
 
 
 
cc810ad
 
 
 
 
 
 
 
 
 
 
 
 
 
b57f25a
 
 
 
 
 
 
 
92746fc
cc810ad
708298b
cc810ad
 
f280192
 
cc810ad
 
 
f280192
 
4599119
75a739b
4599119
cc810ad
 
4599119
 
 
798edfa
b57f25a
f280192
b57f25a
f280192
 
ec3307f
 
f280192
 
 
cc810ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f280192
b57f25a
f280192
 
cc810ad
 
f280192
60ff7f8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
import gradio as gr
import os
import sys
import json
from PIL import Image,ImageDraw
import tempfile
from inference_sdk import InferenceHTTPClient
from ultralytics import YOLO
import ultralytics

# Class names this app keeps from detector output: the ten digit classes plus
# 'tick' and 'fraction'. Used to filter Roboflow predictions and to group boxes
# when building the textual description.
classes = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'tick', 'fraction']
# Credentials come from the environment; each is None when its variable is unset.
API_KEY = os.environ.get("ROBOFLOW_API_KEY")
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

def RoboFlowGetOutlineBoxesPIL(pil_img):
    """Run the hosted Roboflow workflow on ``pil_img`` and return its raw response.

    The client authenticates with the module-level ``API_KEY``. The response is
    returned untouched; ``parse_roboflow_result`` knows how to unpack it.
    """
    roboflow = InferenceHTTPClient(
        api_url="https://detect.roboflow.com",
        api_key=API_KEY,
    )
    # use_cache=True caches the workflow definition for 15 minutes.
    return roboflow.run_workflow(
        workspace_name="mathnet-mmpuo",
        workflow_id="custom-workflow-2",
        images={"image": pil_img},
        use_cache=True,
    )
    
def localOutlineBox(image, selected_model="yolo_accurate"):
    """Run a local YOLO model on ``image`` and return its detections as dicts.

    Args:
        image: Anything the ultralytics ``YOLO`` callable accepts (the UI
            passes a PIL image).
        selected_model: Key into the known weight files ("yolo_accurate" or
            "yolo_extra_large"). Unknown keys use the default weights.

    Returns:
        A list with one dict per detected box, holding ``class``, ``x``, ``y``,
        ``width``, ``height`` (taken from the box's ``xywh``), ``confidence``
        and ``class_id``.
    """
    default_path = "./vt_dataset_yolov12_v7_weights.pt"
    model_paths = {
        "yolo_accurate": default_path,
        "yolo_extra_large": "./VT_dataset_2_Yolov12_Extra_large.pt",
    }

    # Honour the caller's choice. Previously the looked-up path was immediately
    # overwritten with the default weights, so the model dropdown had no effect.
    model_path = model_paths.get(selected_model, default_path)
    if model_path != default_path and not os.path.exists(model_path):
        # Keep the app usable when the optional weights are not deployed.
        print(f"model is not available: {model_path}, falling back to {default_path}")
        model_path = default_path

    if os.path.exists(model_path):
        print("model exists")
        print(f'current model path is: {model_path}')
    else:
        print("model is not available")

    model = YOLO(model_path)
    # Only a single image is submitted, so only the first result is relevant.
    detections = model(image, verbose=False)[0]
    names = model.names

    all_box_info = []
    for bb in detections.boxes:
        x, y, w, h = bb.xywh[0].tolist()
        class_id = int(bb.cls)
        # Key order is kept stable because this list is later dumped to JSON.
        all_box_info.append({
            'class': names[class_id],
            'x': x,
            'y': y,
            'width': w,
            'height': h,
            'confidence': float(bb.conf),
            'class_id': class_id,
        })
    print(f"total length of all box_info is: {len(all_box_info)}")
    return all_box_info
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two boxes.

    Each box is a dict with keys ``x``, ``y`` (box centre), ``width`` and
    ``height``. Boxes that do not overlap, or only touch, yield ``0.0``.
    """
    def corners(box):
        # Convert centre/size form to (left, top, right, bottom).
        return (
            box['x'] - box['width']/2,
            box['y'] - box['height']/2,
            box['x'] + box['width']/2,
            box['y'] + box['height']/2,
        )

    a_left, a_top, a_right, a_bottom = corners(box1)
    b_left, b_top, b_right, b_bottom = corners(box2)

    # Edges of the intersection rectangle.
    left = max(a_left, b_left)
    top = max(a_top, b_top)
    right = min(a_right, b_right)
    bottom = min(a_bottom, b_bottom)

    # An empty or degenerate intersection means no overlap at all.
    if not (left < right and top < bottom):
        return 0.0
    inter_area = (right - left) * (bottom - top)

    # Union = sum of both areas minus the part counted twice.
    area_a = box1['width'] * box1['height']
    area_b = box2['width'] * box2['height']
    union_area = area_a + area_b - inter_area

    return inter_area / union_area

def parse_roboflow_result(result, kept_classes):
    """Extract the predictions whose ``class`` is in ``kept_classes``.

    ``result`` is the workflow response: the prediction list is read from
    ``result[0]['predictions']['predictions']['predictions']``. Matching
    prediction dicts are returned as-is, in their original order.
    """
    detections = result[0]['predictions']['predictions']['predictions']
    return [det for det in detections if det['class'] in kept_classes]

def filter_overlapping_boxes(filter_box_info, iou_threshold=0.5):
    """Drop boxes that overlap an already-kept box by more than ``iou_threshold``.

    Digit boxes and all remaining boxes are deduplicated independently.
    Digits are visited from highest to lowest ``confidence``; the remaining
    boxes are visited in input order. Returns the kept non-digit boxes
    followed by the kept digit boxes.
    """
    digit_classes = {'zero', 'one', 'two', 'three', 'four',
                     'five', 'six', 'seven', 'eight', 'nine'}

    def suppress(candidates):
        # Greedy pass: keep a box only if it does not clash with any survivor.
        survivors = []
        for candidate in candidates:
            clashes = any(
                calculate_iou(candidate, survivor) > iou_threshold
                for survivor in survivors
            )
            if not clashes:
                survivors.append(candidate)
        return survivors

    # Split digits from everything else (ticks, fractions, ...).
    digits = sorted(
        (box for box in filter_box_info if box['class'] in digit_classes),
        key=lambda box: box['confidence'],
        reverse=True,
    )
    others = [box for box in filter_box_info if box['class'] not in digit_classes]

    return suppress(others) + suppress(digits)
    
def getCenterXDis(box1, box2):
    """Return the horizontal distance between the centres of two boxes.

    Both boxes are 4-item sequences ``(x0, y0, x1, y1)``; only the x values
    contribute to the result.
    """
    left_a, _, right_a, _ = box1
    left_b, _, right_b, _ = box2
    # Centre x of each box is the midpoint of its left and right edges.
    mid_a = (left_a + right_a) / 2
    mid_b = (left_b + right_b) / 2
    return abs(mid_a - mid_b)
    
def to_ordinal(n):
    """
    Render an integer as an English ordinal string.
    e.g., 1 -> "1st", 2 -> "2nd", 13 -> "13th"

    Raises TypeError when ``n`` is not an int.
    """
    if not isinstance(n, int):
        raise TypeError("Input must be an integer.")
    # 11, 12 and 13 always take "th", regardless of their last digit.
    is_teen = 11 <= (n % 100) <= 13
    if is_teen:
        suffix = 'th'
    else:
        # Otherwise the last digit decides; anything but 1/2/3 is "th".
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f"{n}{suffix}"

def packFilterBoxInfo(filter_box_info):
    """Read each fraction box as a ``"numerator/denominator"`` string.

    Boxes are dicts with ``class``, ``x``, ``y`` (centre), ``width`` and
    ``height``. Fractions are processed from left to right by left edge. A
    digit belongs to a fraction when its centre lies inside the fraction box;
    digits whose centre is above the fraction's centre form the numerator, the
    rest the denominator. A side with no digits is reported as ``"?"``.
    """
    # Map detector class names to digit characters.
    digit_classes = {
        'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
        'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'zero': '0'
    }

    def left_edge(box):
        return box['x'] - box['width']/2

    def read_digits(boxes):
        # Concatenate digits left to right; "?" marks a missing side.
        text = ''.join(digit_classes[b['class']] for b in sorted(boxes, key=left_edge))
        return text if text != "" else "?"

    digits = [b for b in filter_box_info if b['class'] in digit_classes]
    fractions = sorted(
        (b for b in filter_box_info if b['class'] == 'fraction'),
        key=left_edge,
    )

    fraction_values = []
    for frac in fractions:
        cx, cy = frac['x'], frac['y']
        half_w, half_h = frac['width']/2, frac['height']/2

        # Digits whose centre falls inside this fraction box.
        members = [
            d for d in digits
            if cx - half_w <= d['x'] <= cx + half_w
            and cy - half_h <= d['y'] <= cy + half_h
        ]
        # Split at the fraction's vertical centre: above -> numerator.
        above = [d for d in members if d['y'] < cy]
        below = [d for d in members if not d['y'] < cy]

        fraction_values.append(f"{read_digits(above)}/{read_digits(below)}")

    return fraction_values
# Boxes are assumed to be (top-left x, top-left y, bottom-right x, bottom-right y).
def getOverlap(box1, box2):
    """Return the fraction of ``box2``'s area that ``box1`` covers.

    Yields ``0.0`` when the boxes do not overlap (or merely touch). Note the
    measure is asymmetric: it is normalised by ``box2``'s area only.
    """
    a_left, a_top, a_right, a_bottom = box1
    b_left, b_top, b_right, b_bottom = box2

    left, top = max(a_left, b_left), max(a_top, b_top)
    right, bottom = min(a_right, b_right), min(a_bottom, b_bottom)

    if not (left < right and top < bottom):
        return 0.0

    shared_area = (right - left) * (bottom - top)
    second_area = abs(b_right - b_left) * abs(b_bottom - b_top)
    return shared_area / second_area
    
def tick2fraction(ticks, fractions):
    """Greedily pair each fraction with its horizontally nearest unused tick.

    Fractions are handled in the given order; each tick is used at most once.
    Pairing stops as soon as no unused tick remains. Each pairing is encoded
    as ``"T<tick index>-F<fraction index>"``.
    """
    pairings = []
    taken = set()
    for frac_idx, frac in enumerate(fractions):
        candidates = [
            (getCenterXDis(tick, frac), tick_idx)
            for tick_idx, tick in enumerate(ticks)
            if tick_idx not in taken
        ]
        if not candidates:
            # Every tick is already assigned; later fractions stay unpaired.
            break
        # Ties resolve to the lowest tick index (first minimum wins).
        _, nearest = min(candidates, key=lambda pair: pair[0])
        taken.add(nearest)
        pairings.append(f"T{nearest}-F{frac_idx}")
    return pairings
    
def generate_textual_description(box_info):
    """Build the plain-text summary of the detected number-line elements.

    The text reports, in order: a fixed intro sentence; the left-most
    standalone zero and right-most standalone one (if any); the tick and
    fraction boxes with their corner coordinates; the fraction values; and
    which tick each fraction is associated with.

    ``box_info`` is a list of dicts with ``class``, ``x``, ``y`` (centre),
    ``width`` and ``height``. Boxes whose class is not in the module-level
    ``classes`` list are ignored.
    """
    fraction_values = packFilterBoxInfo(box_info)

    # Group boxes per class, converted to [left, top, right, bottom].
    class_summary = {c: [] for c in classes}
    for box in box_info:
        bucket = class_summary.get(box['class'])
        if bucket is None:
            continue
        x, y, w, h = box['x'], box['y'], box['width'], box['height']
        bucket.append([x-w/2, y-h/2, x+w/2, y+h/2])

    def standalone(candidates):
        # Keep digits that are not (at least half) inside a fraction or tick box,
        # ordered by their left edge.
        kept = []
        for candidate in candidates:
            containers = class_summary['fraction'] + class_summary['tick']
            if not any(getOverlap(container, candidate) >= 0.5 for container in containers):
                kept.append(candidate)
        kept.sort(key=lambda corners: corners[0])
        return kept

    kept_zero_boxes = standalone(class_summary['zero'])
    kept_one_boxes = standalone(class_summary['one'])

    parts = ["The key elements are interpreted via visual translator. Their coordinates are represented as outlined boxes (top-left, bottom-right)."]

    if kept_zero_boxes:
        z = kept_zero_boxes[0]  # left-most standalone zero
        parts.append(f"\nThere is a zero on the left side of the number line. Its coordinate is (({z[0]:.2f}, {z[1]:.2f}), ({z[2]:.2f}, {z[3]:.2f}))")
    if kept_one_boxes:
        o = kept_one_boxes[-1]  # right-most standalone one
        parts.append(f"\nThere is a one on the right side of the number line. Its coordinate is (({o[0]:.2f}, {o[1]:.2f}), ({o[2]:.2f}, {o[3]:.2f}))")

    for class_name, boxes in class_summary.items():
        if class_name not in ('fraction', 'tick'):
            continue
        # Sort in place by left edge: the tick/fraction pairing below relies on
        # these lists being in left-to-right order.
        boxes.sort(key=lambda corners: corners[0])
        if not boxes:
            continue
        parts.append(f"\nThere are {len(boxes)} {class_name}s. Their coordinates are: ")
        parts.extend(
            f"(({b[0]:.2f}, {b[1]:.2f}), ({b[2]:.2f}, {b[3]:.2f})), "
            for b in boxes
        )
        if class_name == "fraction":
            parts.append(f"\nThe fraction numbers from left to right are: {fraction_values}. ")

    for pairing in tick2fraction(class_summary['tick'], class_summary['fraction']):
        tick_tag, fraction_tag = pairing.split('-')
        # Tags look like "T3" / "F0"; indices are reported 1-based.
        fraction_pos = to_ordinal(int(fraction_tag[1:]) + 1)
        tick_pos = to_ordinal(int(tick_tag[1:]) + 1)
        parts.append(f"{fraction_pos} fraction is associated with {tick_pos} tick. ")

    return "".join(parts)
def drawWithAllBox_info(pil_image, box_info):
    """Outline every box in ``box_info`` on ``pil_image`` and return the image.

    Drawing happens directly on ``pil_image`` (no copy is made). The outline
    colour is picked by ``class_id``; ids beyond the palette are drawn black.
    """
    palette = ['red', 'green', 'blue', 'orange', 'purple', 'cyan', 'magenta', 'yellow', 'brown', 'pink', 'gray', 'lime', 'navy']
    canvas = ImageDraw.Draw(pil_image)
    for box in box_info:
        cx, cy = box['x'], box['y']
        bw, bh = box['width'], box['height']
        class_id = box['class_id']
        outline = palette[class_id] if class_id < len(palette) else 'black'
        # Boxes are centre-based; convert to corner coordinates for PIL.
        canvas.rectangle([cx-bw/2, cy-bh/2, cx+bw/2, cy+bh/2], outline=outline, width=2)
    return pil_image


def online_process_image(image):
    """Detect boxes with the hosted Roboflow workflow and summarise them.

    Returns ``(annotated image, textual description, JSON string)``. When
    ``image`` is ``None`` the result is ``(None, "", "")`` so callers always
    receive three values.
    """
    if image is None:
        return None, "", ""

    # Work on a copy so the boxes are not drawn on the caller's image.
    if hasattr(image, 'copy'):
        pil_image = image.copy()
    else:
        pil_image = Image.fromarray(image)

    # Chaining the calls keeps the large intermediate results unnamed, so they
    # can be released as soon as the next stage has consumed them.
    kept_box_info = filter_overlapping_boxes(
        parse_roboflow_result(RoboFlowGetOutlineBoxesPIL(pil_image), classes)
    )

    boxed_img = drawWithAllBox_info(pil_image, kept_box_info)
    textual = generate_textual_description(kept_box_info)
    return boxed_img, textual, json.dumps(kept_box_info, indent=2)

def process_image(image, selected_model = "yolo_accurate"):
    """Detect boxes with a local YOLO model and summarise them.

    Returns ``(annotated image, textual description, JSON string)``. When
    ``image`` is ``None`` the result is ``(None, "", "")`` so callers always
    receive three values. The boxes are drawn directly on ``image``.
    """
    print("start processing image")
    if image is None:
        return None, "", ""

    # The raw detections are consumed immediately and never kept by name.
    kept_box_info = filter_overlapping_boxes(localOutlineBox(image, selected_model))

    boxed_img = drawWithAllBox_info(image, kept_box_info)
    textual = generate_textual_description(kept_box_info)
    return boxed_img, textual, json.dumps(kept_box_info, indent=2)
    
# Build the Gradio UI: a token prompt that gates a (initially hidden) detection panel.
with gr.Blocks(
    # Custom CSS enlarging the "Textual Description" textbox font. The first
    # rule targets that textbox's textarea directly; the second rule is a
    # fallback set of selectors (#desc, .large-font, generic textbox).
    css=r"""
    /* 精确命中这个 Textbox 的文本区 */
    textarea[aria-label="Textual Description"] {
        font-size: 28px !important;
        line-height: 1.6 !important;
        font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
        color: #222 !important;
    }

    /* 兜底选择器 */
    #desc textarea,
    .large-font textarea,
    div[data-testid="textbox"] textarea {
        font-size: 24px !important;
    }
    """
    ) as demo:
    # --- Authentication Layer ---
    # Password-style textbox plus a button; the click handler is wired up below.
    with gr.Row():
        token_input = gr.Textbox(
            label="Invite Token",
            type="password",
            placeholder="Enter your invite token to unlock the app"
        )
        unlock_btn = gr.Button("Unlock")

    # Shows the outcome of the token check.
    status_text = gr.Markdown()

    # --- Main Application (initially hidden) ---
    # Made visible by check_token() once a valid token is supplied.
    with gr.Column(visible=False) as main_app:
        img_input = gr.Image(type="pil", label="Upload Image")

        with gr.Row():
            # Choices correspond to the model keys understood by localOutlineBox().
            model_list = gr.Dropdown(
                choices=["yolo_accurate", "yolo_extra_large"],
                value="yolo_accurate",
                label="Select Model",
                info="Choose the YOLO model for detection"
            )
            run_btn = gr.Button("Run Detection")
        img_out = gr.Image(type="pil", label="Image with Boxes")
        # elem_id / elem_classes are the hooks used by the CSS above.
        text_out = gr.Textbox(label="Textual Description", lines=8, elem_id="desc", elem_classes=["large-font"])
        # Holds the JSON string of the latest detection result.
        json_state = gr.State("")
        download_btn = gr.DownloadButton(
            label="Download Box Info as JSON",
            interactive=False  # Start as disabled.
        )

    # --- Backend Functions ---
    def create_json_file(json_str):
        """Write ``json_str`` to a new temporary ``.json`` file and return its path.

        Returns ``None`` when ``json_str`` is empty. The file is named
        ``detection_info_*.json`` and is not deleted automatically.
        """
        if json_str:
            handle = tempfile.NamedTemporaryFile(
                mode='w',
                encoding='utf-8',
                prefix="detection_info_",
                suffix='.json',
                delete=False,
            )
            with handle:
                handle.write(json_str)
            return handle.name
        return None

    def _process_and_prepare_download(image, selected_model):
        """Run detection and return the values for the four output components.

        Returns ``(annotated image, description, JSON string, download-button
        update)``; the update points the button at a freshly written JSON file
        and enables it.
        """
        boxed_img, textual, json_str = process_image(image, selected_model)

        # Use the legacy gr.update() for compatibility with older Gradio versions.
        download_update = gr.update(
            value=create_json_file(json_str),
            interactive=True,
        )
        return boxed_img, textual, json_str, download_update

    def check_token(token):
        """Validate the invite token and decide whether to reveal the main app.

        Args:
            token: The text entered in the token box (``None`` is treated as
                an empty string).

        Returns:
            ``(visibility update for the main column, status message)``. Access
            is always denied when ``ACCESS_TOKEN`` is unset or empty.

        NOTE(review): this only toggles visibility of the main column; the
        detection handler does not re-check the token itself. Confirm whether
        that handler must also be protected.
        """
        import hmac  # function-local so this fix needs no file-level import change

        # Compare as UTF-8 bytes: compare_digest rejects non-ASCII str inputs,
        # and a constant-time comparison avoids leaking how many leading
        # characters of a guess were correct.
        supplied = (token or "").encode("utf-8")
        expected = (ACCESS_TOKEN or "").encode("utf-8")
        if expected and hmac.compare_digest(supplied, expected):
            return gr.update(visible=True), "Token accepted. You can now use the application."
        return gr.update(visible=False), "Invalid token. Please try again."

    # --- Event Listeners ---
    # Unlock button: check the entered token, then show/hide the main column
    # and write the outcome into the status text.
    unlock_btn.click(
        check_token,
        inputs=token_input,
        outputs=[main_app, status_text]
    )

    # Run button: detect on the uploaded image with the chosen model and fill
    # the annotated image, description, JSON state and download button.
    run_btn.click(
        _process_and_prepare_download,
        inputs=[img_input, model_list],
        # The output now includes the download button itself.
        outputs=[img_out, text_out, json_state, download_btn]
    )

    # The download_btn no longer needs its own click event.
# Start the app; this runs at module level, i.e. whenever the file is executed.
demo.launch()