File size: 4,432 Bytes
3e7d998
 
32853d9
3e7d998
 
 
 
 
 
 
feca03d
3e7d998
 
 
feca03d
3e7d998
 
 
 
feca03d
3e7d998
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45f0a42
3e7d998
 
 
 
 
 
45f0a42
3e7d998
 
 
 
 
884798e
45f0a42
3e7d998
 
 
 
 
 
 
45f0a42
3e7d998
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45f0a42
feca03d
 
 
 
3e7d998
 
45f0a42
 
3e7d998
feca03d
 
 
 
 
 
 
 
 
 
 
 
3e7d998
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
{
  "title": "Edge Inference Benchmarks",
  "subtitle": "Compare throughput and latency across devices and model variants.",
  "model_column": "model",
  "model_family_column": "model_family",
  "model_link_prefix": "https://huggingface.co/",
  "optimized_org": "embedl",
  "filters": [
    {
      "column": "type",
      "label": "MODALITY"
    },
    {
      "column": "batch",
      "label": "BATCH SIZE",
      "type": "number"
    },
    {
      "column": "device",
      "label": "DEVICE",
      "value_labels": {
        "orin_nano": "Jetson Orin Nano Super",
        "orin_nano_super": "Jetson Orin Nano Super",
        "agx_orin": "Jetson AGX Orin",
        "agx_thor": "Jetson AGX Thor"
      }
    }
  ],
  "metrics": [
    {
      "column": "tps",
      "label": "Tokens / sec",
      "short": "TPS ↑",
      "higher_is_better": true,
      "description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase."
    },
    {
      "column": "tpot",
      "label": "Time per Output Token (ms)",
      "short": "TPOT(ms) ↓",
      "higher_is_better": false,
      "description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens."
    },
    {
      "column": "ttft",
      "label": "Time to First Token (ms)",
      "short": "TTFT(ms) ↓",
      "higher_is_better": false,
      "description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, and KV cache initialization."
    },
    {
      "column": "e2e",
      "label": "End-to-End Latency (sec)",
      "short": "E2E(s) ↓",
      "higher_is_better": false,
      "description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency."
    }
  ],
  "display_columns": [
    {
      "column": "res",
      "label": "RESOLUTION",
      "visible_when": {
        "type": [
          "video",
          "image"
        ]
      }
    },
    {
      "column": "fps",
      "label": "FPS",
      "type": "number",
      "visible_when": {
        "type": [
          "video"
        ]
      }
    },
    {
      "column": "frames",
      "label": "Frames",
      "type": "number",
      "visible_when": {
        "type": [
          "video"
        ]
      }
    }
  ],
  "chart": {
    "default_metric": "tps",
    "group_by": "device",
    "scenarios": [
      {
        "label": "Text",
        "match": {
          "type": "text"
        }
      },
      {
        "label": "Image · 1280×720",
        "match": {
          "type": "image",
          "res": "1280x720"
        }
      },
      {
        "label": "Video · 1280×720 · 4 FPS",
        "match": {
          "type": "video",
          "res": "1280x720",
          "fps": 4
        }
      }
    ]
  },
  "table_sort": [
    {
      "column": "res",
      "direction": "asc"
    },
    {
      "column": "fps",
      "direction": "desc"
    }
  ],
  "table_group_by": "model",
  "model_families": {
    "Cosmos-Reason2-2B": {
      "data_file": "data/Cosmos-Reason2.csv",
      "table_group_by": [
        "res",
        "fps"
      ],
      "experiment_setup": {
        "agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "agx_orin": "Measurement setup: NVIDIA AI IoT vLLM 0.14.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "orin_nano": "Measurement setup: NVIDIA AI IoT vLLM 0.14.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs."
      }
    },
    "Qwen3.5": {
      "data_file": "data/Qwen3.5.csv",
      "table_group_by": [
        "res",
        "fps"
      ],
      "experiment_setup": {
        "agx_thor": "Measurement setup: NVIDIA AI IoT vLLM 0.16.0 arm64, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "agx_orin": "Measurement setup: NVIDIA AI IoT vLLM 0.16.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "orin_nano": "Measurement setup: NVIDIA AI IoT vLLM 0.16.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs."
      }
    }
  }
}