Spaces:
Running
Running
CrispStrobe committed on
Commit ·
062b038
1
Parent(s): b090cc8
feat: enhance MTEB enrichment with cross-revision aggregation and manual fallbacks for latest models
Browse files
- data/benchmarks.json +47 -8
- scripts/fetch-benchmarks.js +32 -5
- scripts/fetch-providers.js +3 -0
data/benchmarks.json
CHANGED
|
@@ -72946,16 +72946,28 @@
|
|
| 72946 |
{
|
| 72947 |
"hf_id": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
| 72948 |
"name": "paraphrase-multilingual-mpnet-base-v2",
|
| 72949 |
-
"mteb_avg":
|
|
|
|
| 72950 |
"sources": {
|
| 72951 |
-
"mteb_avg": "mteb"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72952 |
}
|
| 72953 |
},
|
| 72954 |
{
|
| 72955 |
"hf_id": "BAAI/bge-m3",
|
| 72956 |
"name": "bge-m3",
|
| 72957 |
-
"mteb_avg":
|
| 72958 |
-
"mteb_retrieval":
|
| 72959 |
"sources": {
|
| 72960 |
"mteb_avg": "mteb",
|
| 72961 |
"mteb_retrieval": "mteb"
|
|
@@ -72964,8 +72976,8 @@
|
|
| 72964 |
{
|
| 72965 |
"hf_id": "sentence-transformers/all-MiniLM-L12-v2",
|
| 72966 |
"name": "all-MiniLM-L12-v2",
|
| 72967 |
-
"mteb_avg":
|
| 72968 |
-
"mteb_retrieval":
|
| 72969 |
"sources": {
|
| 72970 |
"mteb_avg": "mteb",
|
| 72971 |
"mteb_retrieval": "mteb"
|
|
@@ -72974,11 +72986,38 @@
|
|
| 72974 |
{
|
| 72975 |
"hf_id": "intfloat/e5-mistral-7b-instruct",
|
| 72976 |
"name": "e5-mistral-7b-instruct",
|
| 72977 |
-
"mteb_avg":
|
| 72978 |
-
"mteb_retrieval":
|
| 72979 |
"sources": {
|
| 72980 |
"mteb_avg": "mteb",
|
| 72981 |
"mteb_retrieval": "mteb"
|
| 72982 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72983 |
}
|
| 72984 |
]
|
|
|
|
| 72946 |
{
|
| 72947 |
"hf_id": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
| 72948 |
"name": "paraphrase-multilingual-mpnet-base-v2",
|
| 72949 |
+
"mteb_avg": 146.35,
|
| 72950 |
+
"mteb_retrieval": 35.31,
|
| 72951 |
"sources": {
|
| 72952 |
+
"mteb_avg": "mteb",
|
| 72953 |
+
"mteb_retrieval": "mteb"
|
| 72954 |
+
}
|
| 72955 |
+
},
|
| 72956 |
+
{
|
| 72957 |
+
"hf_id": "BAAI/bge-large-en-v1.5",
|
| 72958 |
+
"name": "bge-large-en-v1.5",
|
| 72959 |
+
"mteb_avg": 46.8,
|
| 72960 |
+
"mteb_retrieval": 38.8,
|
| 72961 |
+
"sources": {
|
| 72962 |
+
"mteb_avg": "mteb",
|
| 72963 |
+
"mteb_retrieval": "mteb"
|
| 72964 |
}
|
| 72965 |
},
|
| 72966 |
{
|
| 72967 |
"hf_id": "BAAI/bge-m3",
|
| 72968 |
"name": "bge-m3",
|
| 72969 |
+
"mteb_avg": 69.8,
|
| 72970 |
+
"mteb_retrieval": 47.34,
|
| 72971 |
"sources": {
|
| 72972 |
"mteb_avg": "mteb",
|
| 72973 |
"mteb_retrieval": "mteb"
|
|
|
|
| 72976 |
{
|
| 72977 |
"hf_id": "sentence-transformers/all-MiniLM-L12-v2",
|
| 72978 |
"name": "all-MiniLM-L12-v2",
|
| 72979 |
+
"mteb_avg": 47.65,
|
| 72980 |
+
"mteb_retrieval": 29.72,
|
| 72981 |
"sources": {
|
| 72982 |
"mteb_avg": "mteb",
|
| 72983 |
"mteb_retrieval": "mteb"
|
|
|
|
| 72986 |
{
|
| 72987 |
"hf_id": "intfloat/e5-mistral-7b-instruct",
|
| 72988 |
"name": "e5-mistral-7b-instruct",
|
| 72989 |
+
"mteb_avg": 62.08,
|
| 72990 |
+
"mteb_retrieval": 55.06,
|
| 72991 |
"sources": {
|
| 72992 |
"mteb_avg": "mteb",
|
| 72993 |
"mteb_retrieval": "mteb"
|
| 72994 |
}
|
| 72995 |
+
},
|
| 72996 |
+
{
|
| 72997 |
+
"hf_id": "BAAI/bge-multilingual-gemma2",
|
| 72998 |
+
"mteb_avg": 70.3,
|
| 72999 |
+
"mteb_retrieval": 67.5,
|
| 73000 |
+
"sources": {
|
| 73001 |
+
"mteb_avg": "manual",
|
| 73002 |
+
"mteb_retrieval": "manual"
|
| 73003 |
+
}
|
| 73004 |
+
},
|
| 73005 |
+
{
|
| 73006 |
+
"hf_id": "Qwen/Qwen3-Embedding-8B",
|
| 73007 |
+
"mteb_avg": 71.2,
|
| 73008 |
+
"mteb_retrieval": 72.1,
|
| 73009 |
+
"sources": {
|
| 73010 |
+
"mteb_avg": "manual",
|
| 73011 |
+
"mteb_retrieval": "manual"
|
| 73012 |
+
}
|
| 73013 |
+
},
|
| 73014 |
+
{
|
| 73015 |
+
"hf_id": "BAAI/bge-en-icl",
|
| 73016 |
+
"mteb_avg": 64.9,
|
| 73017 |
+
"mteb_retrieval": 58.2,
|
| 73018 |
+
"sources": {
|
| 73019 |
+
"mteb_avg": "manual",
|
| 73020 |
+
"mteb_retrieval": "manual"
|
| 73021 |
+
}
|
| 73022 |
}
|
| 73023 |
]
|
scripts/fetch-benchmarks.js
CHANGED
|
@@ -512,8 +512,18 @@ async function fetchMTEB() {
|
|
| 512 |
if (!resultPaths) continue;
|
| 513 |
|
| 514 |
const revisions = [...new Set(resultPaths.map(p => p.split('/')[2]))];
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
|
|
|
|
| 517 |
process.stdout.write(` MTEB: ${hfId} (${latestPaths.length} tasks)\r`);
|
| 518 |
|
| 519 |
let total = 0, count = 0, retTotal = 0, retCount = 0;
|
|
@@ -526,16 +536,22 @@ async function fetchMTEB() {
|
|
| 526 |
const data = scores.test || scores.dev || scores.validation;
|
| 527 |
if (!data) return;
|
| 528 |
const arr = Array.isArray(data) ? data : [data];
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
if (typeof s === 'number' && s > 0) {
|
| 533 |
const norm = s <= 1.0 ? s * 100 : s;
|
| 534 |
total += norm; count++;
|
| 535 |
const task = res.mteb_dataset_name || res.task_name || '';
|
| 536 |
if (task.includes('Retrieval') || task.includes('Search')) { retTotal += norm; retCount++; }
|
| 537 |
}
|
| 538 |
-
}
|
| 539 |
});
|
| 540 |
}
|
| 541 |
if (count > 0) {
|
|
@@ -554,6 +570,17 @@ async function fetchMTEB() {
|
|
| 554 |
|
| 555 |
function mergeMTEB(entries, mtebEntries) {
|
| 556 |
const map = new Map(mtebEntries.map(m => [m.hf_id.toLowerCase(), m]));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
let matched = 0;
|
| 558 |
for (const e of entries) {
|
| 559 |
const m = e.hf_id ? map.get(e.hf_id.toLowerCase()) : null;
|
|
|
|
| 512 |
if (!resultPaths) continue;
|
| 513 |
|
| 514 |
const revisions = [...new Set(resultPaths.map(p => p.split('/')[2]))];
|
| 515 |
+
// Aggregation: we'll take all unique tasks across all revisions,
|
| 516 |
+
// prioritizing the latest revision for each task.
|
| 517 |
+
const taskPaths = new Map();
|
| 518 |
+
revisions.forEach(rev => {
|
| 519 |
+
const pathsInRev = resultPaths.filter(p => p.includes(`/${rev}/`));
|
| 520 |
+
pathsInRev.forEach(p => {
|
| 521 |
+
const taskName = p.split('/').pop().replace('.json', '');
|
| 522 |
+
taskPaths.set(taskName, p);
|
| 523 |
+
});
|
| 524 |
+
});
|
| 525 |
|
| 526 |
+
const latestPaths = [...taskPaths.values()];
|
| 527 |
process.stdout.write(` MTEB: ${hfId} (${latestPaths.length} tasks)\r`);
|
| 528 |
|
| 529 |
let total = 0, count = 0, retTotal = 0, retCount = 0;
|
|
|
|
| 536 |
const data = scores.test || scores.dev || scores.validation;
|
| 537 |
if (!data) return;
|
| 538 |
const arr = Array.isArray(data) ? data : [data];
|
| 539 |
+
|
| 540 |
+
// Find English or default subset
|
| 541 |
+
let targetRes = arr.find(r => r.languages && r.languages.some(l => l.startsWith('eng') || l === 'en'));
|
| 542 |
+
if (!targetRes && arr.length === 1) targetRes = arr[0];
|
| 543 |
+
if (!targetRes) targetRes = arr.find(r => r.hf_subset === 'default');
|
| 544 |
+
if (!targetRes && arr.length > 0) targetRes = arr[0];
|
| 545 |
+
|
| 546 |
+
if (targetRes) {
|
| 547 |
+
const s = targetRes.main_score || targetRes.ndcg_at_10 || targetRes.accuracy;
|
| 548 |
if (typeof s === 'number' && s > 0) {
|
| 549 |
const norm = s <= 1.0 ? s * 100 : s;
|
| 550 |
total += norm; count++;
|
| 551 |
const task = res.mteb_dataset_name || res.task_name || '';
|
| 552 |
if (task.includes('Retrieval') || task.includes('Search')) { retTotal += norm; retCount++; }
|
| 553 |
}
|
| 554 |
+
}
|
| 555 |
});
|
| 556 |
}
|
| 557 |
if (count > 0) {
|
|
|
|
| 570 |
|
| 571 |
function mergeMTEB(entries, mtebEntries) {
|
| 572 |
const map = new Map(mtebEntries.map(m => [m.hf_id.toLowerCase(), m]));
|
| 573 |
+
|
| 574 |
+
// Manual overrides for famous models not yet in the results repo or needing fixed values
|
| 575 |
+
const overrides = [
|
| 576 |
+
{ hf_id: 'BAAI/bge-multilingual-gemma2', mteb_avg: 70.3, mteb_retrieval: 67.5, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
|
| 577 |
+
{ hf_id: 'Qwen/Qwen3-Embedding-8B', mteb_avg: 71.2, mteb_retrieval: 72.1, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
|
| 578 |
+
{ hf_id: 'BAAI/bge-en-icl', mteb_avg: 64.9, mteb_retrieval: 58.2, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
|
| 579 |
+
];
|
| 580 |
+
overrides.forEach(o => {
|
| 581 |
+
if (!map.has(o.hf_id.toLowerCase())) map.set(o.hf_id.toLowerCase(), o);
|
| 582 |
+
});
|
| 583 |
+
|
| 584 |
let matched = 0;
|
| 585 |
for (const e of entries) {
|
| 586 |
const m = e.hf_id ? map.get(e.hf_id.toLowerCase()) : null;
|
scripts/fetch-providers.js
CHANGED
|
@@ -219,6 +219,9 @@ const MANUAL_HF_ID_MAP = {
|
|
| 219 |
'mistral embed': 'mistralai/mistral-embed',
|
| 220 |
'codestral embed': 'mistralai/mistral-embed',
|
| 221 |
'e5 mistral 7b instruct': 'intfloat/e5-mistral-7b-instruct',
|
|
|
|
|
|
|
|
|
|
| 222 |
};
|
| 223 |
|
| 224 |
const MANUAL_OLLAMA_ID_MAP = {
|
|
|
|
| 219 |
'mistral embed': 'mistralai/mistral-embed',
|
| 220 |
'codestral embed': 'mistralai/mistral-embed',
|
| 221 |
'e5 mistral 7b instruct': 'intfloat/e5-mistral-7b-instruct',
|
| 222 |
+
'qwen3-embedding-8b': 'Qwen/Qwen3-Embedding-8B',
|
| 223 |
+
'bge-multilingual-gemma2': 'BAAI/bge-multilingual-gemma2',
|
| 224 |
+
'bge-en-icl': 'BAAI/bge-en-icl',
|
| 225 |
};
|
| 226 |
|
| 227 |
const MANUAL_OLLAMA_ID_MAP = {
|