| import json |
| from datasets import load_dataset |
| from tqdm import tqdm |
|
|
# Destination file for the combined JSON Lines export.
JSONL_PATH = "/workspace/BiomedEnriched.jsonl"


# Request both splits of the dataset with streaming=True, commercial first.
# The generator is drained immediately by the tuple unpacking, so both
# load_dataset calls happen right here, in the order listed.
commercial, noncommercial = (
    load_dataset("almanach/Biomed-Enriched", split=split_name, streaming=True)
    for split_name in ("commercial", "noncommercial")
)
|
|
def _build_record(split_name, index, row):
    """Return the JSON-serialisable record written for one dataset row.

    Args:
        split_name: Name of the split the row came from; stored under
            "split" and used as the prefix of "key".
        index: Zero-based position of the row within its split.
        row: Mapping-like dataset row. Fields are read with ``.get`` so a
            missing column is exported as ``null`` instead of raising.

    Returns:
        A dict with the same keys, in the same order, for every split.
    """
    return {
        "key": f"{split_name}_{index}",
        "split": split_name,
        # Previously only the commercial records carried "text"; it is now
        # emitted for every split so all lines share one schema.
        "text": row.get("text"),
        "path": row.get("path"),
        "license_url": row.get("license_url"),
        "authors": row.get("authors"),
        "document_type": row.get("document_type"),
        "domain": row.get("domain"),
        "educational_score": row.get("educational_score"),
    }


# Write both splits into a single JSONL file, commercial rows first.
# Keys are "<split>_<index>", with the index restarting at 0 for each split.
with open(JSONL_PATH, "w", encoding="utf-8") as f:
    for split_name, rows in (
        ("commercial", commercial),
        ("noncommercial", noncommercial),
    ):
        for i, row in enumerate(tqdm(rows, desc=split_name)):
            rec = _build_record(split_name, i, row)
            # ensure_ascii=False keeps non-ASCII characters unescaped in the
            # UTF-8 output file.
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
|
|