| import json |
| import random |
| from pathlib import Path |
|
|
# Seed the module-level RNG so repeated runs emit an identical dataset.
random.seed(0)
|
|
| |
# Known release versions per package. The conflict rules in
# make_requirements() reference specific major-version combinations from
# this table (torch 1.x / pytorch-lightning 2.x, tensorflow 1.x / keras 3.x,
# fastapi 0.78 / pydantic 2.x), so edit those entries with care.
PKG_VERSIONS = {
    "numpy": ["1.21.0", "1.22.0", "1.23.5"],
    "pandas": ["1.3.5", "1.4.4", "2.0.3"],
    "scipy": ["1.7.3", "1.8.1", "1.10.0"],
    "scikit-learn": ["0.24.2", "1.0.2", "1.2.2"],
    "torch": ["1.8.0", "1.13.1", "2.1.0"],
    "torchvision": ["0.9.0", "0.14.1", "0.16.0"],
    "torchaudio": ["0.8.0", "0.13.1", "2.1.0"],
    "pytorch-lightning": ["1.5.0", "2.0.0", "2.2.0"],
    "tensorflow": ["1.15.0", "2.9.0", "2.15.0"],
    "keras": ["2.4.0", "2.9.0", "3.0.0"],
    "jax": ["0.3.25", "0.4.13"],
    "flax": ["0.5.1", "0.7.2"],
    "fastapi": ["0.78.0", "0.99.0"],
    "uvicorn[standard]": ["0.17.6", "0.23.2"],
    "starlette": ["0.19.1", "0.27.0"],
    "pydantic": ["1.10.13", "2.3.0"],
    "sqlalchemy": ["1.4.46", "2.0.20"],
    "alembic": ["1.7.7", "1.12.0"],
    "psycopg2-binary": ["2.9.3"],
    "requests": ["2.27.1", "2.31.0"],
    "httpx": ["0.23.0", "0.25.1"],
    "beautifulsoup4": ["4.10.0", "4.12.2"],
    "scrapy": ["2.5.1", "2.9.0"],
    "opencv-python": ["4.5.5.64", "4.8.0.76"],
    "pillow": ["9.0.1", "10.0.0"],
    "matplotlib": ["3.5.1", "3.7.2"],
    "seaborn": ["0.11.2", "0.13.0"],
    "plotly": ["5.6.0", "5.17.0"],
    "langchain": ["0.0.350", "0.1.0"],
    "openai": ["0.28.0", "1.6.0"],
    "tiktoken": ["0.5.1"],
    "chromadb": ["0.4.8", "0.4.23"],
    "weaviate-client": ["3.21.0"],
    "redis": ["4.3.4", "5.0.1"],
    "celery": ["5.2.7", "5.3.4"],
    "gunicorn": ["20.1.0"],
    "uvloop": ["0.17.0"],
}
|
|
# Iterating a dict yields its keys, so the explicit .keys() call is redundant.
PKG_NAMES = list(PKG_VERSIONS)
|
|
|
|
def make_requirements(num_lines: int, force_conflict: bool = False):
    """
    Create one synthetic requirements.txt-style environment.

    Samples ``num_lines`` distinct packages from ``PKG_NAMES``, pins most of
    them to a random known version (about 20% of lines are deliberately left
    unpinned), then optionally injects one of several hard-coded bad version
    combinations so the sample can be labelled.

    Parameters
    ----------
    num_lines : int
        How many distinct packages to include. Clamped to the number of
        known packages so ``random.sample`` never raises ``ValueError``.
    force_conflict : bool
        When True, the returned sample is guaranteed to be labelled
        "invalid": either one of the known conflict pairs is forced, or a
        contradictory duplicate pin is appended.

    Returns
    -------
    tuple
        ``(requirements_text, label, conflict_reason)`` where ``label`` is
        ``"valid"`` or ``"invalid"`` and ``conflict_reason`` is ``None`` for
        valid samples.
    """
    chosen = random.sample(PKG_NAMES, min(num_lines, len(PKG_NAMES)))
    req_lines = []
    pinned_versions = {}

    for pkg in chosen:
        ver = random.choice(PKG_VERSIONS[pkg])
        pinned_versions[pkg] = ver
        # ~20% of lines are emitted unpinned to vary the formatting.
        if random.random() < 0.2:
            req_lines.append(pkg)
        else:
            req_lines.append(f"{pkg}=={ver}")

    label = "valid"
    conflict_reason = None

    def _force_pin(pkg, version):
        """Re-pin exactly `pkg` (full-name match) to `version` in req_lines.

        BUGFIX: the previous code matched lines with ``startswith(pkg)``, so
        forcing a "torch" pin also clobbered torchvision/torchaudio lines
        (replacing them with ``torch==1.8.0``), silently corrupting samples.
        Comparing the full name left of "==" avoids prefix collisions.
        """
        for i, line in enumerate(req_lines):
            if line.split("==", 1)[0] == pkg:
                req_lines[i] = f"{pkg}=={version}"
        # Keep the bookkeeping dict consistent with the emitted lines.
        pinned_versions[pkg] = version

    # Conflict rule 1: torch 1.x together with pytorch-lightning 2.x.
    if "torch" in pinned_versions and "pytorch-lightning" in pinned_versions:
        tver = pinned_versions["torch"]
        plver = pinned_versions["pytorch-lightning"]
        if force_conflict or (random.random() < 0.5 and tver.startswith("1.") and plver.startswith("2.")):
            _force_pin("torch", "1.8.0")
            _force_pin("pytorch-lightning", "2.2.0")
            label = "invalid"
            conflict_reason = "pytorch-lightning>=2.0 is assumed to require torch>=2.0 but torch==1.8.0 is pinned."

    # Conflict rule 2: tensorflow 1.x together with keras 3.x.
    if label == "valid" and "tensorflow" in pinned_versions and "keras" in pinned_versions:
        tver = pinned_versions["tensorflow"]
        kver = pinned_versions["keras"]
        if force_conflict or (random.random() < 0.5 and tver.startswith("1.") and kver.startswith("3.")):
            _force_pin("tensorflow", "1.15.0")
            _force_pin("keras", "3.0.0")
            label = "invalid"
            conflict_reason = "keras==3.0.0 is assumed to require TensorFlow 2.x but tensorflow==1.15.0 is pinned."

    # Conflict rule 3: pre-0.79 fastapi together with pydantic v2.
    if label == "valid" and "fastapi" in pinned_versions and "pydantic" in pinned_versions:
        fver = pinned_versions["fastapi"]
        pver = pinned_versions["pydantic"]
        if force_conflict or (random.random() < 0.5 and fver.startswith("0.78") and pver.startswith("2.")):
            _force_pin("fastapi", "0.78.0")
            _force_pin("pydantic", "2.3.0")
            label = "invalid"
            conflict_reason = "fastapi==0.78.0 is assumed to require pydantic v1, but pydantic==2.3.0 is pinned."

    # Fallback when a conflict was requested but no rule fired: append a
    # contradictory duplicate pin for one of the chosen packages.
    if label == "valid" and force_conflict:
        # BUGFIX: prefer a package with more than one known version; the
        # previous code always took chosen[0] and could duplicate the *same*
        # version (not actually a conflict) while still labelling "invalid".
        pkg = next((p for p in chosen if len(PKG_VERSIONS[p]) > 1), chosen[0])
        existing_ver = pinned_versions[pkg]
        alt_candidates = [v for v in PKG_VERSIONS[pkg] if v != existing_ver]
        alt_ver = random.choice(alt_candidates) if alt_candidates else existing_ver
        # Ensure the first occurrence is actually pinned (it may have been
        # emitted unpinned), so the appended line truly contradicts it.
        _force_pin(pkg, existing_ver)
        req_lines.append(f"{pkg}=={alt_ver}")
        label = "invalid"
        conflict_reason = f"{pkg} is pinned to multiple incompatible versions."

    return "\n".join(req_lines), label, conflict_reason
|
|
|
|
def generate_dataset(n_samples: int = 100):
    """Build a list of labelled synthetic requirements samples.

    Each sample dict carries a 1-based ``id``, the raw requirements text,
    a ``label`` ("valid"/"invalid"), and an optional ``conflict_reason``.
    Every second sample (odd 0-based index) forces a conflict, so roughly
    half of the dataset is labelled "invalid".
    """
    dataset = []
    for idx in range(n_samples):
        line_count = random.randint(4, 10)
        text, verdict, why = make_requirements(
            line_count, force_conflict=(idx % 2 == 1)
        )
        record = {
            "id": idx + 1,
            "requirements": text,
            "label": verdict,
            "conflict_reason": why,
        }
        dataset.append(record)
    return dataset
|
|
|
|
if __name__ == "__main__":
    dataset = generate_dataset(n_samples=120)

    # Dump the whole dataset as one JSON file next to the script.
    json_path = Path("synthetic_requirements_dataset.json")
    json_path.write_text(json.dumps(dataset, indent=2))
    print(f"Wrote {len(dataset)} samples to {json_path.resolve()}")

    # Also materialize every sample as an individual requirements file,
    # with the label embedded in the filename for easy eyeballing.
    txt_dir = Path("synthetic_requirements_txt")
    txt_dir.mkdir(exist_ok=True)
    for record in dataset:
        target = txt_dir / f"requirements_{record['id']:03d}_{record['label']}.txt"
        target.write_text(record["requirements"])
|