LLM_Dataset_Inference / task_template.py

Upload task_template.py

4bdf9a2 verified 4 days ago

9.65 kB

	import torch
	import pandas as pd
	import requests
	import sys
	import torchvision.models as models
	import os

	from transformers import AutoTokenizer, PreTrainedModel, AutoModelForSequenceClassification

	import utils

	# --------------------------------
	# DATASET
	# --------------------------------

	"""
	Dataset contents:

	- 1000 subsets of text data, each subset stored under the key "subset_{i}" where i ranges from 0 to 999.
	Each subset is a dictionary with:
	-"prompts": List of 100 prompts in the subset
	-"labels": Tensor of true labels for the prompts in the subset, has shape (100)
	-"subset_id": Integer ID of the subset (from 0 to 999)
	"""

	# Load the dataset: a dict whose keys are "subset_0" ... "subset_999".
	# NOTE(review): torch.load relies on pickle by default -- only load dataset
	# files obtained from a trusted source.
	dataset = torch.load("datasets/fulltuning.pt")

	# Example: accessing a single subset by its key
	subset_0 = dataset["subset_0"]

	# Quick sanity check of the subset's fields ("prompts", "labels", "subset_id").
	print("Subset 0 keys:", subset_0.keys())
	print("Subset ID:", subset_0["subset_id"])
	print("Labels length:", len(subset_0["labels"]))
	print("First prompts:", subset_0["prompts"][:5])
	print("First 5 labels:", subset_0["labels"][:5])

	# --------------------------------
	# QUERYING THE CLASSIFIER
	# --------------------------------

	# This code can be used to load and query the fully fine-tuned models. You also need the provided utils.py file.

	#|---------------------------------------------------------------------------------------------------|
	#| NOTE: "Missing or unexpected params" warnings are no reason for concern. They stem from the       |
	#| fact that the model is first loaded without a classifier head, which is added afterwards.         |
	#|---------------------------------------------------------------------------------------------------|

	# Select the tokenizer that matches the model family you are querying.
	# Loading both checkpoints one after the other would leave only the last one
	# bound to `tokenizer`, regardless of which model is used afterwards.
	MODEL_TYPE = "olmo"  # "olmo" or "pythia"

	TOKENIZER_NAMES = {
	    "olmo": "allenai/OLMo-1B-hf",
	    "pythia": "EleutherAI/pythia-410m",
	}

	# An unknown MODEL_TYPE fails here with a KeyError instead of silently
	# falling back to a mismatching tokenizer.
	tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAMES[MODEL_TYPE], trust_remote_code=True)

	tokenizer.padding_side = "left"

	# Fall back to the EOS token when the checkpoint defines no padding token.
	if tokenizer.pad_token is None:
	    tokenizer.pad_token = tokenizer.eos_token


	def _query_model(model, prompt, forward=None):
	    """Tokenize ``prompt`` and run a single forward pass without gradients.

	    Args:
	        model: Classifier returned by one of the ``utils`` loaders; its
	            ``device`` attribute decides where the inputs are moved.
	        prompt: Text to classify.
	        forward: Optional callable invoked as ``forward(model, **inputs)``.
	            When omitted, the model itself is called with ``**inputs``.

	    Returns:
	        Tuple ``(inputs, outputs)``: the tokenized inputs (already moved to
	        ``model.device``) and whatever the forward call returned.
	    """
	    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
	    inputs = {k: v.to(model.device) for k, v in inputs.items()}
	    with torch.no_grad():
	        if forward is None:
	            outputs = model(**inputs)
	        else:
	            outputs = forward(model, **inputs)
	    return inputs, outputs


	def _print_logits(logits):
	    """Print the shape and the values of ``logits``."""
	    print(f"Logits shape: {logits.shape}")
	    print(f"Logits: {logits}")


	# Usage example (fulltuning):

	model_path = "models/olmo-fulltuning"
	model = utils.get_fulltuning_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
	example_prompt = "I think, therefore I am.\n\nI am."

	inputs, outputs = _query_model(model, example_prompt)
	logits = outputs.logits
	_print_logits(logits)


	# Usage example (softprompt):

	model_path = "models/olmo-softprompt"
	model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
	example_prompt = "I think, but do I exist?\n\nSince you think, you exist."

	inputs, outputs = _query_model(model, example_prompt)
	logits = outputs.logits
	_print_logits(logits)


	# Usage example (lora):

	model_path = "models/olmo-lora"
	model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
	example_prompt = "Who am I?\n\nWhat am I?"

	inputs, outputs = _query_model(model, example_prompt)
	logits = outputs.logits
	_print_logits(logits)


	# Usage example (lastlayer):

	model_path = "models/olmo-lastlayer"
	model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
	example_prompt = "I love to exist!"

	inputs, outputs = _query_model(model, example_prompt)
	logits = outputs.logits
	_print_logits(logits)


	# Usage example (prefix):
	# Unlike the other variants, the prefix model is not called directly; the
	# forward pass goes through utils.forward_peft_seqcls.

	model_path = "models/olmo-prefix"
	model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
	example_prompt = "I will exist yesterday."

	inputs, outputs = _query_model(model, example_prompt, forward=utils.forward_peft_seqcls)
	logits = outputs.logits
	_print_logits(logits)

	# --------------------------------
	# SUBMISSION FORMAT
	# --------------------------------

	"""
	The submission must be a .csv file with the following format:

	-"type": Name of the model (e.g., "softprompt", "fulltuning", etc.)
	-"subset_id": ID of the subset (from 0 to 999, per type)
	-"membership": Membership score for each subset (float)
	"""

	# Example Submission:
	# Builds a placeholder CSV with random scores, one row per (type, subset_id) pair.

	types = ["softprompt", "fulltuning", "lora", "lastlayer", "prefix"]
	num_subsets = 1000  # subsets per model type, with ids 0..999

	# Rows are grouped by type: all subsets of the first type, then the next, ...
	type_list = [t for t in types for _ in range(num_subsets)]
	subset_ids = list(range(num_subsets)) * len(types)

	# Random scores in [0, 1) -- replace with your real membership scores.
	membership_scores = torch.rand(len(type_list)).tolist()
	submission_df = pd.DataFrame({
	    "type": type_list,
	    "subset_id": subset_ids,
	    "membership": membership_scores,
	})
	# index=False keeps the row index out of the file, so the CSV contains
	# exactly the three required columns.
	submission_df.to_csv("example_submission.csv", index=False)

	# --------------------------------
	# SUBMISSION PROCESS
	# --------------------------------

	"""
	Example submission script for the LLM Dataset Membership Inference Task.

	Submission Requirements (read carefully to avoid automatic rejection):

	1. CSV FORMAT
	----------------
	- The file must be a CSV with extension `.csv`.
	- It must contain exactly three columns, named:
	type, subset_id, membership
	→ Column names must match exactly (lowercase, no extra spaces).
	→ Column order does not matter, but all three must be present.

	2. ROW COUNT AND IDENTIFIERS
	-------------------------------
	- Your file must contain exactly 5000 rows.
	- Each row corresponds to one unique `subset_id`/`type` pair, with ids in the range 0–999 (inclusive).
	- Every subset_id must appear exactly once for each type.
	- Do not add, remove, or rename any IDs.
	- Do not include duplicates or missing entries.
	- The evaluator checks:
	subset_id.min() == 0
	subset_id.max() == 999
	subset_id.unique().size == 1000

	3. MEMBERSHIP SCORES
	----------------------
	- The `membership` column must contain numeric values representing your model’s predicted confidence
	that the corresponding subset is a member of the training set.

	Examples of valid membership values:
	- Probabilities: values in [0.0, 1.0]
	- Raw model scores: any finite numeric values (will be ranked for TPR@FPR=0.05)

	- Do not submit string labels like "yes"/"no" or "member"/"non-member".
	- The evaluator converts your `membership` column to numeric using `pd.to_numeric()`.
	→ Any non-numeric, NaN, or infinite entries will cause automatic rejection.

	4. TECHNICAL LIMITS
	----------------------
	- Maximum file size: 20 MB
	- Encoding: UTF-8 recommended.
	- Avoid extra columns, blank lines, or formulas.
	- Ensure all values are numeric and finite.
	- Supported data types: int, float (e.g., float32, float64)

	5. VALIDATION SUMMARY
	------------------------
	Your submission will fail if:
	- Columns don’t match exactly ("type", "subset_id", "membership")
	- Row count differs from 5000
	- Any type name is unexpected or not in the allowed set
	- Any subset_id is missing, duplicated, or outside [0, 999] for any type
	- Any membership value is NaN, Inf, or non-numeric
	- File is too large or not a valid CSV

	Two key metrics are computed:
	1. ROC-AUC (Area Under the ROC Curve) — measures overall discriminative ability.
	2. TPR@FPR=0.05 — true positive rate when the false positive rate is at 5%.

	"""

	# Submission server; the upload below is POSTed to f"{BASE_URL}/submit/{TASK_ID}".
	# Note that this is a plain "http://" address.
	BASE_URL = "http://35.192.205.84:80"
	# Sent to the server in the "X-API-Key" request header.
	API_KEY = "YOUR_API_KEY_HERE" # replace with your actual API key

	# Task identifier used in the submission URL.
	TASK_ID = "14-llm-dataset-inference"
	# Path of the CSV file that gets uploaded.
	FILE_PATH = "Your-Submission-File.csv" # replace with your actual file path

	# Safety switch: the upload code only runs when this is True.
	SUBMIT = False # Set to True to enable submission

	def die(msg):
	    """Print ``msg`` to standard error and terminate with exit status 1.

	    Args:
	        msg: Human-readable description of the fatal error.

	    Raises:
	        SystemExit: Always, with exit code 1.
	    """
	    print(f"{msg}", file=sys.stderr)
	    sys.exit(1)

	# Upload FILE_PATH to the submission server. Nothing happens unless SUBMIT is True.
	if SUBMIT:
	    if not os.path.isfile(FILE_PATH):
	        die(f"File not found: {FILE_PATH}")

	    try:
	        # The file has to stay open while requests streams it as multipart data.
	        with open(FILE_PATH, "rb") as f:
	            files = {
	                # (fieldname) -> (filename, fileobj, content_type)
	                "file": (os.path.basename(FILE_PATH), f, "csv"),
	            }
	            resp = requests.post(
	                f"{BASE_URL}/submit/{TASK_ID}",
	                headers={"X-API-Key": API_KEY},
	                files=files,
	                timeout=(10, 120),  # (connect timeout, read timeout)
	            )

	        # Parse the body before checking the status so that the server's
	        # explanation is available even on non-2xx responses.
	        try:
	            body = resp.json()
	        except ValueError:
	            # Not JSON: keep the raw text so it can still be shown.
	            body = {"raw_text": resp.text}

	        if resp.status_code == 413:
	            die("Upload rejected: file too large (HTTP 413). Reduce size and try again.")

	        # Raises requests.exceptions.HTTPError for any other 4xx/5xx status.
	        resp.raise_for_status()

	        # A JSON body is not necessarily an object; only look up the id on a dict.
	        submission_id = body.get("submission_id") if isinstance(body, dict) else None
	        print("Successfully submitted.")
	        print("Server response:", body)
	        if submission_id:
	            print(f"Submission ID: {submission_id}")

	    except requests.exceptions.RequestException as e:
	        # Failures are reported on stderr, like die(), and end with exit status 1.
	        detail = getattr(e, "response", None)
	        print(f"Submission error: {e}", file=sys.stderr)
	        if detail is not None:
	            try:
	                print("Server response:", detail.json(), file=sys.stderr)
	            except ValueError:
	                print("Server response (text):", detail.text, file=sys.stderr)
	        sys.exit(1)