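"""Evaluation harness for a math-reasoning LLM served with LMDeploy (TurboMind).

For each problem in the eval CSV, the script samples several reasoning chains,
extracts \\boxed{...} answers, majority-votes them modulo 1000, and finally
scores the predictions against the reference answers.
"""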
def main(args):
    MODEL_NAME = args.model

    EVAL_FILE = args.file
    print(f"Using evaluation file: {EVAL_FILE}")

    import os
    import time

    run_start_time = time.time()

    os.makedirs("tmp", exist_ok=True)
    os.makedirs("evals_res", exist_ok=True)

    # Self-describing filename stem that encodes the run's key parameters.
    EVAL_FILE_BASENAME = os.path.basename(EVAL_FILE)
    MODEL_NAME_STR = MODEL_NAME.replace("/", "+")
    SAVED_EVAL_FILE = (
        f"{run_start_time}_{MODEL_NAME_STR}_{EVAL_FILE_BASENAME}"
        f"_seq{args.num_seqs}_tok{args.tokens}_q{args.quant_policy}"
        f"_tpp{args.top_p}_mnp{args.min_p}_tpk{args.top_k}"
    )
    # Environment variables must be set before torch / lmdeploy are imported.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

    import re
    import random
    import warnings
    from collections import Counter

    import numpy as np
    import pandas as pd
    import polars as pl

    import torch
    import lmdeploy
    from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
    from transformers import AutoTokenizer

    warnings.simplefilter("ignore")
    print("PyTorch version:", torch.__version__)
    print("LMDeploy version:", lmdeploy.__version__)
    def seed_everything(seed):
        os.environ["PYTHONHASHSEED"] = str(seed)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        # benchmark=True and deterministic=True conflict; favor determinism here.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    seed_everything(seed=0)
    llm_model_pth = MODEL_NAME

    MAX_NUM_SEQS = args.num_seqs
    MAX_MODEL_LEN = 1024 * 12
    EVAL = True
    EVAL_SELECTED_QUESTIONS_ONLY = False

    engine_config = TurbomindEngineConfig(
        # quant_policy selects KV-cache quantization: 0 = off, 4 = 4-bit, 8 = 8-bit.
        quant_policy=args.quant_policy,
        cache_max_entry_count=0.95,
        session_len=MAX_MODEL_LEN,
        enable_prefix_caching=True,
        max_batch_size=MAX_NUM_SEQS,
    )

    pipe = pipeline(llm_model_pth, backend_config=engine_config)

    tokenizer = AutoTokenizer.from_pretrained(llm_model_pth, trust_remote_code=False)
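    # Note: with enable_prefix_caching, TurboMind can reuse the KV cache for
    # the shared prompt prefix across the num_seqs identical requests per question.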
    def extract_boxed_text(text):
        # "oxed{" matches both "\boxed{...}" and "boxed{...}" in model output.
        pattern = r"oxed{(.*?)}"
        matches = re.findall(pattern, text)
        if not matches:
            return ""
        # Prefer the last non-empty boxed answer in the transcript.
        for match in matches[::-1]:
            if match != "":
                return match
        return ""
    def batch_message_filter(list_of_messages) -> tuple[list[list[dict]], list[str]]:
        # Split conversations into those that produced a boxed answer
        # (collected) and those that still need more generation (kept).
        extracted_answers = []
        list_of_messages_to_keep = []
        for messages in list_of_messages:
            answer = extract_boxed_text(messages[-1]["content"])
            if answer:
                extracted_answers.append(answer)
            else:
                list_of_messages_to_keep.append(messages)
        return list_of_messages_to_keep, extracted_answers
    def select_answer(answers):
        counter = Counter()
        for answer in answers:
            try:
                # Only count integer-valued answers; the tiny random term
                # breaks ties between equally frequent values.
                if int(answer) == float(answer):
                    counter[int(answer)] += 1 + random.random() / 1_000
            except (ValueError, OverflowError):
                pass
        if not counter:
            return 210  # fallback guess when nothing was extracted
        _, answer = sorted([(v, k) for k, v in counter.items()], reverse=True)[0]
        return answer % 1000
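    # Example: select_answer(["7", "7", "1007", "x"]) -> 7
    # ("x" fails to parse and is ignored; 7 outvotes 1007).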
    def batch_message_generate(list_of_messages) -> list[list[dict]]:
        max_tokens = args.tokens

        list_of_texts = [
            tokenizer.apply_chat_template(
                conversation=messages, tokenize=False, add_generation_prompt=True
            )
            for messages in list_of_messages
        ]

        # One (identical) config per prompt; lmdeploy accepts a list of configs.
        gen_configs = [
            GenerationConfig(
                do_sample=True,
                temperature=1.0,
                top_k=args.top_k,
                top_p=args.top_p,
                min_p=args.min_p,
                skip_special_tokens=True,
                max_new_tokens=max_tokens,
                stop_words=["</think>"],  # stop once the reasoning block closes
            )
            for _ in list_of_texts
        ]

        request_output = pipe(
            list_of_texts,
            gen_config=gen_configs,
        )
        print(
            [
                single_request_output.generate_token_len
                for single_request_output in request_output
            ]
        )

        sort_keys_and_list_of_messages = []
        for messages, single_request_output in zip(list_of_messages, request_output):
            messages.append(
                {"role": "assistant", "content": single_request_output.text}
            )
            sort_keys_and_list_of_messages.append(
                (single_request_output.generate_token_len, messages)
            )

        # Sort conversations by generated token count (shortest first).
        print([sort_key for sort_key, _ in sort_keys_and_list_of_messages])
        sort_keys_and_list_of_messages.sort(
            key=lambda sort_key_and_messages: sort_key_and_messages[0]
        )
        print([sort_key for sort_key, _ in sort_keys_and_list_of_messages])

        list_of_messages = [messages for _, messages in sort_keys_and_list_of_messages]
        return list_of_messages
    def create_starter_messages(question: str, index: int) -> list[dict]:
        # Only one prompt variant for now; the loop makes it easy to add more
        # and cycle through them by sequence index.
        options = []
        for _ in range(1):
            options.append(
                [
                    {
                        "role": "system",
                        "content": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\boxed{}, after taking modulo 1000.",
                    },
                    {"role": "user", "content": question},
                ]
            )
        return options[index % len(options)]
    def predict_for_question(question: str, question_id=None) -> int:
        # Avoid an eagerly evaluated default: time.time() in the signature
        # would be computed once at definition time, not per call.
        if question_id is None:
            question_id = time.time()

        start_time = time.time()

        if EVAL_SELECTED_QUESTIONS_ONLY and not os.getenv(
            "KAGGLE_IS_COMPETITION_RERUN"
        ):
            # Local debugging shortcut: only run on a few known questions.
            if (
                "Triangle" not in question
                and "delightful" not in question
                and "George" not in question
            ):
                return 210

        # Disabled time-budget guard (cutoff_time is not defined in this script):
        # if time.time() > cutoff_time:
        #     return 210

        print(question)

        num_seqs = MAX_NUM_SEQS

        list_of_messages = [
            create_starter_messages(question, index) for index in range(num_seqs)
        ]

        all_extracted_answers = []
        for _ in range(1):
            list_of_messages = batch_message_generate(list_of_messages)

            if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
                # Persist raw generations for offline inspection.
                df = pd.DataFrame(
                    {
                        "question": [question] * len(list_of_messages),
                        "message": [
                            messages[-1]["content"] for messages in list_of_messages
                        ],
                    }
                )
                df.to_csv(f"tmp/{question_id}_{SAVED_EVAL_FILE}.csv", index=False)

            list_of_messages, extracted_answers = batch_message_filter(
                list_of_messages
            )
            all_extracted_answers.extend(extracted_answers)

        print(all_extracted_answers)
        answer = select_answer(all_extracted_answers)
        print(answer)

        print("\n\n")
        print(f"Time taken: {time.time() - start_time}")
        return answer
    # Running per-question results are appended here during the eval.
    TEMP_CSV = f"tmp/evals_{SAVED_EVAL_FILE}.csv"
    def predict(
        id_: pl.DataFrame, question: pl.DataFrame
    ) -> pl.DataFrame | pd.DataFrame:
        id_ = id_["id"][0]
        print("------")
        print(id_)

        question = question["problem"][0]
        answer = predict_for_question(question, question_id=id_)
        print("------\n\n\n")

        if EVAL and not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
            row = {"id": id_, "question": question, "answer": answer}
            temp_df = pd.DataFrame([row])

            # Append to the running results CSV, writing the header only once.
            if not os.path.exists(TEMP_CSV):
                temp_df.to_csv(TEMP_CSV, index=False)
            else:
                temp_df.to_csv(TEMP_CSV, mode="a", header=False, index=False)

        return pl.DataFrame({"id": id_, "answer": answer})
    # Disabled smoke test (run two sample questions, then return early):
    # predict_for_question(
    #     "Fred and George take part in a tennis tournament with $4046$ other players. In each round, the players are paired into $2024$ matches. How many ways are there to arrange the first round such that Fred and George do not have to play each other? (Two arrangements for the first round are \\textit{different} if there is a player with a different opponent in the two arrangements.)"
    # )
    # predict_for_question(
    #     "Triangle $ABC$ has side length $AB = 120$ and circumradius $R = 100$. Let $D$ be the foot of the perpendicular from $C$ to the line $AB$. What is the greatest possible length of segment $CD$?"
    # )
    # return
    def sample_and_predict(csv_file: str) -> None:
        """
        Read all rows from the given CSV file and, for each row,
        call predict() to process the problem.
        """
        df = pd.read_csv(csv_file)

        # Shuffle rows with a fixed seed for a reproducible evaluation order.
        df = df.sample(frac=1, random_state=2024).reset_index(drop=True)

        for index, row in df.iterrows():
            id_value = row["id"]
            problem_value = row["problem"]

            print(f"Processing row {index}: id = {id_value}, problem = {problem_value}")

            # predict() expects single-row polars DataFrames (Kaggle gateway style).
            id_df = pl.DataFrame({"id": [id_value]})
            problem_df = pl.DataFrame({"problem": [problem_value]})

            result = predict(id_df, problem_df)
            print("Prediction result:")
            print(result)
            print("\n")

    sample_and_predict(EVAL_FILE)
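    # The eval CSV is expected to provide "id" and "problem" columns
    # (plus "answer" for the scoring step below).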
    if (
        EVAL
        and not EVAL_SELECTED_QUESTIONS_ONLY
        and not os.getenv("KAGGLE_IS_COMPETITION_RERUN")
    ):
        reference_input_path = EVAL_FILE
        predictions_path = TEMP_CSV

        reference_df = pd.read_csv(reference_input_path)
        predictions_df = pd.read_csv(predictions_path)

        # Normalize ids so the join is robust to dtype and whitespace differences.
        reference_df["id"] = reference_df["id"].astype(str).str.strip()
        predictions_df["id"] = predictions_df["id"].astype(str).str.strip()

        # Normalize answers for a case-insensitive string comparison.
        reference_df["answer"] = (
            reference_df["answer"].astype(str).str.strip().str.lower()
        )
        predictions_df["answer"] = (
            predictions_df["answer"].astype(str).str.strip().str.lower()
        )

        merged_df = pd.merge(
            reference_df,
            predictions_df,
            on="id",
            how="inner",
            suffixes=("_ref", "_pred"),
        )

        merged_df["is_correct"] = merged_df["answer_ref"] == merged_df["answer_pred"]

        total = len(merged_df)
        correct = merged_df["is_correct"].sum()
        accuracy = correct / total if total else 0.0  # guard against an empty join

        std_outputs = ""
        std_outputs += f"Total predictions compared: {total}\n"
        std_outputs += f"Number of correct predictions: {correct}\n"
        std_outputs += f"Accuracy: {accuracy:.2%}\n"

        incorrect_df = merged_df[~merged_df["is_correct"]]
        if not incorrect_df.empty:
            std_outputs += "\nIncorrect predictions:\n"
            std_outputs += (
                str(incorrect_df[["id", "problem", "answer_ref", "answer_pred"]]) + "\n"
            )
        else:
            std_outputs += "\nAll predictions match the reference!\n"

        time_taken = time.time() - run_start_time
        std_outputs += f"Time taken: {time_taken:.2f} seconds\n"
        print(std_outputs)

        with open(f"evals_res/outputs_{SAVED_EVAL_FILE}.log", "w") as f:
            f.write(std_outputs)

        merged_df.to_csv(f"evals_res/evals_{SAVED_EVAL_FILE}.csv", index=False)

if __name__ == "__main__":
    import argparse
    import time

    start = time.time()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="casperhansen/deepseek-r1-distill-qwen-7b-awq",
        help="Model to use",
    )
    parser.add_argument(
        "--file",
        type=str,
        default="hard_batch_1",
        help="Evaluation CSV file to use",
    )
    parser.add_argument(
        "--num_seqs",
        type=int,
        default=48,
        help="Number of sequences to generate per prompt",
    )
    parser.add_argument(
        "--tokens",
        type=int,
        default=1024 * 12,
        help="Maximum number of new tokens to generate per sequence",
    )
    parser.add_argument(
        "--quant_policy",
        type=int,
        default=8,
        choices=[8, 4, 0],
        help="KV-cache quantization policy (0 = off, 4 = 4-bit, 8 = 8-bit)",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=50,
        help="Top-k sampling cutoff",
    )
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.90,
        help="Top-p (nucleus) sampling cutoff",
    )
    parser.add_argument(
        "--min_p",
        type=float,
        default=0.05,
        help="Min-p sampling cutoff",
    )

    args = parser.parse_args()
    main(args)

    print(f"Time Taken: {time.time() - start}")