| | import argparse
|
| | import json
|
| | import os
|
| | import torch
|
| | from pathlib import Path
|
| | from tqdm import tqdm
|
| |
|
| | data_abs_dir = Path(__file__).parent / "data"
|
| |
|
| | from utils.utils import extract_generation_code, languge_settings
|
| | from transformers import AutoTokenizer, AutoModelForCausalLM
|
| | from human_eval.evaluation import evaluate_functional_correctness
|
| |
|
def build_deepseekcoder_instruction(languge: str, question: str):
    """Build the instruction text asking the model to complete a function.

    The fixed instruction sentence is followed by a fenced code block. The
    fence is tagged with the lower-cased ``languge`` and contains ``question``
    with its leading and trailing whitespace removed.

    Args:
        languge: Language name used as the code-fence tag (lower-cased here).
        question: The code to be completed.

    Returns:
        The full instruction string, without leading or trailing whitespace
        around the instruction sentence or after the closing fence.
    """
    fence_tag = languge.lower()
    given_code = question.strip()
    instruction = (
        "Please continue to complete the function. "
        "You are not allowed to modify the given code and do the completion only. "
        "Please return all completed function in a codeblock. "
        "Here is the given code to do completion:"
    )
    return f"{instruction}\n```{fence_tag}\n{given_code}\n```"
|
| |
|
def generate_one(example, lang, tokenizer, model):
    """Generate a completion for one problem and post-process it.

    The problem's ``'prompt'`` is wrapped in the DeepSeek-Coder instruction,
    rendered through the tokenizer's chat template, and passed to
    ``model.generate`` with sampling disabled. The decoded continuation is
    stored in ``example['output']`` (the dict is modified in place) and the
    example is then handed to ``extract_generation_code``.

    Args:
        example: Problem record; must contain a ``'prompt'`` entry.
        lang: Key into ``languge_settings``; also forwarded as ``lang_code``.
        tokenizer: Tokenizer providing ``apply_chat_template``,
            ``convert_tokens_to_ids`` and ``decode``.
        model: Model providing ``device`` and ``generate``.

    Returns:
        Whatever ``extract_generation_code(example, lang_code=lang)`` returns.

    Raises:
        AssertionError: If the ``<|EOT|>`` token does not map to an int id.
    """
    language_name = languge_settings[lang]['full_name']
    instruction = build_deepseekcoder_instruction(language_name, example['prompt'])

    chat = [{'role': 'user', 'content': instruction}]
    input_ids = tokenizer.apply_chat_template(
        chat,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    # The end-of-turn token doubles as both the stop token and the pad token.
    eot_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(eot_id, int), "Invalid tokenizer, EOT id not found"

    generation_kwargs = dict(
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=eot_id,
        eos_token_id=eot_id,
    )
    generated = model.generate(input_ids, **generation_kwargs)

    # Drop the echoed prompt tokens so only the newly generated part is decoded.
    prompt_length = len(input_ids[0])
    completion_ids = generated[0][prompt_length:]
    example['output'] = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return extract_generation_code(example, lang_code=lang)
|
| |
|
def generate_main(args):
    """Generate completions for every problem of one language, save and score them.

    Steps, in order: create the temp directory, load the tokenizer and model,
    read ``humaneval-<language>.jsonl`` from ``data_abs_dir``, run
    ``generate_one`` on each problem, write the results as JSON lines to
    ``args.output_path``, then call ``evaluate_functional_correctness`` on that
    file and print the result.

    Args:
        args: Namespace with ``model`` (model name or path), ``language``,
            ``output_path`` (where generations are written) and ``temp_dir``
            (working directory passed to the evaluator).
    """
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    print("model", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print("load tokenizer {} from {} over.".format(tokenizer.__class__, model_name_or_path))
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()

    # Read inside a context manager so the handle is closed deterministically,
    # and decode as UTF-8 explicitly instead of relying on the platform default
    # (the output files of this script are written as UTF-8 as well).
    with open(problem_file, encoding='utf-8') as fr:
        examples = [json.loads(line) for line in fr if line.strip()]
    print("Read {} examples for evaluation over.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc='Generating'):
        gen_example = generate_one(ex, lang, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generate all over!!!")
    with open(saved_path, 'w', encoding='utf-8') as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + '\n')
    print("Save {} processed examples into {} over!".format(len(generated_examples), saved_path))

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result, model_name_or_path)
|
| |
|
def evaluation_only(args):
    """Re-process and score an existing generation file without running a model.

    Reads the JSON-lines file at ``args.output_path``, runs
    ``extract_generation_code`` on every record, writes the processed records
    to a file of the same base name inside ``args.temp_dir``, and evaluates
    that file with ``evaluate_functional_correctness``. The result is printed.

    Args:
        args: Namespace with ``language``, ``output_path`` (existing
            generation file) and ``temp_dir`` (working directory, created if
            missing).

    Raises:
        AssertionError: If ``args.output_path`` does not exist.
    """
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Not fond output file: {}".format(args.output_path)
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    # Close the input deterministically and read it with the same encoding
    # this script uses when writing generation files.
    with open(args.output_path, encoding='utf-8') as fr:
        output_examples = [json.loads(line) for line in fr if line.strip()]

    processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, 'w', encoding='utf-8') as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + '\n')
    print("Save {} processed examples into {} over!".format(len(processed_examples), processed_path))

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    # evaluate_functional_correctness is already imported at module level.
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result)
|
| |
|
if __name__ == '__main__':
    # Command-line entry point: parse the options and run generation + scoring.
    parser = argparse.ArgumentParser()
    # These three options are used unconditionally by generate_main, so they
    # are declared required: a missing one is reported immediately by argparse
    # instead of surfacing as an obscure failure later (for --output_path,
    # only after all generations have been produced).
    parser.add_argument('--model', type=str, required=True, help="model name or path")
    parser.add_argument('--output_path', type=str, required=True, help="output path of your generation")
    parser.add_argument('--language', type=str, required=True, help="langauge")
    parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
    args = parser.parse_args()

    # Set before any tokenizer work starts.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
|
| |
|