1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
| from transformers import AutoTokenizer, AutoModelForCausalLM from vllm import LLM, SamplingParams from typing import Callable, List from cs336_alignment.drgrpo_grader import r1_zero_reward_fn import json import os
# Model location handed to `LLM(model=...)` in the script entry point below.
model_path = "/gz-data/models/Qwen2.5-Math-1.5B"
# JSONL dataset read by `load_prompts`; each record is accessed through its
# 'problem' and 'answer' keys.
MATH_DATASET_PATH = "/root/yjx/assignment5-alignment/data/MATH/validation.jsonl"
# Prompt template read by `load_prompts`, which substitutes the literal
# "{question}" placeholder with each problem statement.
PROMPT_TEMPLATE_PATH = "/root/yjx/assignment5-alignment/cs336_alignment/prompts/r1_zero.prompt"
def load_prompts(file_path: str, template_path: str) -> tuple[List[str], List[str]]:
    """Build prompts and collect reference answers from a JSONL dataset.

    Every non-blank line of ``file_path`` is parsed as a JSON object; its
    ``problem`` value replaces the literal ``{question}`` placeholder in the
    template read from ``template_path``, and its ``answer`` value is kept as
    the matching ground truth.

    Args:
        file_path: Path to the JSONL file with ``problem`` and ``answer`` keys.
        template_path: Path to the prompt template text file.

    Returns:
        A ``(prompts, answers)`` pair of equally long lists, in file order.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``problem`` or ``answer`` key.
    """
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()

    prompts: List[str] = []
    answers: List[str] = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue
            data = json.loads(line)
            # str.replace is used rather than str.format so that any other
            # braces in the template or the problem text are left untouched.
            prompts.append(template.replace("{question}", data["problem"]))
            answers.append(data["answer"])
    return prompts, answers
def evaluate_vllm(
    vllm_model: LLM,
    reward_fn: Callable[[str, str], dict[str, float]],
    prompts: List[str],
    groundtruths: List[str],
    eval_sampling_params: SamplingParams,
    save_path: str = "/root/yjx/assignment5-alignment/cs336_alignment/zero_shot_results"
) -> None:
    """Generate completions for ``prompts``, score them, and save the results.

    The first completion of each generation output is scored with
    ``reward_fn(completion, groundtruth)``. Three files are written under
    ``save_path`` (created if missing):

    * ``results.jsonl`` - one record per prompt with the prompt, completion,
      ground truth and the full reward dict.
    * ``format_fails.json`` - up to 10 samples whose ``format_reward`` is 0.
    * ``answer_fails.json`` - up to 10 samples whose ``format_reward`` is 1
      but whose ``answer_reward`` is 0.

    Summary counts and percentages are printed to stdout.

    Args:
        vllm_model: Object whose ``generate(prompts, sampling_params=...)``
            returns one output per prompt, each exposing ``outputs[0].text``.
        reward_fn: Callable returning a dict with at least the
            ``format_reward`` and ``answer_reward`` keys.
        prompts: Prompts to complete.
        groundtruths: Reference answers, aligned with ``prompts``.
        eval_sampling_params: Sampling parameters forwarded to ``generate``.
        save_path: Directory that receives the output files.

    Raises:
        ValueError: If ``prompts`` and ``groundtruths`` differ in length.
    """
    total = len(prompts)
    if total != len(groundtruths):
        # zip() would silently drop the unmatched tail and skew the
        # percentages, so fail before spending time on generation.
        raise ValueError(
            f"prompts ({total}) and groundtruths ({len(groundtruths)}) "
            "must have the same length"
        )

    count_all_correct = 0
    count_format_only = 0
    count_fail = 0

    format_fails = []
    answer_fails = []

    # Nothing to generate for an empty evaluation set.
    outputs = (
        vllm_model.generate(prompts, sampling_params=eval_sampling_params)
        if prompts
        else []
    )

    os.makedirs(save_path, exist_ok=True)

    with open(os.path.join(save_path, "results.jsonl"), "w", encoding="utf-8") as f:
        for prompt, output, groundtruth in zip(prompts, outputs, groundtruths):
            result = output.outputs[0].text
            reward = reward_fn(result, groundtruth)

            # NOTE: a sample with format_reward == 0 and answer_reward == 1
            # falls into none of the three buckets below.
            if reward["format_reward"] == 1 and reward["answer_reward"] == 1:
                count_all_correct += 1
            elif reward["format_reward"] == 1 and reward["answer_reward"] == 0:
                count_format_only += 1
            elif reward["format_reward"] == 0 and reward["answer_reward"] == 0:
                count_fail += 1

            # Keep a small sample (at most 10) of each failure mode for
            # manual inspection.
            if reward["format_reward"] == 0 and len(format_fails) < 10:
                format_fails.append({"res": result, "gt": groundtruth})
            elif (
                reward["format_reward"] == 1
                and reward["answer_reward"] == 0
                and len(answer_fails) < 10
            ):
                answer_fails.append({"res": result, "gt": groundtruth})

            f.write(json.dumps({
                "prompt": prompt,
                "result": result,
                "groundtruth": groundtruth,
                "rewards": reward
            }) + "\n")

    # Avoid ZeroDivisionError when there are no prompts; every ratio is 0 then.
    denominator = total if total else 1
    print(f"Total Prompts: {total}")
    print(f"All Correct: {count_all_correct} ({count_all_correct / denominator:.2%})")
    print(f"Format Only: {count_format_only} ({count_format_only / denominator:.2%})")
    print(f"Fail: {count_fail} ({count_fail / denominator:.2%})")

    with open(os.path.join(save_path, "format_fails.json"), "w", encoding="utf-8") as f:
        json.dump(format_fails, f, indent=4)
    with open(os.path.join(save_path, "answer_fails.json"), "w", encoding="utf-8") as f:
        json.dump(answer_fails, f, indent=4)
if __name__ == "__main__":
    # Zero-shot run: build the prompts, load the model, sample one completion
    # per prompt and score it with the r1-zero reward function.
    eval_prompts, eval_answers = load_prompts(MATH_DATASET_PATH, PROMPT_TEMPLATE_PATH)
    engine = LLM(model=model_path)
    # Generation stops at the closing answer tag, and the tag is kept in the
    # returned text (presumably so the reward function can check the format).
    decoding = SamplingParams(
        temperature=1,
        top_p=1,
        max_tokens=1024,
        stop=["</answer>"],
        include_stop_str_in_output=True,
    )
    evaluate_vllm(
        vllm_model=engine,
        reward_fn=r1_zero_reward_fn,
        prompts=eval_prompts,
        groundtruths=eval_answers,
        eval_sampling_params=decoding,
    )
|