Commit
feat: add C-Eval
Ki-Seki committed Oct 2, 2024
1 parent 5d6710f commit 2db2b1d
Showing 6 changed files with 473 additions and 0 deletions.
3 changes: 3 additions & 0 deletions eval/benchs/__init__.py
@@ -1,4 +1,5 @@
from .base_evaluator import BaseEvaluator
from .ceval.eval_ceval import CEvalEvaluator
from .exampleqa.eval_exampleqa import ExampleQAEvaluator
from .halluqa.eval_halluqa_mc import HalluQAMCEvaluator
from .halueval.eval_halueval_dialog import HaluEvalDialogEvaluator
@@ -11,6 +12,8 @@

# ! Register all evaluators here in alphabetical order.
__all__ = [
# CEval
"CEvalEvaluator",
# ExampleQA
"ExampleQAEvaluator",
# HalluQA
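The import and `__all__` entry above register the new evaluator at the package level. A minimal sketch of what that enables, assuming the `eval` package is on the Python path:

```python
# The registration in eval/benchs/__init__.py makes the class importable
# directly from the package root.
from eval.benchs import CEvalEvaluator

print(CEvalEvaluator.__name__)  # -> CEvalEvaluator
```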
34 changes: 34 additions & 0 deletions eval/benchs/ceval/README.md
@@ -0,0 +1,34 @@
# C-Eval

## Information

- **Paper**: C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models
- **Institution**:
- Shanghai Jiao Tong University
- Tsinghua University
- University of Edinburgh
- Hong Kong University of Science and Technology
- **arXiv**: https://arxiv.org/abs/2305.08322
- **GitHub**: https://github.com/hkust-nlp/ceval
- **Website**: https://cevalbenchmark.com/

## Evaluators

| Evaluator        | Metric   | Description          |
| ---------------- | -------- | -------------------- |
| `CEvalEvaluator` | Accuracy | Multiple-choice task |

## Note

Make sure you can **access Hugging Face** so that the dataset can be downloaded.
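A quick way to verify access is to pull one discipline directly, mirroring the call used in `dataset.py` below (assumes the `datasets` package is installed):

```python
# Connectivity check: download a single C-Eval discipline from the Hugging Face Hub.
from datasets import load_dataset

ds = load_dataset("ceval/ceval-exam", "college_physics", split="val")
print(len(ds), ds[0]["question"])
```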

## Citation

```bibtex
@inproceedings{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
booktitle={Advances in Neural Information Processing Systems},
year={2023}
}
```
40 changes: 40 additions & 0 deletions eval/benchs/ceval/dataset.py
@@ -0,0 +1,40 @@
from collections import defaultdict
from typing import Literal

from datasets import load_dataset
from tqdm import tqdm

from ..base_dataset import BaseDataset
from .utils import get_subject_mapping


class CEvalDataset(BaseDataset):
def __init__(
        self, disciplines: set[str] | None = None, split: Literal["test", "val", "dev"] = "val"
):
"""
Args:
disciplines: Disciplines to load. If None, all disciplines will be loaded.
split: The split to load. One of "test", "val", "dev".
"""
subject_mapping = get_subject_mapping()
self.data = []
if disciplines is None:
disciplines = set(subject_mapping.keys())

for discipline in tqdm(disciplines, desc=f"Loading CEval > {split}"):
ds = load_dataset("ceval/ceval-exam", discipline, split=split)
for item in ds:
item["id"] = f"{discipline}_{split}_{item['id']:>04}"
item["type"] = discipline
self.data.append(item)

def load(self) -> list[dict]:
return self.data

def load_as_dict_of_discipline(self, num_shots: int) -> dict[str, list[dict]]:
examples = defaultdict(list)
for item in self.data:
if len(examples[item["type"]]) < num_shots:
examples[item["type"]].append(item)
return examples
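`CEvalDataset` downloads each requested discipline as a separate Hugging Face configuration, tags every item with a composite id and its discipline, and can regroup the data per discipline for few-shot prompting. A hedged usage sketch; the module path is inferred from the file layout in this commit:

```python
# Sketch only: the import path eval.benchs.ceval.dataset is inferred from the diff.
from eval.benchs.ceval.dataset import CEvalDataset

dev = CEvalDataset({"college_physics"}, split="dev")

# Keep at most `num_shots` dev items per discipline, exactly what the evaluator
# uses to build its few-shot example pool.
examples = dev.load_as_dict_of_discipline(num_shots=2)
for discipline, items in examples.items():
    print(discipline, [item["id"] for item in items])
```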
124 changes: 124 additions & 0 deletions eval/benchs/ceval/eval_ceval.py
@@ -0,0 +1,124 @@
from typing import Literal

from ...llms import BaseLLM
from ..base_evaluator import BaseEvaluator
from .dataset import CEvalDataset
from .utils import get_subject_mapping

QA_TEMPLATE = """
{question}
A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice_d}
答案:{answer}
"""

PROMPT_TEMPLATE = """以下是中国关于{discipline}考试的单项选择题,请选出其中的正确答案。
{qa_examples}
{qa_test}"""


# The eight subjects that form the C-Eval Hard subset (Huang et al., 2023).
CEVAL_HARD_DISCIPLINES = ",".join(
[
"advanced_mathematics",
"discrete_mathematics",
"probability_and_statistics",
"college_chemistry",
"college_physics",
"high_school_mathematics",
"high_school_chemistry",
"high_school_physics",
]
)


class CEvalEvaluator(BaseEvaluator):

def __init__(
self,
model: BaseLLM,
num_batches: int = 1,
output_dir: str = "./output",
        disciplines: str | None = CEVAL_HARD_DISCIPLINES,
split: Literal["test", "val", "dev"] = "val",
num_shots: int = 2,
):
super().__init__(
model,
num_batches,
output_dir,
disciplines=disciplines,
split=split,
num_shots=num_shots,
)

self.split = split

# ─── Get Valid Disciplines ────────────────────────────────────

self.all_disciplines = set(get_subject_mapping().keys())
if disciplines is None:
self.disciplines = self.all_disciplines
else:
self.disciplines = set(disciplines.split(",")) & self.all_disciplines

# ─── Load Examples For Few-shot Learning ──────────────────────

if num_shots > 0:
ds = CEvalDataset(self.disciplines, split="dev")
self.discipline_examples = ds.load_as_dict_of_discipline(num_shots)
else:
self.discipline_examples = {}

def set_generation_configs(self) -> None:
new_configs = {"max_new_tokens": 16, "do_sample": False}
self.model.update_generation_configs(new_configs)

def load_batched_dataset(self) -> list[list[dict]]:
dataset = CEvalDataset(self.disciplines, split=self.split)
batches = dataset.to_batched(self.num_batches)
return batches

def qa_prompt(self, examples: list[dict]) -> str:
prompt = "".join(
QA_TEMPLATE.format(
question=example["question"],
choice_a=example["A"],
choice_b=example["B"],
choice_c=example["C"],
choice_d=example["D"],
answer=example["answer"],
)
for example in examples
)
return prompt

def scoring(self, data_point: dict) -> dict:
discipline = data_point["type"]
query = PROMPT_TEMPLATE.format(
discipline=get_subject_mapping()[discipline][1], # Get the Chinese name
            qa_examples=self.qa_prompt(self.discipline_examples.get(discipline, [])),  # [] when num_shots == 0
qa_test=self.qa_prompt([data_point]),
)
        query = query.strip()[:-1]  # Drop the trailing gold answer letter so the model predicts it
response = self.model.safe_request(query)
answer = response.strip().split("\n")[0].strip() # Get the first line
return {
"metrics": {
"correct": answer == data_point["answer"],
},
"log": {
"answer": answer,
"response": response,
"query": query,
},
"valid": answer != "",
}

def compute_overall(self, results: list[dict]) -> dict:
return {
"accuracy": sum([result["metrics"]["correct"] for result in results])
/ len(results),
"num": len(results),
}
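Putting it together: the evaluator renders a few-shot prompt per discipline, strips the trailing gold answer letter, asks the model, and compares the first line of the reply against the reference. A hedged end-to-end sketch; the stand-in model below is purely illustrative, since no concrete `BaseLLM` implementation is shown in this commit, and the base classes may expect more than the two methods used here:

```python
# Sketch only: EchoModel is a duck-typed stand-in for a BaseLLM implementation;
# a real run would pass a model class from eval.llms.
from eval.benchs import CEvalEvaluator


class EchoModel:
    def update_generation_configs(self, configs: dict) -> None:
        pass  # a real model would merge these into its generation settings

    def safe_request(self, query: str) -> str:
        return "A"  # always answers "A"


evaluator = CEvalEvaluator(
    EchoModel(),
    disciplines="college_physics",
    split="val",
    num_shots=2,
)

# scoring() is defined in the diff above: it builds the prompt, queries the model,
# and checks the first line of the reply against the gold answer letter.
result = evaluator.scoring({
    "type": "college_physics",
    "question": "...",
    "A": "...", "B": "...", "C": "...", "D": "...",
    "answer": "A",
})
print(result["metrics"]["correct"], result["log"]["answer"])
```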
