Commit
feat: add C-Eval
Ki-Seki committed Oct 2, 2024
1 parent 5d6710f commit 2db2b1d
Showing 6 changed files with 473 additions and 0 deletions.
3 changes: 3 additions & 0 deletions eval/benchs/__init__.py
@@ -1,4 +1,5 @@
from .base_evaluator import BaseEvaluator
from .ceval.eval_ceval import CEvalEvaluator
from .exampleqa.eval_exampleqa import ExampleQAEvaluator
from .halluqa.eval_halluqa_mc import HalluQAMCEvaluator
from .halueval.eval_halueval_dialog import HaluEvalDialogEvaluator
@@ -11,6 +12,8 @@

# ! Register all evaluators here in alphabetical order.
__all__ = [
# CEval
"CEvalEvaluator",
# ExampleQA
"ExampleQAEvaluator",
# HalluQA
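The import and `__all__` entry above register the new evaluator at the package level. A minimal sketch of what that enables, assuming the `eval` package is on the Python path:

```python
# The registration in eval/benchs/__init__.py makes the class importable
# directly from the package root.
from eval.benchs import CEvalEvaluator

print(CEvalEvaluator.__name__)  # -> CEvalEvaluator
```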
34 changes: 34 additions & 0 deletions eval/benchs/ceval/README.md
@@ -0,0 +1,34 @@
# C-Eval

## Information

- **Paper**: C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models
- **Institution**:
- Shanghai Jiao Tong University
- Tsinghua University
- University of Edinburgh
- Hong Kong University of Science and Technology
- **arXiv**: https://arxiv.org/abs/2305.08322
- **GitHub**: https://github.com/hkust-nlp/ceval
- **Website**: https://cevalbenchmark.com/

## Evaluators

| Evaluator        | Metric   | Description          |
| ---------------- | -------- | -------------------- |
| `CEvalEvaluator` | Accuracy | Multiple-choice task |

## Note

Make sure you can **access Hugging Face** so that the dataset can be downloaded.
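A quick way to verify access is to pull one discipline directly, mirroring the call used in `dataset.py` below (assumes the `datasets` package is installed):

```python
# Connectivity check: download a single C-Eval discipline from the Hugging Face Hub.
from datasets import load_dataset

ds = load_dataset("ceval/ceval-exam", "college_physics", split="val")
print(len(ds), ds[0]["question"])
```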

## Citation

```bibtex
@inproceedings{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
booktitle={Advances in Neural Information Processing Systems},
year={2023}
}
```
40 changes: 40 additions & 0 deletions eval/benchs/ceval/dataset.py
@@ -0,0 +1,40 @@
from collections import defaultdict
from typing import Literal

from datasets import load_dataset
from tqdm import tqdm

from ..base_dataset import BaseDataset
from .utils import get_subject_mapping


class CEvalDataset(BaseDataset):
def __init__(
        self, disciplines: set[str] | None = None, split: Literal["test", "val", "dev"] = "val"
):
"""
Args:
disciplines: Disciplines to load. If None, all disciplines will be loaded.
split: The split to load. One of "test", "val", "dev".
"""
subject_mapping = get_subject_mapping()
self.data = []
if disciplines is None:
disciplines = set(subject_mapping.keys())

for discipline in tqdm(disciplines, desc=f"Loading CEval > {split}"):
ds = load_dataset("ceval/ceval-exam", discipline, split=split)
for item in ds:
item["id"] = f"{discipline}_{split}_{item['id']:>04}"
item["type"] = discipline
self.data.append(item)

def load(self) -> list[dict]:
return self.data

def load_as_dict_of_discipline(self, num_shots: int) -> dict[str, list[dict]]:
examples = defaultdict(list)
for item in self.data:
if len(examples[item["type"]]) < num_shots:
examples[item["type"]].append(item)
return examples
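`CEvalDataset` downloads each requested discipline as a separate Hugging Face configuration, tags every item with a composite id and its discipline, and can regroup the data per discipline for few-shot prompting. A hedged usage sketch; the module path is inferred from the file layout in this commit:

```python
# Sketch only: the import path eval.benchs.ceval.dataset is inferred from the diff.
from eval.benchs.ceval.dataset import CEvalDataset

dev = CEvalDataset({"college_physics"}, split="dev")

# Keep at most `num_shots` dev items per discipline, exactly what the evaluator
# uses to build its few-shot example pool.
examples = dev.load_as_dict_of_discipline(num_shots=2)
for discipline, items in examples.items():
    print(discipline, [item["id"] for item in items])
```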
124 changes: 124 additions & 0 deletions eval/benchs/ceval/eval_ceval.py
@@ -0,0 +1,124 @@
from typing import Literal

from ...llms import BaseLLM
from ..base_evaluator import BaseEvaluator
from .dataset import CEvalDataset
from .utils import get_subject_mapping

QA_TEMPLATE = """
{question}
A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice_d}
答案:{answer}
"""

PROMPT_TEMPLATE = """以下是中国关于{discipline}考试的单项选择题,请选出其中的正确答案。
{qa_examples}
{qa_test}"""


# The eight subjects that form the C-Eval Hard subset (Huang et al., 2023).
CEVAL_HARD_DISCIPLINES = ",".join(
[
"advanced_mathematics",
"discrete_mathematics",
"probability_and_statistics",
"college_chemistry",
"college_physics",
"high_school_mathematics",
"high_school_chemistry",
"high_school_physics",
]
)


class CEvalEvaluator(BaseEvaluator):

def __init__(
self,
model: BaseLLM,
num_batches: int = 1,
output_dir: str = "./output",
        disciplines: str | None = CEVAL_HARD_DISCIPLINES,
split: Literal["test", "val", "dev"] = "val",
num_shots: int = 2,
):
super().__init__(
model,
num_batches,
output_dir,
disciplines=disciplines,
split=split,
num_shots=num_shots,
)

self.split = split

# ─── Get Valid Disciplines ────────────────────────────────────

self.all_disciplines = set(get_subject_mapping().keys())
if disciplines is None:
self.disciplines = self.all_disciplines
else:
self.disciplines = set(disciplines.split(",")) & self.all_disciplines

# ─── Load Examples For Few-shot Learning ──────────────────────

if num_shots > 0:
ds = CEvalDataset(self.disciplines, split="dev")
self.discipline_examples = ds.load_as_dict_of_discipline(num_shots)
else:
self.discipline_examples = {}

def set_generation_configs(self) -> None:
new_configs = {"max_new_tokens": 16, "do_sample": False}
self.model.update_generation_configs(new_configs)

def load_batched_dataset(self) -> list[list[dict]]:
dataset = CEvalDataset(self.disciplines, split=self.split)
batches = dataset.to_batched(self.num_batches)
return batches

def qa_prompt(self, examples: list[dict]) -> str:
prompt = "".join(
QA_TEMPLATE.format(
question=example["question"],
choice_a=example["A"],
choice_b=example["B"],
choice_c=example["C"],
choice_d=example["D"],
answer=example["answer"],
)
for example in examples
)
return prompt

def scoring(self, data_point: dict) -> dict:
discipline = data_point["type"]
query = PROMPT_TEMPLATE.format(
discipline=get_subject_mapping()[discipline][1], # Get the Chinese name
            qa_examples=self.qa_prompt(self.discipline_examples.get(discipline, [])),  # [] when num_shots == 0
qa_test=self.qa_prompt([data_point]),
)
        query = query.strip()[:-1]  # Drop the trailing gold answer letter so the model predicts it
response = self.model.safe_request(query)
answer = response.strip().split("\n")[0].strip() # Get the first line
return {
"metrics": {
"correct": answer == data_point["answer"],
},
"log": {
"answer": answer,
"response": response,
"query": query,
},
"valid": answer != "",
}

def compute_overall(self, results: list[dict]) -> dict:
return {
"accuracy": sum([result["metrics"]["correct"] for result in results])
/ len(results),
"num": len(results),
}
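Putting it together: the evaluator renders a few-shot prompt per discipline, strips the trailing gold answer letter, asks the model, and compares the first line of the reply against the reference. A hedged end-to-end sketch; the stand-in model below is purely illustrative, since no concrete `BaseLLM` implementation is shown in this commit, and the base classes may expect more than the two methods used here:

```python
# Sketch only: EchoModel is a duck-typed stand-in for a BaseLLM implementation;
# a real run would pass a model class from eval.llms.
from eval.benchs import CEvalEvaluator


class EchoModel:
    def update_generation_configs(self, configs: dict) -> None:
        pass  # a real model would merge these into its generation settings

    def safe_request(self, query: str) -> str:
        return "A"  # always answers "A"


evaluator = CEvalEvaluator(
    EchoModel(),
    disciplines="college_physics",
    split="val",
    num_shots=2,
)

# scoring() is defined in the diff above: it builds the prompt, queries the model,
# and checks the first line of the reply against the gold answer letter.
result = evaluator.scoring({
    "type": "college_physics",
    "question": "...",
    "A": "...", "B": "...", "C": "...", "D": "...",
    "answer": "A",
})
print(result["metrics"]["correct"], result["log"]["answer"])
```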
