Showing 6 changed files with 473 additions and 0 deletions.
@@ -0,0 +1,34 @@
# C-Eval

## Information

- **Paper**: C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models
- **Institutions**:
  - Shanghai Jiao Tong University
  - Tsinghua University
  - University of Edinburgh
  - Hong Kong University of Science and Technology
- **arXiv**: https://arxiv.org/abs/2305.08322
- **GitHub**: https://github.com/hkust-nlp/ceval
- **Website**: https://cevalbenchmark.com/

## Evaluators

| Evaluator        | Metric   | Description          |
| ---------------- | -------- | -------------------- |
| `CEvalEvaluator` | Accuracy | Multiple-choice task |
## Note

Make sure you can **access Hugging Face** so that the dataset can be downloaded.
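
A quick way to check is to pull a single discipline directly; this is a minimal sketch using the `datasets` library, mirroring the exact call the dataset loader below makes:

```python
from datasets import load_dataset

# Fetch one discipline from the Hugging Face Hub; if this succeeds,
# the full benchmark should download without trouble.
ds = load_dataset("ceval/ceval-exam", "college_physics", split="val")
print(len(ds), ds[0]["question"])
```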
## Citation

```bibtex
@inproceedings{huang2023ceval,
  title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
  author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
  booktitle={Advances in Neural Information Processing Systems},
  year={2023}
}
```
@@ -0,0 +1,40 @@

```python
from collections import defaultdict
from typing import Literal

from datasets import load_dataset
from tqdm import tqdm

from ..base_dataset import BaseDataset
from .utils import get_subject_mapping


class CEvalDataset(BaseDataset):
    def __init__(
        self,
        disciplines: set[str] | None = None,
        split: Literal["test", "val", "dev"] = "val",
    ):
        """
        Args:
            disciplines: Disciplines to load. If None, all disciplines will be loaded.
            split: The split to load. One of "test", "val", "dev".
        """
        subject_mapping = get_subject_mapping()
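        # `get_subject_mapping()` lives in .utils and is not shown in this diff.
        # From its use here and in the evaluator, it is assumed to map each
        # discipline key to [English name, Chinese name, category], e.g.
        #   {"advanced_mathematics": ["Advanced Mathematics", "高等数学", "STEM"], ...}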
        self.data = []
        if disciplines is None:
            disciplines = set(subject_mapping.keys())

        for discipline in tqdm(disciplines, desc=f"Loading CEval > {split}"):
            ds = load_dataset("ceval/ceval-exam", discipline, split=split)
            for item in ds:
                # Build a globally unique id, e.g. "college_physics_val_0001".
                item["id"] = f"{discipline}_{split}_{item['id']:>04}"
                item["type"] = discipline
                self.data.append(item)

    def load(self) -> list[dict]:
        return self.data

    def load_as_dict_of_discipline(self, num_shots: int) -> dict[str, list[dict]]:
        # Keep the first `num_shots` items of each discipline as few-shot examples.
        examples = defaultdict(list)
        for item in self.data:
            if len(examples[item["type"]]) < num_shots:
                examples[item["type"]].append(item)
        return examples
```
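
A usage sketch of the loader above; the import path is hypothetical, since the diff does not show where the package is mounted:

```python
# Hypothetical import path; adjust to wherever this package actually lives.
from eval_suite.benchmarks.ceval.dataset import CEvalDataset

# Load two disciplines from the validation split.
dataset = CEvalDataset({"college_physics", "college_chemistry"}, split="val")
items = dataset.load()
print(len(items), items[0]["id"], items[0]["type"])

# Group up to 2 examples per discipline for few-shot prompting.
shots = dataset.load_as_dict_of_discipline(num_shots=2)
print({discipline: len(examples) for discipline, examples in shots.items()})
```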
@@ -0,0 +1,124 @@

```python
from typing import Literal

from ...llms import BaseLLM
from ..base_evaluator import BaseEvaluator
from .dataset import CEvalDataset
from .utils import get_subject_mapping

# One question block; "答案" means "answer".
QA_TEMPLATE = """
{question}
A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice_d}
答案:{answer}
"""

# "The following are multiple-choice questions from China's {discipline} exam.
# Please select the correct answer."
PROMPT_TEMPLATE = """以下是中国关于{discipline}考试的单项选择题,请选出其中的正确答案。
{qa_examples}
{qa_test}"""


# The eight math- and science-heavy disciplines that form the C-Eval Hard subset.
CEVAL_HARD_DISCIPLINES = ",".join(
    [
        "advanced_mathematics",
        "discrete_mathematics",
        "probability_and_statistics",
        "college_chemistry",
        "college_physics",
        "high_school_mathematics",
        "high_school_chemistry",
        "high_school_physics",
    ]
)


class CEvalEvaluator(BaseEvaluator):
    def __init__(
        self,
        model: BaseLLM,
        num_batches: int = 1,
        output_dir: str = "./output",
        disciplines: str | None = CEVAL_HARD_DISCIPLINES,
        split: Literal["test", "val", "dev"] = "val",
        num_shots: int = 2,
    ):
        super().__init__(
            model,
            num_batches,
            output_dir,
            disciplines=disciplines,
            split=split,
            num_shots=num_shots,
        )

        self.split = split

        # ─── Get Valid Disciplines ────────────────────────────────────

        self.all_disciplines = set(get_subject_mapping().keys())
        if disciplines is None:
            self.disciplines = self.all_disciplines
        else:
            # Keep only the requested disciplines that actually exist.
            self.disciplines = set(disciplines.split(",")) & self.all_disciplines

        # ─── Load Examples For Few-shot Learning ──────────────────────

        # Few-shot examples are always drawn from the "dev" split.
        if num_shots > 0:
            ds = CEvalDataset(self.disciplines, split="dev")
            self.discipline_examples = ds.load_as_dict_of_discipline(num_shots)
        else:
            self.discipline_examples = {}

    def set_generation_configs(self) -> None:
        # Greedy decoding; the expected answer is a single letter, so a few
        # new tokens are enough.
        new_configs = {"max_new_tokens": 16, "do_sample": False}
        self.model.update_generation_configs(new_configs)

    def load_batched_dataset(self) -> list[list[dict]]:
        dataset = CEvalDataset(self.disciplines, split=self.split)
        batches = dataset.to_batched(self.num_batches)
        return batches

    def qa_prompt(self, examples: list[dict]) -> str:
        # Render each example with QA_TEMPLATE and concatenate the blocks.
        prompt = "".join(
            QA_TEMPLATE.format(
                question=example["question"],
                choice_a=example["A"],
                choice_b=example["B"],
                choice_c=example["C"],
                choice_d=example["D"],
                answer=example["answer"],
            )
            for example in examples
        )
        return prompt

    def scoring(self, data_point: dict) -> dict:
        discipline = data_point["type"]
        query = PROMPT_TEMPLATE.format(
            discipline=get_subject_mapping()[discipline][1],  # the Chinese name
            qa_examples=self.qa_prompt(self.discipline_examples[discipline]),
            qa_test=self.qa_prompt([data_point]),
        )
        # Drop the trailing gold answer letter so the model must predict it.
        query = query.strip()[:-1]
        response = self.model.safe_request(query)
        answer = response.strip().split("\n")[0].strip()  # keep only the first line
        return {
            "metrics": {
                "correct": answer == data_point["answer"],
            },
            "log": {
                "answer": answer,
                "response": response,
                "query": query,
            },
            "valid": answer != "",
        }

    def compute_overall(self, results: list[dict]) -> dict:
        return {
            "accuracy": sum(result["metrics"]["correct"] for result in results)
            / len(results),
            "num": len(results),
        }
```
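
A sketch of how the evaluator might be wired up, assuming the surrounding framework's layout; the import paths and the run entry point are not shown in this diff, so everything here is illustrative:

```python
# Hypothetical import paths; the diff does not show the package layout.
from eval_suite.llms import BaseLLM
from eval_suite.benchmarks.ceval.evaluator import CEvalEvaluator

model: BaseLLM = ...  # any concrete BaseLLM subclass goes here

evaluator = CEvalEvaluator(
    model,
    output_dir="./output",
    disciplines="college_physics,college_chemistry",  # comma-separated string
    split="val",
    num_shots=2,
)
# Batching, prompting, and scoring are driven by BaseEvaluator (not shown);
# compute_overall() then reduces per-item results to overall accuracy.
```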