You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
212 lines
8.0 KiB
212 lines
8.0 KiB
"""Evaluation results utilities for the `.eval_results/*.yaml` format.
|
|
|
|
See https://huggingface.co/docs/hub/eval-results for more details.
|
|
Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml.
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Any, Optional
|
|
|
|
|
|
@dataclass
class EvalResultEntry:
    """A single evaluation score in the `.eval_results/*.yaml` format.

    Scores stored in this format inside a model repo automatically appear on the
    model page and on the benchmark dataset's leaderboard. For the legacy
    `model-index` format embedded in `README.md`, use [`EvalResult`] instead.

    See https://huggingface.co/docs/hub/eval-results for more details.

    Args:
        dataset_id (`str`):
            Benchmark dataset ID from the Hub. Example: "cais/hle", "Idavidrein/gpqa".
        task_id (`str`):
            Task identifier within the benchmark. Example: "gpqa_diamond".
        value (`Any`):
            The metric value. Example: 20.90.
        dataset_revision (`str`, *optional*):
            Git SHA of the benchmark dataset.
        verify_token (`str`, *optional*):
            A signature that can be used to prove that evaluation is provably auditable and reproducible.
        date (`str`, *optional*):
            When the evaluation was run (ISO-8601 datetime). Defaults to git commit time.
        source_url (`str`, *optional*):
            Link to the evaluation source. Required if `source_name`, `source_user`,
            or `source_org` is provided.
        source_name (`str`, *optional*):
            Display name for the source. Example: "Eval Logs".
        source_user (`str`, *optional*):
            HF user name for attribution. Example: "celinah".
        source_org (`str`, *optional*):
            HF org name for attribution. Example: "cais".
        notes (`str`, *optional*):
            Details about the evaluation setup. Example: "tools", "no-tools", "chain-of-thought".

    Raises:
        `ValueError`: if any of `source_name`, `source_user`, or `source_org` is set
            without `source_url`.

    Example:
    ```python
    >>> from huggingface_hub import EvalResultEntry

    >>> # Minimal example with required fields only
    >>> result = EvalResultEntry(
    ...     dataset_id="Idavidrein/gpqa",
    ...     task_id="gpqa_diamond",
    ...     value=0.412,
    ... )

    >>> # Full example with all fields
    >>> result = EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
    ...     verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
    ...     date="2025-01-15T10:30:00Z",
    ...     source_url="https://huggingface.co/datasets/cais/hle",
    ...     source_name="CAIS HLE",
    ...     source_org="cais",
    ...     notes="no-tools",
    ... )

    ```
    """

    # Field order is part of the public interface (positional construction) — do not reorder.
    dataset_id: str
    task_id: str
    value: Any
    dataset_revision: Optional[str] = None
    verify_token: Optional[str] = None
    date: Optional[str] = None
    source_url: Optional[str] = None
    source_name: Optional[str] = None
    source_user: Optional[str] = None
    source_org: Optional[str] = None
    notes: Optional[str] = None

    def __post_init__(self) -> None:
        # Attribution fields are meaningless without a URL to attribute to.
        has_attribution = any(
            field is not None for field in (self.source_name, self.source_user, self.source_org)
        )
        if has_attribution and self.source_url is None:
            raise ValueError(
                "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
            )
|
|
|
|
|
|
def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
    """Serialize [`EvalResultEntry`] objects into the `.eval_results/*.yaml` dict format.

    Optional fields are omitted from the output when unset; the nested `source`
    object is only emitted when `source_url` is present.

    Args:
        entries (`list[EvalResultEntry]`):
            List of evaluation result entries to serialize.

    Returns:
        `list[dict[str, Any]]`: A list of dictionaries ready to be dumped to YAML.

    Example:
    ```python
    >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml

    >>> entries = [
    ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
    ...     EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
    ... ]
    >>> yaml_data = eval_result_entries_to_yaml(entries)
    >>> yaml_data[0]
    {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}

    ```

    To upload eval results to the Hub:
    ```python
    >>> import yaml
    >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml

    >>> entries = [
    ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
    ... ]
    >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
    >>> upload_file(
    ...     path_or_fileobj=yaml_content.encode(),
    ...     path_in_repo=".eval_results/hle.yaml",
    ...     repo_id="your-username/your-model",
    ... )
    ```
    """
    serialized: list[dict[str, Any]] = []
    for item in entries:
        # Nested dataset object; revision is optional.
        dataset_obj: dict[str, Any] = {"id": item.dataset_id, "task_id": item.task_id}
        if item.dataset_revision is not None:
            dataset_obj["revision"] = item.dataset_revision

        payload: dict[str, Any] = {"dataset": dataset_obj, "value": item.value}

        # Top-level optional keys, inserted in the canonical order.
        # NOTE: the YAML key is camelCase `verifyToken`, unlike the attribute name.
        for key, val in (("verifyToken", item.verify_token), ("date", item.date)):
            if val is not None:
                payload[key] = val

        # A source object exists only when a URL is given (enforced at construction).
        if item.source_url is not None:
            source_obj: dict[str, Any] = {"url": item.source_url}
            for key, val in (
                ("name", item.source_name),
                ("user", item.source_user),
                ("org", item.source_org),
            ):
                if val is not None:
                    source_obj[key] = val
            payload["source"] = source_obj

        if item.notes is not None:
            payload["notes"] = item.notes

        serialized.append(payload)
    return serialized
|
|
|
|
|
|
def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
    """Deserialize the `.eval_results/*.yaml` dict format into [`EvalResultEntry`] objects.

    Accepts both raw entries and API-response items where the entry is wrapped
    under a `"data"` key. For the legacy `model-index` format, use
    [`model_index_to_eval_results`] instead.

    Args:
        data (`list[dict[str, Any]]`):
            A list of dictionaries (e.g., parsed from YAML or API response).

    Returns:
        `list[EvalResultEntry]`: A list of evaluation result entry objects.

    Example:
    ```python
    >>> from huggingface_hub import parse_eval_result_entries

    >>> data = [
    ...     {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
    ...     {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
    ... ]
    >>> entries = parse_eval_result_entries(data)
    >>> entries[0].dataset_id
    'cais/hle'
    >>> entries[0].value
    20.9

    ```
    """
    parsed: list[EvalResultEntry] = []
    for raw in data:
        # API responses may wrap the entry under "data"; plain YAML entries are used as-is.
        payload = raw.get("data", raw)
        dataset_obj = payload.get("dataset", {})
        # Normalize a missing/None source to an empty dict so lookups below are uniform.
        source_obj = payload.get("source") or {}
        parsed.append(
            EvalResultEntry(
                dataset_id=dataset_obj["id"],
                task_id=dataset_obj["task_id"],
                value=payload["value"],
                dataset_revision=dataset_obj.get("revision"),
                verify_token=payload.get("verifyToken"),
                date=payload.get("date"),
                source_url=source_obj.get("url"),
                source_name=source_obj.get("name"),
                source_user=source_obj.get("user"),
                source_org=source_obj.get("org"),
                notes=payload.get("notes"),
            )
        )
    return parsed
|