跳转至

EvalConfig

DataConfig

Bases: ConfigBaseModel

Data config

源代码位于: utu/config/eval_config.py
10
11
12
13
14
15
16
17
18
19
20
class DataConfig(ConfigBaseModel):
    """Data config

    Describes which dataset an evaluation reads and which record fields hold
    the question and the ground-truth answer.
    """

    # Required field (no default). Example built-in names:
    # WebWalkerQA | GAIA_validation | XBench | BrowseComp
    dataset: str  # WebWalkerQA | GAIA_validation | XBench | BrowseComp
    """Built-in dataset name or custom dataset path"""
    # Restricted to the two literal values below; defaults to "single".
    type: Literal["single", "mixed"] = "single"
    """Whether the dataset contains only single benchmark data or multiple benchmarks"""
    # Defaults to "question".
    question_field: str = "question"
    """Question field name in the dataset"""
    # "gt" stands for ground truth; defaults to "answer".
    gt_field: str = "answer"
    """Ground truth field name in the dataset"""

dataset instance-attribute

dataset: str

Built-in dataset name or custom dataset path

type class-attribute instance-attribute

type: Literal['single', 'mixed'] = 'single'

Whether the dataset contains only single benchmark data or multiple benchmarks

question_field class-attribute instance-attribute

question_field: str = 'question'

Question field name in the dataset

gt_field class-attribute instance-attribute

gt_field: str = 'answer'

Ground truth field name in the dataset

EvalConfig

Bases: ConfigBaseModel

Evaluation config

源代码位于: utu/config/eval_config.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
class EvalConfig(ConfigBaseModel):
    """Evaluation config

    Groups the settings of one evaluation experiment into three sections:
    data (database URL and dataset description), rollout (agent, parallelism,
    samples per question) and judgement (judge model, parallelism, evaluation
    method and an optional custom verify function).

    Fields that default to ``None`` are annotated as ``X | None`` so that the
    declared type matches the default and an explicit ``None`` is accepted.
    """

    exp_id: str = "default"
    """Experiment ID"""

    # data
    # NOTE: this default is computed once, when the class body is executed,
    # not each time an instance is created.
    # Presumably `EnvUtils.get_env` returns the `UTU_DB_URL` environment
    # variable, or the SQLite URL when it is unset -- confirm against EnvUtils.
    db_url: str = EnvUtils.get_env("UTU_DB_URL", "sqlite:///test.db")
    """Database URL"""
    # Optional: left as None until a data config is supplied.
    data: DataConfig | None = None
    """Data config"""

    # rollout
    agent: AgentConfig | None = None
    """Agent config for rollout"""
    concurrency: int = 1
    """Rollout parallelism"""
    pass_k: int = 1
    """Rollout k for each sample"""

    # judgement
    # default_factory builds a fresh ModelConfigs for every EvalConfig instance.
    judge_model: ModelConfigs = Field(default_factory=ModelConfigs)
    """Judge model config"""
    judge_concurrency: int = 1
    """Judgement parallelism"""
    # Optional: None means no evaluation method has been selected here.
    eval_method: str | None = None
    """Evaluation method"""
    # optional verify function for custom judgement (used by `train` processors etc.)
    verify_filename: str | None = None
    """Optional: Python filename under `utu/train/verify/` that contains a verify function."""
    verify_func_name: str | None = None
    """Optional: The function name inside the verify file to call for judgement."""

exp_id class-attribute instance-attribute

exp_id: str = 'default'

Experiment ID

db_url class-attribute instance-attribute

db_url: str = get_env('UTU_DB_URL', 'sqlite:///test.db')

Database URL

data class-attribute instance-attribute

data: DataConfig = None

Data config

agent class-attribute instance-attribute

agent: AgentConfig | None = None

Agent config for rollout

concurrency class-attribute instance-attribute

concurrency: int = 1

Rollout parallelism

pass_k class-attribute instance-attribute

pass_k: int = 1

Rollout k for each sample

judge_model class-attribute instance-attribute

judge_model: ModelConfigs = Field(default_factory=ModelConfigs)

Judge model config

judge_concurrency class-attribute instance-attribute

judge_concurrency: int = 1

Judgement parallelism

eval_method class-attribute instance-attribute

eval_method: str = None

Evaluation method

verify_filename class-attribute instance-attribute

verify_filename: str | None = None

Optional: Python filename under utu/train/verify/ that contains a verify function.

verify_func_name class-attribute instance-attribute

verify_func_name: str | None = None

Optional: The function name inside the verify file to call for judgement.