Coverage for pydantic_evals/pydantic_evals/dataset.py: 97.76%
382 statements
coverage.py v7.6.12, created at 2025-03-28 17:27 +0000
1"""Dataset management for pydantic evals.
3This module provides functionality for creating, loading, saving, and evaluating datasets of test cases.
4Each case must have inputs, and can optionally have a name, expected output, metadata, and case-specific evaluators.
6Datasets can be loaded from and saved to YAML or JSON files, and can be evaluated against
7a task function to produce an evaluation report.
8"""
10from __future__ import annotations as _annotations
12import functools
13import inspect
14import sys
15import time
16import warnings
17from collections.abc import Awaitable, Mapping, Sequence
18from contextlib import AsyncExitStack
19from contextvars import ContextVar
20from dataclasses import dataclass, field
21from pathlib import Path
22from typing import Any, Callable, Generic, Literal, Union, cast
24import anyio
25import logfire_api
26import yaml
27from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError, model_serializer
28from pydantic._internal import _typing_extra
29from pydantic_core import to_json, to_jsonable_python
30from pydantic_core.core_schema import SerializationInfo, SerializerFunctionWrapHandler
31from typing_extensions import NotRequired, Self, TypedDict, TypeVar
33from pydantic_evals._utils import get_event_loop
35from ._utils import get_unwrapped_function_name, task_group_gather
36from .evaluators import EvaluationResult, Evaluator, run_evaluator
37from .evaluators._spec import EvaluatorSpec
38from .evaluators.common import DEFAULT_EVALUATORS
39from .evaluators.context import EvaluatorContext
40from .otel import SpanTree
41from .otel._context_in_memory_span_exporter import context_subtree
42from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
44if sys.version_info < (3, 11): # pragma: no cover
45 from exceptiongroup import ExceptionGroup
46else:
47 ExceptionGroup = ExceptionGroup
49# while waiting for https://github.com/pydantic/logfire/issues/745
50try:
51 import logfire._internal.stack_info
52except ImportError: # pragma: no cover
53 pass
54else:
55 from pathlib import Path
57 logfire._internal.stack_info.NON_USER_CODE_PREFIXES += (str(Path(__file__).parent.absolute()),)
59_logfire = logfire_api.Logfire(otel_scope='pydantic-evals')
61InputsT = TypeVar('InputsT', default=Any)
62"""Generic type for the inputs to the task being evaluated."""
63OutputT = TypeVar('OutputT', default=Any)
64"""Generic type for the expected output of the task being evaluated."""
65MetadataT = TypeVar('MetadataT', default=Any)
66"""Generic type for the metadata associated with the task being evaluated."""
68DEFAULT_DATASET_PATH = './test_cases.yaml'
69"""Default path for saving/loading datasets."""
70DEFAULT_SCHEMA_PATH_TEMPLATE = './{stem}_schema.json'
71"""Default template for schema file paths, where {stem} is replaced with the dataset filename stem."""
72_YAML_SCHEMA_LINE_PREFIX = '# yaml-language-server: $schema='
75class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
76 """Internal model for a case, used for serialization/deserialization."""
78 name: str | None = None
79 inputs: InputsT
80 metadata: MetadataT | None = None
81 expected_output: OutputT | None = None
82 evaluators: list[EvaluatorSpec] = Field(default_factory=list)
85class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
86 """Internal model for a dataset, used for serialization/deserialization."""
88 # $schema is included to avoid validation failures from the `$schema` key; see `_add_json_schema` below for context
89 json_schema_path: str | None = Field(default=None, alias='$schema')
90 cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
91 evaluators: list[EvaluatorSpec] = Field(default_factory=list)
94@dataclass(init=False)
95class Case(Generic[InputsT, OutputT, MetadataT]):
96 """A single row of a [`Dataset`][pydantic_evals.Dataset].
98 Each case represents a single test scenario with inputs to test. A case may optionally specify a name, expected
99 outputs to compare against, and arbitrary metadata.
101 Cases can also have their own specific evaluators which are run in addition to dataset-level evaluators.
103 Example:
104 ```python
105 case = Case(
106 name="Simple addition",
107 inputs={"a": 1, "b": 2},
108 expected_output=3,
109 metadata={"description": "Tests basic addition"}
110 )
111 ```
112 """
114 name: str | None
115 """Name of the case. This is used to identify the case in the report and can be used to filter cases."""
116 inputs: InputsT
117 """Inputs to the task. This is the input to the task that will be evaluated."""
118 metadata: MetadataT | None
119 """Metadata to be used in the evaluation.
121 This can be used to provide additional information about the case to the evaluators.
122 """
123 expected_output: OutputT | None
124 """Expected output of the task. This is the expected output of the task that will be evaluated."""
125 evaluators: list[Evaluator[InputsT, OutputT, MetadataT]]
126 """Evaluators to be used just on this case."""
128 def __init__(
129 self,
130 *,
131 name: str | None = None,
132 inputs: InputsT,
133 metadata: MetadataT | None = None,
134 expected_output: OutputT | None = None,
135 evaluators: tuple[Evaluator[InputsT, OutputT, MetadataT], ...] = (),
136 ):
137 """Initialize a new test case.
139 Args:
140 name: Optional name for the case. If not provided, a generic name will be assigned when added to a dataset.
141 inputs: The inputs to the task being evaluated.
142 metadata: Optional metadata for the case, which can be used by evaluators.
143 expected_output: Optional expected output of the task, used for comparison in evaluators.
144 evaluators: Tuple of evaluators specific to this case. These are in addition to any
145 dataset-level evaluators.
147 """
148 # Note: `evaluators` must be a tuple instead of Sequence due to misbehavior with pyright's generic parameter
149 # inference if it has type `Sequence`
150 self.name = name
151 self.inputs = inputs
152 self.metadata = metadata
153 self.expected_output = expected_output
154 self.evaluators = list(evaluators)
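    # Example sketch: a case that carries its own evaluator in addition to any dataset-level
    # ones. Assumes `IsInstance` is available from `pydantic_evals.evaluators`.
    #
    #     from pydantic_evals.evaluators import IsInstance
    #
    #     case = Case(
    #         name='capitalize hello',
    #         inputs={'text': 'hello'},
    #         expected_output='Hello',
    #         evaluators=(IsInstance(type_name='str'),),  # a tuple, per the signature above
    #     )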
157# TODO: Consider making one or more of the following changes to this type:
158 # * Add `task: Callable[[InputsT], Awaitable[OutputT]]` as a field
159# * Add `inputs_type`, `output_type`, etc. as kwargs on `__init__`
160# * Rename to `Evaluation`
161# TODO: Allow `task` to be sync _or_ async
162class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', arbitrary_types_allowed=True):
163 """A dataset of test [cases][pydantic_evals.Case].
165 Datasets allow you to organize a collection of test cases and evaluate them against a task function.
166 They can be loaded from and saved to YAML or JSON files, and can have dataset-level evaluators that
167 apply to all cases.
169 Example:
170 ```python
171 # Create a dataset with two test cases
172 dataset = Dataset(
173 cases=[
174 Case(name="test1", inputs={"text": "Hello"}, expected_output="HELLO"),
175 Case(name="test2", inputs={"text": "World"}, expected_output="WORLD"),
176 ],
177 evaluators=[ExactMatch()]
178 )
180 # Evaluate the dataset against a task function
181 async def uppercase(inputs: dict) -> str:
182 return inputs["text"].upper()
184 report = await dataset.evaluate(uppercase)
185 ```
186 """
188 cases: list[Case[InputsT, OutputT, MetadataT]]
189 """List of test cases in the dataset."""
190 evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
191 """List of evaluators to be used on all cases in the dataset."""
193 def __init__(
194 self,
195 *,
196 cases: Sequence[Case[InputsT, OutputT, MetadataT]],
197 evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
198 ):
199 """Initialize a new dataset with test cases and optional evaluators.
201 Args:
202 cases: Sequence of test cases to include in the dataset.
203 evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
204 """
205 case_names = set[str]()
206 for case in cases:
207 if case.name is None:
208 continue
209 if case.name in case_names:
210 raise ValueError(f'Duplicate case name: {case.name!r}')
211 case_names.add(case.name)
213 super().__init__(
214 cases=cases,
215 evaluators=list(evaluators),
216 )
218 async def evaluate(
219 self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
220 ) -> EvaluationReport:
221 """Evaluates the test cases in the dataset using the given task.
223 This method runs the task on each case in the dataset, applies evaluators,
224 and collects results into a report. Cases are run concurrently, limited by `max_concurrency` if specified.
226 Args:
227 task: The task to evaluate. This should be a callable that takes the inputs of the case
228 and returns the output.
229 name: The name of the task being evaluated; this is used to identify the task in the report.
230 If omitted, the name of the task function will be used.
231 max_concurrency: The maximum number of concurrent evaluations of the task to allow.
232 If None, all cases will be evaluated concurrently.
234 Returns:
235 A report containing the results of the evaluation.
236 """
237 name = name or get_unwrapped_function_name(task)
239 limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
240 with _logfire.span('evaluate {name}', name=name) as eval_span:
242 async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
243 async with limiter:
244 return await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
246 report = EvaluationReport(
247 name=name,
248 cases=await task_group_gather(
249 [
250 lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
251 for i, case in enumerate(self.cases, 1)
252 ]
253 ),
254 )
255 # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
256 eval_span.set_attribute('cases', report.cases)
257 # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
258 eval_span.set_attribute('averages', ReportCaseAggregate.average(report.cases))
260 return report
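    # Example sketch: limiting concurrency and overriding the task name in the report.
    # Assumes `dataset` and the async `uppercase` task from the class docstring above.
    #
    #     report = await dataset.evaluate(uppercase, name='uppercase-v1', max_concurrency=4)
    #     print(report)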
262 def evaluate_sync(
263 self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
264 ) -> EvaluationReport: # pragma: no cover
265 """Evaluates the test cases in the dataset using the given task.
267 This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
269 Args:
270 task: The task to evaluate. This should be a callable that takes the inputs of the case
271 and returns the output.
272 name: The name of the task being evaluated; this is used to identify the task in the report.
273 If omitted, the name of the task function will be used.
274 max_concurrency: The maximum number of concurrent evaluations of the task to allow.
275 If None, all cases will be evaluated concurrently.
277 Returns:
278 A report containing the results of the evaluation.
279 """
280 return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
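    # Example sketch: the synchronous wrapper drives the event loop itself, so it can be called
    # from ordinary scripts or tests without `await`.
    #
    #     async def uppercase(inputs: dict) -> str:
    #         return inputs['text'].upper()
    #
    #     report = dataset.evaluate_sync(uppercase)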
282 def add_case(
283 self,
284 *,
285 name: str | None = None,
286 inputs: InputsT,
287 metadata: MetadataT | None = None,
288 expected_output: OutputT | None = None,
289 evaluators: tuple[Evaluator[InputsT, OutputT, MetadataT], ...] = (),
290 ) -> None:
291 """Adds a case to the dataset.
293 This is a convenience method for creating a [`Case`][pydantic_evals.Case] and adding it to the dataset.
295 Args:
296 name: Optional name for the case. If not provided, a generic name will be assigned.
297 inputs: The inputs to the task being evaluated.
298 metadata: Optional metadata for the case, which can be used by evaluators.
299 expected_output: The expected output of the task, used for comparison in evaluators.
300 evaluators: Tuple of evaluators specific to this case, in addition to dataset-level evaluators.
301 """
302 if name in {case.name for case in self.cases}:
303 raise ValueError(f'Duplicate case name: {name!r}')
305 case = Case[InputsT, OutputT, MetadataT](
306 name=name,
307 inputs=inputs,
308 metadata=metadata,
309 expected_output=expected_output,
310 evaluators=evaluators,
311 )
312 self.cases.append(case)
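    # Example sketch: building a dataset incrementally. Re-using an existing case name raises
    # ValueError, mirroring the duplicate-name check in `__init__`.
    #
    #     dataset.add_case(name='test3', inputs={'text': 'Mixed'}, expected_output='MIXED')
    #     dataset.add_case(name='test3', inputs={'text': 'Again'})  # raises ValueError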
314 def add_evaluator(
315 self,
316 evaluator: Evaluator[InputsT, OutputT, MetadataT],
317 specific_case: str | None = None,
318 ) -> None:
319 """Adds an evaluator to the dataset or a specific case.
321 Args:
322 evaluator: The evaluator to add.
323 specific_case: If provided, the evaluator will only be added to the case with this name.
324 If None, the evaluator will be added to all cases in the dataset.
326 Raises:
327 ValueError: If `specific_case` is provided but no case with that name exists in the dataset.
328 """
329 if specific_case is None:
330 self.evaluators.append(evaluator)
331 else:
332 # If this is too slow, we could try to add a case lookup dict.
333 # Note that if we do that, we'd need to make the cases list private to prevent modification.
334 added = False
335 for case in self.cases:
336 if case.name == specific_case:
337 case.evaluators.append(evaluator)
338 added = True
339 if not added:
340 raise ValueError(f'Case {specific_case!r} not found in the dataset')
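    # Example sketch: attaching evaluators after construction. Assumes `EqualsExpected` is one of
    # the default evaluators and takes no required arguments.
    #
    #     from pydantic_evals.evaluators import EqualsExpected
    #
    #     dataset.add_evaluator(EqualsExpected())                         # applies to every case
    #     dataset.add_evaluator(EqualsExpected(), specific_case='test1')  # only the case named 'test1'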
342 @classmethod
343 @functools.cache
344 def _params(cls) -> tuple[type[InputsT], type[OutputT], type[MetadataT]]:
345 """Get the type parameters for the Dataset class.
347 Returns:
348 A tuple of (InputsT, OutputT, MetadataT) types.
349 """
350 for c in cls.__mro__:
351 metadata = getattr(c, '__pydantic_generic_metadata__', {})
352 if len(args := (metadata.get('args', ()) or getattr(c, '__args__', ()))) == 3:  # coverage: 352 ↛ 350; line 352 didn't jump to line 350 because the condition on line 352 was always true
353 return args
354 else: # pragma: no cover
355 warnings.warn(
356 f'Could not determine the generic parameters for {cls}; using `Any` for each. '
357 f'You should explicitly set the generic parameters via `Dataset[MyInputs, MyOutput, MyMetadata]`'
358 f'when serializing or deserializing.',
359 UserWarning,
360 )
361 return Any, Any, Any # type: ignore
363 @classmethod
364 def from_file(
365 cls,
366 path: Path | str,
367 fmt: Literal['yaml', 'json'] | None = None,
368 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
369 ) -> Self:
370 """Load a dataset from a file.
372 Args:
373 path: Path to the file to load.
374 fmt: Format of the file. If None, the format will be inferred from the file extension.
375 Must be either 'yaml' or 'json'.
376 custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
377 These are additional evaluators beyond the default ones.
379 Returns:
380 A new Dataset instance loaded from the file.
382 Raises:
383 ValidationError: If the file cannot be parsed as a valid dataset.
384 ValueError: If the format cannot be inferred from the file extension.
385 """
386 path = Path(path)
387 fmt = cls._infer_fmt(path, fmt)
389 raw = Path(path).read_text()
390 try:
391 return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
392 except ValidationError as e: # pragma: no cover
393 raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e
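    # Example sketch: loading a YAML dataset. Parameterizing the generic types is recommended so
    # cases validate against your own input/output/metadata models; `MyEvaluator` here stands in
    # for any custom `@dataclass` Evaluator subclass you want to be deserializable.
    #
    #     dataset = Dataset[dict, str, Any].from_file(
    #         'test_cases.yaml',
    #         custom_evaluator_types=[MyEvaluator],
    #     )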
395 @classmethod
396 def from_text(
397 cls,
398 contents: str,
399 fmt: Literal['yaml', 'json'] = 'yaml',
400 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
401 ) -> Self:
402 """Load a dataset from a string.
404 Args:
405 contents: The string content to parse.
406 fmt: Format of the content. Must be either 'yaml' or 'json'.
407 custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
408 These are additional evaluators beyond the default ones.
410 Returns:
411 A new Dataset instance parsed from the string.
413 Raises:
414 ValidationError: If the content cannot be parsed as a valid dataset.
415 """
416 if fmt == 'yaml':
417 loaded = yaml.safe_load(contents)
418 return cls.from_dict(loaded, custom_evaluator_types)
419 else:
420 dataset_model_type = cls._serialization_type()
421 dataset_model = dataset_model_type.model_validate_json(contents)
422 return cls._from_dataset_model(dataset_model, custom_evaluator_types)
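    # Example sketch of the YAML shape this parses. Evaluator specs may be a bare name, a
    # single-parameter short form, or a mapping of parameters (see
    # `model_json_schema_with_evaluators` below); `EqualsExpected` is assumed to be a registered
    # default evaluator with no required parameters.
    #
    #     yaml_text = '''
    #     cases:
    #       - name: test1
    #         inputs: {text: Hello}
    #         expected_output: HELLO
    #     evaluators:
    #       - EqualsExpected
    #     '''
    #     dataset = Dataset[dict, str, Any].from_text(yaml_text, fmt='yaml')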
424 @classmethod
425 def from_dict(
426 cls,
427 data: dict[str, Any],
428 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
429 ) -> Self:
430 """Load a dataset from a dictionary.
432 Args:
433 data: Dictionary representation of the dataset.
434 custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
435 These are additional evaluators beyond the default ones.
437 Returns:
438 A new Dataset instance created from the dictionary.
440 Raises:
441 ValidationError: If the dictionary cannot be converted to a valid dataset.
442 """
443 dataset_model_type = cls._serialization_type()
444 dataset_model = dataset_model_type.model_validate(data)
445 return cls._from_dataset_model(dataset_model, custom_evaluator_types)
447 @classmethod
448 def _from_dataset_model(
449 cls,
450 dataset_model: _DatasetModel[InputsT, OutputT, MetadataT],
451 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
452 ) -> Self:
453 """Create a Dataset from a _DatasetModel.
455 Args:
456 dataset_model: The _DatasetModel to convert.
457 custom_evaluator_types: Custom evaluator classes to register for deserialization.
459 Returns:
460 A new Dataset instance created from the _DatasetModel.
461 """
462 registry = _get_registry(custom_evaluator_types)
464 cases: list[Case[InputsT, OutputT, MetadataT]] = []
465 errors: list[ValueError] = []
466 dataset_evaluators: list[Evaluator[Any, Any, Any]] = []
467 for spec in dataset_model.evaluators:
468 try:
469 dataset_evaluator = _load_evaluator_from_registry(registry, None, spec)
470 except ValueError as e:
471 errors.append(e)
472 continue
473 dataset_evaluators.append(dataset_evaluator)
475 for row in dataset_model.cases:
476 evaluators: list[Evaluator[Any, Any, Any]] = []
477 for spec in row.evaluators:
478 try:
479 evaluator = _load_evaluator_from_registry(registry, row.name, spec)
480 except ValueError as e:
481 errors.append(e)
482 continue
483 evaluators.append(evaluator)
484 row = Case[InputsT, OutputT, MetadataT](
485 name=row.name,
486 inputs=row.inputs,
487 metadata=row.metadata,
488 expected_output=row.expected_output,
489 )
490 row.evaluators = evaluators
491 cases.append(row)
492 if errors:
493 raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
494 result = cls(cases=cases)
495 result.evaluators = dataset_evaluators
496 return result
498 def to_file(
499 self,
500 path: Path | str,
501 fmt: Literal['yaml', 'json'] | None = None,
502 schema_path: Path | str | None = DEFAULT_SCHEMA_PATH_TEMPLATE,
503 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
504 ):
505 """Save the dataset to a file.
507 Args:
508 path: Path to save the dataset to.
509 fmt: Format to use. If None, the format will be inferred from the file extension.
510 Must be either 'yaml' or 'json'.
511 schema_path: Path to save the JSON schema to. If None, no schema will be saved.
512 Can be a string template with {stem} which will be replaced with the dataset filename stem.
513 custom_evaluator_types: Custom evaluator classes to include in the schema.
514 """
515 path = Path(path)
516 fmt = self._infer_fmt(path, fmt)
518 schema_ref: str | None = None
519 if schema_path is not None:  # coverage: 519 ↛ 532; line 519 didn't jump to line 532 because the condition on line 519 was always true
520 if isinstance(schema_path, str):  # coverage: 520 ↛ 523; line 520 didn't jump to line 523 because the condition on line 520 was always true
521 schema_path = Path(schema_path.format(stem=path.stem))
523 if not schema_path.is_absolute():
524 schema_ref = str(schema_path)
525 schema_path = path.parent / schema_path
526 elif schema_path.is_relative_to(path): # pragma: no cover
527 schema_ref = str(_get_relative_path_reference(schema_path, path))
528 else: # pragma: no cover
529 schema_ref = str(schema_path)
530 self._save_schema(schema_path, custom_evaluator_types)
532 context: dict[str, Any] = {'use_short_form': True}
533 if fmt == 'yaml':
534 dumped_data = self.model_dump(mode='json', by_alias=True, exclude_defaults=True, context=context)
535 content = yaml.dump(dumped_data, sort_keys=False)
536 if schema_ref:  # coverage: 536 ↛ 539; line 536 didn't jump to line 539 because the condition on line 536 was always true
537 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
538 content = f'{yaml_language_server_line}\n{content}'
539 path.write_text(content)
540 else:
541 context['$schema'] = schema_ref
542 json_data = self.model_dump_json(indent=2, by_alias=True, exclude_defaults=True, context=context)
543 path.write_text(json_data + '\n')
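    # Example sketch: saving alongside a JSON schema. With the default template, 'cases.yaml'
    # gets a sibling 'cases_schema.json', and the YAML output starts with a
    # '# yaml-language-server: $schema=...' line pointing at it.
    #
    #     dataset.to_file('cases.yaml')                    # schema written to ./cases_schema.json
    #     dataset.to_file('cases.json', schema_path=None)  # JSON output, no schema file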
545 @classmethod
546 def model_json_schema_with_evaluators(
547 cls,
548 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
549 ) -> dict[str, Any]:
550 """Generate a JSON schema for this dataset type, including evaluator details.
552 This is useful for generating a schema that can be used to validate YAML-format dataset files.
554 Args:
555 custom_evaluator_types: Custom evaluator classes to include in the schema.
557 Returns:
558 A dictionary representing the JSON schema.
559 """
560 # Note: this function could maybe be simplified now that Evaluators are always dataclasses
561 registry = _get_registry(custom_evaluator_types)
563 evaluator_schema_types: list[Any] = []
564 for name, evaluator_class in registry.items():
565 type_hints = _typing_extra.get_function_type_hints(evaluator_class)
566 type_hints.pop('return', None)
567 required_type_hints: dict[str, Any] = {}
569 for p in inspect.signature(evaluator_class).parameters.values():
570 type_hints.setdefault(p.name, Any)
571 if p.default is not p.empty:
572 type_hints[p.name] = NotRequired[type_hints[p.name]]
573 else:
574 required_type_hints[p.name] = type_hints[p.name]
576 def _make_typed_dict(cls_name_prefix: str, fields: dict[str, Any]) -> Any:
577 td = TypedDict(f'{cls_name_prefix}_{name}', fields) # pyright: ignore[reportArgumentType]
578 config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)
579 # TODO: Replace with pydantic.with_config after pydantic 2.11 is released
580 td.__pydantic_config__ = config # pyright: ignore[reportAttributeAccessIssue]
581 return td
583 # Shortest form: just the call name
584 if len(type_hints) == 0 or not required_type_hints:
585 evaluator_schema_types.append(Literal[name])
587 # Short form: can be called with only one parameter
588 if len(type_hints) == 1:
589 [type_hint_type] = type_hints.values()
590 evaluator_schema_types.append(_make_typed_dict('short_evaluator', {name: type_hint_type}))
591 elif len(required_type_hints) == 1:
592 [type_hint_type] = required_type_hints.values()
593 evaluator_schema_types.append(_make_typed_dict('short_evaluator', {name: type_hint_type}))
595 # Long form: multiple parameters, possibly required
596 if len(type_hints) > 1:
597 params_td = _make_typed_dict('evaluator_params', type_hints)
598 evaluator_schema_types.append(_make_typed_dict('evaluator', {name: params_td}))
600 in_type, out_type, meta_type = cls._params()
602 class ClsDatasetRow(BaseModel, extra='forbid'):
603 name: str
604 inputs: in_type
605 metadata: meta_type
606 expected_output: out_type | None = None
607 if evaluator_schema_types:  # coverage: 607 ↛ exit; line 607 didn't exit class 'ClsDatasetRow' because the condition on line 607 was always true
608 evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa UP007
610 ClsDatasetRow.__name__ = cls.__name__ + 'Row'
612 class ClsDataset(BaseModel, extra='forbid'):
613 cases: list[ClsDatasetRow]
614 if evaluator_schema_types:  # coverage: 614 ↛ exit; line 614 didn't exit class 'ClsDataset' because the condition on line 614 was always true
615 evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa UP007
617 ClsDataset.__name__ = cls.__name__
619 json_schema = ClsDataset.model_json_schema()
620 # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
621 json_schema['properties']['$schema'] = {'type': 'string'}
622 return json_schema
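    # Example sketch: writing the schema out for editor support (essentially what `_save_schema`
    # below does).
    #
    #     schema = Dataset[dict, str, Any].model_json_schema_with_evaluators()
    #     Path('test_cases_schema.json').write_text(to_json(schema, indent=2).decode() + '\n')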
624 @classmethod
625 def _save_schema(
626 cls, path: Path | str, custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = ()
627 ):
628 """Save the JSON schema for this dataset type to a file.
630 Args:
631 path: Path to save the schema to.
632 custom_evaluator_types: Custom evaluator classes to include in the schema.
633 """
634 path = Path(path)
635 json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
636 schema_content = to_json(json_schema, indent=2).decode() + '\n'
637 if not path.exists() or path.read_text() != schema_content:  # coverage: 637 ↛ exit; line 637 didn't return from function '_save_schema' because the condition on line 637 was always true
638 path.write_text(schema_content)
640 @classmethod
641 @functools.cache
642 def _serialization_type(cls) -> type[_DatasetModel[InputsT, OutputT, MetadataT]]:
643 """Get the serialization type for this dataset class.
645 Returns:
646 A _DatasetModel type with the same generic parameters as this Dataset class.
647 """
648 input_type, output_type, metadata_type = cls._params()
649 return _DatasetModel[input_type, output_type, metadata_type]
651 @classmethod
652 def _infer_fmt(cls, path: Path, fmt: Literal['yaml', 'json'] | None) -> Literal['yaml', 'json']:
653 """Infer the format to use for a file based on its extension.
655 Args:
656 path: The path to infer the format for.
657 fmt: The explicitly provided format, if any.
659 Returns:
660 The inferred format ('yaml' or 'json').
662 Raises:
663 ValueError: If the format cannot be inferred from the file extension.
664 """
665 if fmt is not None:
666 return fmt
667 suffix = path.suffix.lower()
668 if suffix in {'.yaml', '.yml'}:
669 return 'yaml'
670 elif suffix == '.json':
671 return 'json'
672 raise ValueError(
673 f'Could not infer format for filename {path.name!r}. Use the `fmt` argument to specify the format.'
674 )
676 @model_serializer(mode='wrap')
677 def _add_json_schema(self, nxt: SerializerFunctionWrapHandler, info: SerializationInfo) -> dict[str, Any]:
678 """Add the JSON schema path to the serialized output.
680 See <https://github.com/json-schema-org/json-schema-spec/issues/828> for context, that seems to be the nearest
681 there is to a spec for this.
682 """
683 context = cast(Union[dict[str, Any], None], info.context)
684 if isinstance(context, dict) and (schema := context.get('$schema')):
685 return {'$schema': schema} | nxt(self)
686 else:
687 return nxt(self)
690def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '') -> Path: # pragma: no cover
691 """Get a relative path reference from source to target.
693 Recursively resolve a relative path to target from source, adding '..' as needed.
694 This is useful for creating a relative path reference from a source file to a target file.
696 Args:
697 target: The target path to reference.
698 source: The source path to reference from.
699 _prefix: Internal prefix used during recursion.
701 Returns:
702 A Path object representing the relative path from source to target.
704 Example:
705 If source is '/a/b/c.py' and target is '/a/d/e.py', the relative path reference
706 would be '../../d/e.py'.
707 """
708 # Recursively resolve a relative path to target from source, adding '..' as needed.
709 # This is useful for creating a relative path reference from a source file to a target file.
710 # For example, if source is '/a/b/c.py' and target is '/a/d/e.py', the relative path reference
711 # would be '../../d/e.py'.
712 if not target.is_absolute():
713 target = target.resolve()
714 try:
715 return Path(f'{_prefix}{Path(target).relative_to(source)}')
716 except ValueError:
717 return _get_relative_path_reference(target, source.parent, _prefix=f'{_prefix}../')
720@dataclass
721class _TaskRun:
722 """Internal class to track metrics and attributes for a task run."""
724 attributes: dict[str, Any] = field(init=False, default_factory=dict)
725 metrics: dict[str, int | float] = field(init=False, default_factory=dict)
727 def record_metric(self, name: str, value: int | float) -> None:
728 """Record a metric value.
730 Args:
731 name: The name of the metric.
732 value: The value of the metric.
733 """
734 self.metrics[name] = value
736 def increment_metric(self, name: str, amount: int | float) -> None:
737 """Increment a metric value.
739 Args:
740 name: The name of the metric.
741 amount: The amount to increment by.
743 Note:
744 If the current value is 0 and the increment amount is 0, no metric will be recorded.
745 """
746 current_value = self.metrics.get(name, 0)
747 incremented_value = current_value + amount
748 if current_value == 0 and incremented_value == 0:
749 return # Avoid recording a metric that is always zero
750 self.record_metric(name, incremented_value)
752 def record_attribute(self, name: str, value: Any) -> None:
753 """Record an attribute value.
755 Args:
756 name: The name of the attribute.
757 value: The value of the attribute.
758 """
759 self.attributes[name] = value
762async def _run_task(
763 task: Callable[[InputsT], Awaitable[OutputT]], case: Case[InputsT, OutputT, MetadataT]
764) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
765 """Run a task on a case and return the context for evaluators.
767 Args:
768 task: The task to run.
769 case: The case to run the task on.
771 Returns:
772 An EvaluatorContext containing the inputs, actual output, expected output, and metadata.
774 Raises:
775 Exception: Any exception raised by the task.
776 """
777 task_run = _TaskRun()
778 if _CURRENT_TASK_RUN.get() is not None: # pragma: no cover
779 raise RuntimeError('A task run has already been entered. Task runs should not be nested')
781 # Note: the current behavior is for task execution errors to just bubble up all the way and kill the evaluation.
782 # Should we handle them for the user in some way? If so, I guess we'd want to do that here.
783 token = _CURRENT_TASK_RUN.set(task_run)
784 try:
785 with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span:
786 with context_subtree() as span_tree:
787 t0 = time.time()
788 task_output = await task(case.inputs)
789 fallback_duration = time.time() - t0
790 finally:
791 _CURRENT_TASK_RUN.reset(token)
793 if isinstance(span_tree, SpanTree):  # coverage: 793 ↛ 811; line 793 didn't jump to line 811 because the condition on line 793 was always true
794 # TODO: Question: Should we make this metric-attributes functionality more user-configurable in some way before merging?
795 # Note: the use of otel for collecting these metrics is the main reason why I think we should require at least otel as a dependency, if not logfire;
796 # otherwise, we don't have a great way to get usage data from arbitrary frameworks.
797 # Ideally we wouldn't need to hard-code the specific logic here, but I'm not sure a great way to expose it to
798 # users. Maybe via an argument of type Callable[[SpanTree], dict[str, int | float]] or similar?
799 for node in span_tree.flattened():
800 if node.attributes.get('gen_ai.operation.name') == 'chat':
801 task_run.increment_metric('requests', 1)
802 for k, v in node.attributes.items():
803 if not isinstance(v, (int, float)):
804 continue
805 # TODO: Revisit this choice to strip the prefix..
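# e.g. 'gen_ai.usage.details.reasoning_tokens' -> 'reasoning_tokens',
#      'gen_ai.usage.input_tokens' -> 'input_tokens'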
806 if k.startswith('gen_ai.usage.details.'):
807 task_run.increment_metric(k[21:], v)
808 elif k.startswith('gen_ai.usage.'):
809 task_run.increment_metric(k[13:], v)
811 return EvaluatorContext[InputsT, OutputT, MetadataT](
812 name=case.name,
813 inputs=case.inputs,
814 metadata=case.metadata,
815 expected_output=case.expected_output,
816 output=task_output,
817 duration=_get_span_duration(task_span, fallback_duration),
818 _span_tree=span_tree,
819 attributes=task_run.attributes,
820 metrics=task_run.metrics,
821 )
824async def _run_task_and_evaluators(
825 task: Callable[[InputsT], Awaitable[OutputT]],
826 case: Case[InputsT, OutputT, MetadataT],
827 report_case_name: str,
828 dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
829) -> ReportCase:
830 """Run a task on a case and evaluate the results.
832 Args:
833 task: The task to run.
834 case: The case to run the task on.
835 report_case_name: The name to use for this case in the report.
836 dataset_evaluators: Evaluators from the dataset to apply to this case.
838 Returns:
839 A ReportCase containing the evaluation results.
840 """
841 with _logfire.span(
842 '{task_name}: {case_name}',
843 task_name=get_unwrapped_function_name(task),
844 case_name=case.name,
845 inputs=case.inputs,
846 metadata=case.metadata,
847 ) as case_span:
848 t0 = time.time()
849 scoring_context = await _run_task(task, case)
851 case_span.set_attribute('output', scoring_context.output)
852 case_span.set_attribute('task_duration', scoring_context.duration)
853 case_span.set_attribute('metrics', scoring_context.metrics)
854 case_span.set_attribute('attributes', scoring_context.attributes)
856 evaluators = case.evaluators + dataset_evaluators
857 evaluator_outputs: list[EvaluationResult] = []
858 if evaluators:
859 evaluator_outputs_by_task = await task_group_gather(
860 [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators]
861 )
862 evaluator_outputs += [out for outputs in evaluator_outputs_by_task for out in outputs]
864 assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs)
865 case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions))
866 case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores))
867 case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels))
869 context = case_span.context
870 if context is None: # pragma: no cover
871 trace_id = ''
872 span_id = ''
873 else:
874 trace_id = f'{context.trace_id:032x}'
875 span_id = f'{context.span_id:016x}'
876 fallback_duration = time.time() - t0
878 report_inputs = to_jsonable_python(case.inputs)
880 return ReportCase(
881 name=report_case_name,
882 inputs=report_inputs,
883 metadata=case.metadata,
884 expected_output=case.expected_output,
885 output=scoring_context.output,
886 metrics=scoring_context.metrics,
887 attributes=scoring_context.attributes,
888 scores=scores,
889 labels=labels,
890 assertions=assertions,
891 task_duration=scoring_context.duration,
892 total_duration=_get_span_duration(case_span, fallback_duration),
893 trace_id=trace_id,
894 span_id=span_id,
895 )
898_evaluation_results_adapter = TypeAdapter(Mapping[str, EvaluationResult])
901def _group_evaluator_outputs_by_type(
902 evaluation_results: Sequence[EvaluationResult],
903) -> tuple[
904 dict[str, EvaluationResult[bool]],
905 dict[str, EvaluationResult[int | float]],
906 dict[str, EvaluationResult[str]],
907]:
908 """Group evaluator outputs by their result type.
910 Args:
911 evaluation_results: Sequence of evaluation results to group.
913 Returns:
914 A tuple of dictionaries mapping evaluator names to their results, grouped by result type:
915 (success_evaluations, metric_evaluations, string_evaluations)
916 """
917 assertions: dict[str, EvaluationResult[bool]] = {}
918 scores: dict[str, EvaluationResult[int | float]] = {}
919 labels: dict[str, EvaluationResult[str]] = {}
920 seen_names = set[str]()
921 for er in evaluation_results:
922 name = er.name
923 # Dedupe repeated names by adding a numeric suffix
924 if name in seen_names:
925 suffix = 2
926 while f'{name}_{suffix}' in seen_names:
927 suffix += 1
928 name = f'{name}_{suffix}'
929 seen_names.add(name)
930 if assertion := er.downcast(bool):
931 assertions[name] = assertion
932 elif score := er.downcast(int, float):
933 scores[name] = score
934 elif label := er.downcast(str):  # coverage: 934 ↛ 921; line 934 didn't jump to line 921 because the condition on line 934 was always true
935 labels[name] = label
936 return assertions, scores, labels
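# For example (illustrative): a result valued True lands in `assertions`, one valued 0.85 in
# `scores`, and one valued 'positive' in `labels`; if two results share the name 'accuracy',
# the second is stored under 'accuracy_2'.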
939_CURRENT_TASK_RUN = ContextVar['_TaskRun | None']('_CURRENT_TASK_RUN', default=None)
942def set_eval_attribute(name: str, value: Any) -> None:
943 """Set an attribute on the current task run.
945 Args:
946 name: The name of the attribute.
947 value: The value of the attribute.
948 """
949 current_case = _CURRENT_TASK_RUN.get()
950 if current_case is not None:  # coverage: 950 ↛ exit; line 950 didn't return from function 'set_eval_attribute' because the condition on line 950 was always true
951 current_case.record_attribute(name, value)
954def increment_eval_metric(name: str, amount: int | float) -> None:
955 """Increment a metric on the current task run.
957 Args:
958 name: The name of the metric.
959 amount: The amount to increment by.
960 """
961 current_case = _CURRENT_TASK_RUN.get()
962 if current_case is not None:  # coverage: 962 ↛ exit; line 962 didn't return from function 'increment_eval_metric' because the condition on line 962 was always true
963 current_case.increment_metric(name, amount)
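# Example sketch: recording custom attributes and metrics from inside a task while it is being
# evaluated; the attribute and metric names below are illustrative.
#
#     async def my_task(inputs: dict) -> str:
#         set_eval_attribute('variant', 'baseline')
#         increment_eval_metric('custom_requests', 1)
#         return inputs['text'].upper()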
966def _get_span_duration(span: logfire_api.LogfireSpan, fallback: float) -> float:
967 """Calculate the duration of a span in seconds.
969 We prefer to obtain the duration from a span for the sake of consistency with observability and to make
970 the values more reliable during testing. However, if the span is not available (e.g. when using logfire_api
971 without logfire installed), we fall back to the provided duration.
973 Args:
974 span: The span to calculate the duration for.
975 fallback: The fallback duration to use if unable to obtain the duration from the span.
977 Returns:
978 The duration of the span in seconds.
979 """
980 try:
981 return (span.end_time - span.start_time) / 1_000_000_000 # type: ignore
982 except (AttributeError, TypeError): # pragma: no cover
983 return fallback
986def _get_registry(
987 custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]],
988) -> Mapping[str, type[Evaluator[InputsT, OutputT, MetadataT]]]:
989 """Create a registry of evaluator types from default and custom evaluators.
991 Args:
992 custom_evaluator_types: Additional evaluator classes to include in the registry.
994 Returns:
995 A mapping from evaluator names to evaluator classes.
996 """
997 registry: dict[str, type[Evaluator[InputsT, OutputT, MetadataT]]] = {}
999 for evaluator_class in custom_evaluator_types:
1000 if not issubclass(evaluator_class, Evaluator):
1001 raise ValueError(
1002 f'All custom evaluator classes must be subclasses of Evaluator, but {evaluator_class} is not'
1003 )
1004 if '__dataclass_fields__' not in evaluator_class.__dict__:
1005 raise ValueError(
1006 f'All custom evaluator classes must be decorated with `@dataclass`, but {evaluator_class} is not'
1007 )
1008 name = evaluator_class.name()
1009 if name in registry:
1010 raise ValueError(f'Duplicate evaluator class name: {name!r}')
1011 registry[name] = evaluator_class
1013 for evaluator_class in DEFAULT_EVALUATORS:
1014 # Allow custom evaluators to override the default evaluators without raising an error
1015 registry.setdefault(evaluator_class.name(), evaluator_class)
1017 return registry
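# Example sketch: a custom evaluator that passes the registry checks above (an Evaluator
# subclass decorated with @dataclass); it is registered under its name, assumed here to default
# to the class name.
#
#     from dataclasses import dataclass
#     from pydantic_evals.evaluators import Evaluator, EvaluatorContext
#
#     @dataclass
#     class ContainsDigit(Evaluator):
#         def evaluate(self, ctx: EvaluatorContext) -> bool:
#             return any(ch.isdigit() for ch in str(ctx.output))
#
#     registry = _get_registry([ContainsDigit])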
1020def _load_evaluator_from_registry(
1021 registry: Mapping[str, type[Evaluator[InputsT, OutputT, MetadataT]]],
1022 case_name: str | None,
1023 spec: EvaluatorSpec,
1024) -> Evaluator[InputsT, OutputT, MetadataT]:
1025 """Load an evaluator from the registry based on a specification.
1027 Args:
1028 registry: Mapping from evaluator names to evaluator classes.
1029 case_name: Name of the case this evaluator will be used for, or None for dataset-level evaluators.
1030 spec: Specification of the evaluator to load.
1032 Returns:
1033 An initialized evaluator instance.
1035 Raises:
1036 ValueError: If the evaluator name is not found in the registry.
1037 """
1038 evaluator_class = registry.get(spec.name)
1039 if evaluator_class is None:
1040 raise ValueError(
1041 f'Evaluator {spec.name!r} is not in the provided registry. Registered choices: {list(registry.keys())}'
1042 )
1043 try:
1044 return evaluator_class(*spec.args, **spec.kwargs)
1045 except Exception as e:
1046 case_detail = f'case {case_name!r}' if case_name is not None else 'dataset'
1047 raise ValueError(f'Failed to instantiate evaluator {spec.name!r} for {case_detail}: {e}') from e