Coverage for pydantic_evals/pydantic_evals/evaluators/evaluator.py: 99.24%

101 statements  

coverage.py v7.6.12, created at 2025-03-28 17:27 +0000

from __future__ import annotations

import inspect
from abc import ABCMeta, abstractmethod
from collections.abc import Awaitable, Mapping
from dataclasses import MISSING, dataclass, fields
from typing import Any, Generic, Union, cast

from pydantic import (
    ConfigDict,
    TypeAdapter,
    ValidationError,
    model_serializer,
)
from pydantic_core import to_jsonable_python
from pydantic_core.core_schema import SerializationInfo
from typing_extensions import TypeVar

from .._utils import get_event_loop
from ._spec import EvaluatorSpec
from .context import EvaluatorContext

EvaluationScalar = Union[bool, int, float, str]
"""The most primitive type of output allowed from an Evaluator.

`int` and `float` are treated as scores, `str` as labels, and `bool` as assertions.
"""


@dataclass
class EvaluationReason:
    """The result of running an evaluator with an optional explanation.

    Contains a scalar value and an optional "reason" explaining the value.

    Args:
        value: The scalar result of the evaluation (boolean, integer, float, or string).
        reason: An optional explanation of the evaluation result.
    """

    value: EvaluationScalar
    reason: str | None = None


EvaluatorOutput = Union[EvaluationScalar, EvaluationReason, Mapping[str, Union[EvaluationScalar, EvaluationReason]]]
"""Type for the output of an evaluator, which can be a scalar, an EvaluationReason, or a mapping of names to either."""

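# --- Illustrative sketch (editor's addition, not part of the original module) ---
# The three shapes an evaluator's output may take: a bare scalar, an EvaluationReason that
# attaches an explanation, or a mapping that yields several named results at once. As
# `run_evaluator` below shows, a bare scalar ends up as a single result named after the evaluator.
_EXAMPLE_SCALAR_OUTPUT: EvaluatorOutput = True
_EXAMPLE_REASON_OUTPUT: EvaluatorOutput = EvaluationReason(value=0.75, reason='3 of 4 checks passed')
_EXAMPLE_MAPPING_OUTPUT: EvaluatorOutput = {
    'accuracy': 0.75,
    'passed': EvaluationReason(value=True, reason='above the 0.5 threshold'),
}
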

# TODO(DavidM): Add bound=EvaluationScalar to the following typevar after we upgrade to pydantic 2.11
EvaluationScalarT = TypeVar('EvaluationScalarT', default=EvaluationScalar, covariant=True)
"""Type variable for the scalar result type of an evaluation."""

T = TypeVar('T')


@dataclass
class EvaluationResult(Generic[EvaluationScalarT]):
    """The details of an individual evaluation result.

    Contains the name, value, reason, and source evaluator for a single evaluation.

    Args:
        name: The name of the evaluation.
        value: The scalar result of the evaluation.
        reason: An optional explanation of the evaluation result.
        source: The evaluator that produced this result.
    """

    name: str
    value: EvaluationScalarT
    reason: str | None
    source: Evaluator

    def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
        """Attempt to downcast this result to a more specific type.

        Args:
            *value_types: The types to check the value against.

        Returns:
            A downcast version of this result if the value is an instance of one of the given types,
            otherwise None.
        """
        # Check if value matches any of the target types, handling bool as a special case
        for value_type in value_types:
            if isinstance(self.value, value_type):
                # Only match bool with explicit bool type
                if isinstance(self.value, bool) and value_type is not bool:
                    continue
                return cast(EvaluationResult[T], self)
        return None

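# --- Illustrative sketch (editor's addition, not part of the original module) ---
# `downcast` lets callers split results by value type; note that a bool value only matches an
# explicit `bool` target, so it is never picked up by `downcast(int, float)` despite `bool`
# being a subclass of `int`. Assumes `results` were produced elsewhere.
def _split_results_example(results: list[EvaluationResult]) -> None:
    for result in results:
        if assertion := result.downcast(bool):
            print(f'assertion {assertion.name}: {assertion.value}')
        elif score := result.downcast(int, float):
            print(f'score {score.name}: {score.value}')
        elif label := result.downcast(str):
            print(f'label {label.name}: {label.value}')
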

# Evaluators are contravariant in all of their type parameters.
InputsT = TypeVar('InputsT', default=Any, contravariant=True)
"""Type variable for the inputs type of the task being evaluated."""

OutputT = TypeVar('OutputT', default=Any, contravariant=True)
"""Type variable for the output type of the task being evaluated."""

MetadataT = TypeVar('MetadataT', default=Any, contravariant=True)
"""Type variable for the metadata type of the task being evaluated."""


class _StrictABCMeta(ABCMeta):
    """An ABC-like metaclass that goes further and disallows even defining abstract subclasses."""

    def __new__(mcls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], /, **kwargs: Any):
        result = super().__new__(mcls, name, bases, namespace, **kwargs)
        # Check if this class is a proper subclass of a class created by _StrictABCMeta
        is_proper_subclass = any(isinstance(c, _StrictABCMeta) for c in result.__mro__[1:])
        if is_proper_subclass and result.__abstractmethods__:
            abstractmethods = ', '.join([f'{m!r}' for m in result.__abstractmethods__])
            raise TypeError(f'{name} must implement all abstract methods: {abstractmethods}')
        return result

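# --- Illustrative sketch (editor's addition, not part of the original module) ---
# _StrictABCMeta fails fast: a proper subclass that leaves abstract methods unimplemented raises
# at class-definition time, instead of at instantiation time as with a plain ABC.
def _strict_abc_example() -> None:
    class _Base(metaclass=_StrictABCMeta):
        @abstractmethod
        def run(self) -> None: ...

    try:
        class _Incomplete(_Base):  # does not implement `run`
            pass
    except TypeError as exc:
        print(exc)  # _Incomplete must implement all abstract methods: 'run'
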

@dataclass
class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
    """Base class for all evaluators.

    Evaluators can assess the performance of a task in a variety of ways, as a function of the EvaluatorContext.

    Subclasses must implement the `evaluate` method. Note it can be defined with either `def` or `async def`.

    Example:
    ```python
    @dataclass
    class ExactMatch(Evaluator[Any, Any, Any]):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return ctx.actual_output == ctx.expected_output
    ```
    """

    __pydantic_config__ = ConfigDict(arbitrary_types_allowed=True)

    @classmethod
    def name(cls) -> str:
        """Return the 'name' of this Evaluator to use during serialization.

        Returns:
            The name of the Evaluator, which is typically the class name.
        """
        # Note: if we wanted to prefer snake_case, we could use:
        # from pydantic.alias_generators import to_snake
        # return to_snake(cls.__name__)
        return cls.__name__

    @abstractmethod
    def evaluate(
        self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
    ) -> EvaluatorOutput | Awaitable[EvaluatorOutput]:  # pragma: no cover
        """Evaluate the task output in the given context.

        This is the main evaluation method that subclasses must implement. It can be either synchronous
        or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those. Can be returned either synchronously or as an
            awaitable for asynchronous evaluation.
        """
        raise NotImplementedError('You must implement `evaluate`.')

    def evaluate_sync(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
        """Run the evaluator synchronously, handling both sync and async implementations.

        This method ensures synchronous execution by running any async evaluate implementation
        to completion using run_until_complete.

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those.
        """
        output = self.evaluate(ctx)
        if inspect.iscoroutine(output):  # pragma: no cover
            return get_event_loop().run_until_complete(output)
        else:
            return cast(EvaluatorOutput, output)

    async def evaluate_async(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
        """Run the evaluator asynchronously, handling both sync and async implementations.

        This method ensures asynchronous execution by properly awaiting any async evaluate
        implementation. For synchronous implementations, it returns the result directly.

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those.
        """
        # Note: if self.evaluate is synchronous but you need to prevent this from blocking, override this method with:
        # return await anyio.to_thread.run_sync(self.evaluate, ctx)
        output = self.evaluate(ctx)
        if inspect.iscoroutine(output):
            return await output
        else:
            return cast(EvaluatorOutput, output)

    @model_serializer(mode='plain')
    def serialize(self, info: SerializationInfo) -> Any:
        """Serialize this Evaluator to a JSON-serializable form.

        Returns:
            A JSON-serializable representation of this evaluator as an EvaluatorSpec.
        """
        raw_arguments: dict[str, Any] = {}
        for field in fields(self):
            value = getattr(self, field.name)
            # always exclude defaults:
            if field.default is not MISSING:
                if value == field.default:
                    continue
            if field.default_factory is not MISSING:
                # coverage.py notes this condition was always true in the test run, so the
                # fall-through to the assignment below was never taken from this branch.
                if value == field.default_factory():
                    continue
            raw_arguments[field.name] = value

        arguments: None | tuple[Any,] | dict[str, Any]
        if len(raw_arguments) == 0:
            arguments = None
        elif len(raw_arguments) == 1:
            arguments = (next(iter(raw_arguments.values())),)
        else:
            arguments = raw_arguments
        return to_jsonable_python(EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context)

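# --- Illustrative sketch (editor's addition, not part of the original module) ---
# A concrete evaluator with an async `evaluate` that returns a mapping mixing a plain score with
# an EvaluationReason assertion. Per `serialize` above, a default `threshold` is omitted from the
# serialized EvaluatorSpec, while e.g. threshold=0.8 would serialize as the single argument (0.8,).
@dataclass
class _ThresholdCheckExample(Evaluator[Any, Any, Any]):
    threshold: float = 0.5

    async def evaluate(self, ctx: EvaluatorContext[Any, Any, Any]) -> EvaluatorOutput:
        score = 1.0 if ctx.actual_output == ctx.expected_output else 0.0
        return {
            'score': score,
            'passed': EvaluationReason(value=score >= self.threshold, reason=f'threshold={self.threshold}'),
        }
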

async def run_evaluator(
    evaluator: Evaluator[InputsT, OutputT, MetadataT], ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
) -> list[EvaluationResult]:
    """Run an evaluator and return the results.

    This function runs an evaluator on the given context and processes the results into
    a standardized format.

    Args:
        evaluator: The evaluator to run.
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        A list of evaluation results.

    Raises:
        ValueError: If the evaluator returns a value of an invalid type.
    """
    raw_results = await evaluator.evaluate_async(ctx)

    try:
        results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
    except ValidationError as e:
        raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e

    results = _convert_to_mapping(results, scalar_name=evaluator.name())

    details: list[EvaluationResult] = []
    for name, result in results.items():
        if not isinstance(result, EvaluationReason):
            result = EvaluationReason(value=result)
        details.append(EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator))

    return details


_EVALUATOR_OUTPUT_ADAPTER = TypeAdapter[EvaluatorOutput](EvaluatorOutput)


def _convert_to_mapping(
    result: EvaluatorOutput, *, scalar_name: str
) -> Mapping[str, EvaluationScalar | EvaluationReason]:
    """Convert an evaluator output to a mapping from names to scalar values or evaluation reasons.

    Args:
        result: The evaluator output to convert.
        scalar_name: The name to use for a scalar result.

    Returns:
        A mapping from names to scalar values or evaluation reasons.
    """
    if isinstance(result, Mapping):
        return result
    return {scalar_name: result}

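# --- Illustrative sketch (editor's addition, not part of the original module) ---
# How a bare scalar gets named: _convert_to_mapping leaves mappings untouched and wraps anything
# else under the evaluator's name, which is how a plain `return True` from `evaluate` becomes a
# single named assertion in `run_evaluator`.
def _convert_to_mapping_example() -> None:
    assert _convert_to_mapping(True, scalar_name='ExactMatch') == {'ExactMatch': True}
    assert _convert_to_mapping({'score': 0.9}, scalar_name='ExactMatch') == {'score': 0.9}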