Eval runs #4

Open · wants to merge 4 commits into base: main
92 changes: 92 additions & 0 deletions tutorials/rag/cicd/cicd_setup.py
@@ -0,0 +1,92 @@
"""Set up evaluate_rag_cicd.py.

Uploads the Dataset and Evaluators.
Note that while the Evaluators are similar to the example Evaluators created
automatically when you signed up, these Evaluators contain an additional step
processing the model's output.
"""

from contextlib import contextmanager
import os
import pandas as pd
from pathlib import Path
from humanloop import Humanloop, UnprocessableEntityError

# Create a Humanloop client
humanloop = Humanloop(
    api_key=os.getenv("HUMANLOOP_KEY"), base_url=os.getenv("HUMANLOOP_BASE_URL")
)

assets_folder = Path(__file__).parents[3].resolve() / "assets"


def upload_dataset():
    df = pd.read_json(assets_folder / "datapoints.jsonl", lines=True)

    datapoints = [row.to_dict() for _i, row in df.iterrows()][0:20]
    with ignore_already_committed():
        humanloop.datasets.upsert(
            path="evals_demo/medqa-small",
            datapoints=datapoints,
            commit_message=f"Added {len(datapoints)} datapoints from MedQA test dataset.",
        )


def upload_evaluators():
    # Upload Code Evaluators
    for evaluator_name, return_type in [
        ("exact_match", "boolean"),
        ("levenshtein", "number"),
    ]:
        with open(assets_folder / f"evaluators/{evaluator_name}.py", "r") as f:
            code = f.read()

        with ignore_already_committed():
            humanloop.evaluators.upsert(
                path=f"evals_demo/{evaluator_name}",
                spec={
                    "evaluator_type": "python",
                    "arguments_type": "target_required",
                    "return_type": return_type,
                    "code": code,
                },
                commit_message=f"New version from {evaluator_name}.py",
            )

    # Upload an LLM Evaluator
    with ignore_already_committed():
        humanloop.evaluators.upsert(
            path="evals_demo/reasoning",
            spec={
                "evaluator_type": "llm",
                "arguments_type": "target_free",
                "return_type": "boolean",
                "prompt": {
                    "model": "gpt-4o",
                    "endpoint": "complete",
                    "temperature": 0,
                    "template": 'An answer is shown below. The answer contains 3 sections, separated by "---". The first section is the final answer. The second section is an explanation. The third section is a citation.\n\nEvaluate if the final answer follows from the citation and the reasoning in the explanation section. Give a brief explanation/discussion. Do not make your judgment based on factuality, but purely based on the logic presented.\nOn a new line, give a final verdict of "True" or "False".\n\nAnswer:\n{{log.output}}',
                },
            },
            commit_message="Initial reasoning evaluator.",
        )


@contextmanager
def ignore_already_committed():
    """Context manager to ignore the error where a version has already been committed."""
    try:
        yield
    except UnprocessableEntityError as e:
        try:
            if "already been committed" in e.body.detail["description"]:
                return
        except Exception:
            pass
        raise e


if __name__ == "__main__":
    upload_dataset()
    upload_evaluators()
    print("Datasets and Evaluators uploaded.")
62 changes: 21 additions & 41 deletions tutorials/rag/cicd/utils.py
@@ -1,7 +1,6 @@
""" Util methods for formatting and checking Evaluation results."""
"""Util methods for formatting and checking Evaluation results."""

from prettytable import PrettyTable
from humanloop import EvaluationResponse, EvaluationStats, VersionStatsResponse
from humanloop import EvaluationResponse, EvaluationStats, RunStatsResponse
from humanloop import BooleanEvaluatorStatsResponse as BooleanStats
from humanloop import NumericEvaluatorStatsResponse as NumericStats

@@ -40,35 +39,18 @@ def get_score_from_evaluator_stat(stat: NumericStats | BooleanStats) -> float |
     return score


-def get_sorted_version_stats(
-    stats: EvaluationStats,
-    evaluation: EvaluationResponse
-) -> list[VersionStatsResponse]:
-    """Sort the VersionStats by created_at."""
-    versions_by_id = {
-        evaluatee.version["version_id"]: evaluatee
-        for evaluatee in evaluation.evaluatees
-    }
-    sorted_stats = sorted(
-        stats.version_stats,
-        key=lambda x: versions_by_id[x.version_id].version["created_at"]
-    )
-    if len(sorted_stats) == 0:
-        raise ValueError("No VersionStats found.")
-
-    return sorted_stats
-
-
 def get_evaluator_stats_by_path(
-    stat: VersionStatsResponse, evaluation: EvaluationResponse
+    stat: RunStatsResponse, evaluation: EvaluationResponse
 ) -> dict[str, NumericStats | BooleanStats]:
     """Get the Evaluator stats by path."""
     evaluators_by_id = {
         evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
     }
     evaluator_stats_by_path = {
-        evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
-        for evaluator_stat in stat.evaluator_version_stats
+        evaluators_by_id[
+            evaluator_stat.evaluator_version_id
+        ].version.path: evaluator_stat
+        for evaluator_stat in stat.evaluator_stats
     }
     return evaluator_stats_by_path

@@ -81,15 +63,16 @@ def check_evaluation_threshold(
 ) -> bool:
     """Checks if the latest version has an average Evaluator result above a threshold."""

-    version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation)
+    latest_run_stats = stats.run_stats[0]
     evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=version_stats[-1],
-        evaluation=evaluation
+        stat=latest_run_stats, evaluation=evaluation
     )

     if evaluator_path in evaluator_stats_by_path:
         evaluator_stat = evaluator_stats_by_path[evaluator_path]
         score = get_score_from_evaluator_stat(stat=evaluator_stat)
+        if score is None:
+            raise ValueError(f"Score not found for evaluator {evaluator_path}.")
         if score >= threshold:
             print(
                 f"{GREEN}✅ Latest eval [{score}] above threshold [{threshold}] for evaluator {evaluator_path}.{RESET}"
@@ -105,28 +88,25 @@ def check_evaluation_improvement(


 def check_evaluation_improvement(
-    evaluation: EvaluationResponse,
-    evaluator_path: str,
-    stats: EvaluationStats
+    evaluation: EvaluationResponse, evaluator_path: str, stats: EvaluationStats
 ) -> bool:
     """Check the latest version has improved across for a specific Evaluator."""

-    version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation)
+    latest_run_stats = stats.run_stats[0]
     latest_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=version_stats[-1],
-        evaluation=evaluation
+        stat=latest_run_stats, evaluation=evaluation
     )
-    if len(version_stats) == 1:
-        print(
-            f"{YELLOW}⚠️ No previous versions to compare with.{RESET}"
-        )
+    if len(stats.run_stats) == 1:
+        print(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
         return True

     previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=version_stats[-2],
-        evaluation=evaluation
+        stat=stats.run_stats[-2], evaluation=evaluation
     )
-    if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
+    if (
+        evaluator_path in latest_evaluator_stats_by_path
+        and evaluator_path in previous_evaluator_stats_by_path
+    ):
         latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
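Taken together, these helpers are meant to gate a CI job on the Evaluation results. A minimal sketch of how a CI step might wire them up follows; the evaluations.get / evaluations.get_stats calls, the EVALUATION_ID environment variable, and the keyword-argument order are assumptions about the surrounding CI script rather than code from this PR.

# Hypothetical CI gate built on the helpers above -- a sketch, not code from this PR.
import os
import sys

from humanloop import Humanloop

from utils import check_evaluation_improvement, check_evaluation_threshold

humanloop = Humanloop(api_key=os.getenv("HUMANLOOP_KEY"))

# Assumed: an earlier CI step exports the ID of the Evaluation it created.
evaluation_id = os.environ["EVALUATION_ID"]
evaluation = humanloop.evaluations.get(id=evaluation_id)
stats = humanloop.evaluations.get_stats(id=evaluation_id)

threshold_ok = check_evaluation_threshold(
    evaluation=evaluation,
    stats=stats,
    evaluator_path="evals_demo/exact_match",
    threshold=0.5,
)
improved = check_evaluation_improvement(
    evaluation=evaluation,
    evaluator_path="evals_demo/levenshtein",
    stats=stats,
)

# Fail the CI job if either check does not pass.
sys.exit(0 if (threshold_ok and improved) else 1)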
24 changes: 19 additions & 5 deletions tutorials/rag/evaluate-rag-flow.ipynb
@@ -90,7 +90,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[34mInstalling dependencies from lock file\u001B[39m\n",
"\u001b[34mInstalling dependencies from lock file\u001b[39m\n",
"\n",
"No dependencies to install or update\n"
]
@@ -552,6 +552,14 @@
"Now we can start to trigger Evaluations on Humanloop using our Dataset and Evaluators:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09ccddff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
@@ -574,14 +582,20 @@
"evaluation = humanloop.evaluations.create(\n",
" name=\"Demo evals 2\",\n",
" file={\"path\":\"evals_demo/medqa-flow\"},\n",
" dataset={\"path\": \"evals_demo/medqa-test\"},\n",
" evaluators=[\n",
" {\"path\": \"evals_demo/exact_match\"},\n",
" {\"path\": \"evals_demo/levenshtein\"},\n",
" {\"path\": \"evals_demo/reasoning\"},\n",
" ],\n",
")\n",
"print(f\"Evaluation created: {evaluation.id}\")\n"
"print(f\"Evaluation created: {evaluation.id}\")\n",
"\n",
"run = humanloop.evaluations.create_run(\n",
" id=evaluation.id,\n",
" dataset={\"path\": \"evals_demo/medqa-test\"},\n",
")\n",
"print(f\"Run created: {run.id}\")\n",
"\n"
]
},
{
@@ -602,7 +616,7 @@
"def populate_evaluation():\n",
" \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n",
" retrieved_dataset = humanloop.datasets.get(\n",
" id=evaluation.dataset.id,\n",
" id=run.dataset.id,\n",
" include_datapoints=True,\n",
" )\n",
" for datapoint in tqdm(retrieved_dataset.datapoints):\n",
@@ -633,7 +647,7 @@
" trace_status=\"complete\",\n",
" # NB: New arguments to link to Evaluation and Dataset\n",
" source_datapoint_id=datapoint.id,\n",
" evaluation_id=evaluation.id,\n",
" run_id=run.id,\n",
")\n"
]
},