Eval runs #4

Open · wants to merge 4 commits into base: main
92 changes: 92 additions & 0 deletions tutorials/rag/cicd/cicd_setup.py
@@ -0,0 +1,92 @@
"""Set up evaluate_rag_cicd.py.

Uploads the Dataset and Evaluators.
Note that while the Evaluators are similar to the example Evaluators created
automatically when you signed up, these Evaluators contain an additional step
processing the model's output.
"""

from contextlib import contextmanager
import os
import pandas as pd
from pathlib import Path
from humanloop import Humanloop, UnprocessableEntityError

# Create a Humanloop client
humanloop = Humanloop(
    api_key=os.getenv("HUMANLOOP_KEY"), base_url=os.getenv("HUMANLOOP_BASE_URL")
)

assets_folder = Path(__file__).parents[3].resolve() / "assets"


def upload_dataset():
    df = pd.read_json(assets_folder / "datapoints.jsonl", lines=True)

    datapoints = [row.to_dict() for _i, row in df.iterrows()][0:20]
    with ignore_already_committed():
        humanloop.datasets.upsert(
            path="evals_demo/medqa-small",
            datapoints=datapoints,
            commit_message=f"Added {len(datapoints)} datapoints from MedQA test dataset.",
        )


def upload_evaluators():
    # Upload Code Evaluators
    for evaluator_name, return_type in [
        ("exact_match", "boolean"),
        ("levenshtein", "number"),
    ]:
        with open(assets_folder / f"evaluators/{evaluator_name}.py", "r") as f:
            code = f.read()

        with ignore_already_committed():
            humanloop.evaluators.upsert(
                path=f"evals_demo/{evaluator_name}",
                spec={
                    "evaluator_type": "python",
                    "arguments_type": "target_required",
                    "return_type": return_type,
                    "code": code,
                },
                commit_message=f"New version from {evaluator_name}.py",
            )

    # Upload an LLM Evaluator
    with ignore_already_committed():
        humanloop.evaluators.upsert(
            path="evals_demo/reasoning",
            spec={
                "evaluator_type": "llm",
                "arguments_type": "target_free",
                "return_type": "boolean",
                "prompt": {
                    "model": "gpt-4o",
                    "endpoint": "complete",
                    "temperature": 0,
                    "template": 'An answer is shown below. The answer contains 3 sections, separated by "---". The first section is the final answer. The second section is an explanation. The third section is a citation.\n\nEvaluate if the final answer follows from the citation and the reasoning in the explanation section. Give a brief explanation/discussion. Do not make your judgment based on factuality, but purely based on the logic presented.\nOn a new line, give a final verdict of "True" or "False".\n\nAnswer:\n{{log.output}}',
                },
            },
            commit_message="Initial reasoning evaluator.",
        )


@contextmanager
def ignore_already_committed():
    """Context manager to ignore the error where a version has already been committed."""
    try:
        yield
    except UnprocessableEntityError as e:
        try:
            if "already been committed" in e.body.detail["description"]:
                return
        except Exception:
            pass
        raise e


if __name__ == "__main__":
    upload_dataset()
    upload_evaluators()
    print("Datasets and Evaluators uploaded.")
62 changes: 21 additions & 41 deletions tutorials/rag/cicd/utils.py
@@ -1,7 +1,6 @@
""" Util methods for formatting and checking Evaluation results."""
"""Util methods for formatting and checking Evaluation results."""

from prettytable import PrettyTable
from humanloop import EvaluationResponse, EvaluationStats, VersionStatsResponse
from humanloop import EvaluationResponse, EvaluationStats, RunStatsResponse
from humanloop import BooleanEvaluatorStatsResponse as BooleanStats
from humanloop import NumericEvaluatorStatsResponse as NumericStats

@@ -40,35 +39,18 @@ def get_score_from_evaluator_stat(stat: NumericStats | BooleanStats) -> float |
     return score


-def get_sorted_version_stats(
-    stats: EvaluationStats,
-    evaluation: EvaluationResponse
-) -> list[VersionStatsResponse]:
-    """Sort the VersionStats by created_at."""
-    versions_by_id = {
-        evaluatee.version["version_id"]: evaluatee
-        for evaluatee in evaluation.evaluatees
-    }
-    sorted_stats = sorted(
-        stats.version_stats,
-        key=lambda x: versions_by_id[x.version_id].version["created_at"]
-    )
-    if len(sorted_stats) == 0:
-        raise ValueError("No VersionStats found.")
-
-    return sorted_stats
-
-
 def get_evaluator_stats_by_path(
-    stat: VersionStatsResponse, evaluation: EvaluationResponse
+    stat: RunStatsResponse, evaluation: EvaluationResponse
 ) -> dict[str, NumericStats | BooleanStats]:
     """Get the Evaluator stats by path."""
     evaluators_by_id = {
         evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
     }
     evaluator_stats_by_path = {
-        evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
-        for evaluator_stat in stat.evaluator_version_stats
+        evaluators_by_id[
+            evaluator_stat.evaluator_version_id
+        ].version.path: evaluator_stat
+        for evaluator_stat in stat.evaluator_stats
     }
     return evaluator_stats_by_path

@@ -81,15 +63,16 @@ def check_evaluation_threshold(
 ) -> bool:
     """Checks if the latest version has an average Evaluator result above a threshold."""

-    version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation)
+    latest_run_stats = stats.run_stats[0]
     evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=version_stats[-1],
-        evaluation=evaluation
+        stat=latest_run_stats, evaluation=evaluation
     )

     if evaluator_path in evaluator_stats_by_path:
         evaluator_stat = evaluator_stats_by_path[evaluator_path]
         score = get_score_from_evaluator_stat(stat=evaluator_stat)
+        if score is None:
+            raise ValueError(f"Score not found for evaluator {evaluator_path}.")
         if score >= threshold:
             print(
                 f"{GREEN}✅ Latest eval [{score}] above threshold [{threshold}] for evaluator {evaluator_path}.{RESET}"
@@ -105,28 +88,25 @@ def check_evaluation_improvement(


 def check_evaluation_improvement(
-    evaluation: EvaluationResponse,
-    evaluator_path: str,
-    stats: EvaluationStats
+    evaluation: EvaluationResponse, evaluator_path: str, stats: EvaluationStats
 ) -> bool:
     """Check the latest version has improved across for a specific Evaluator."""

-    version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation)
+    latest_run_stats = stats.run_stats[0]
     latest_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=version_stats[-1],
-        evaluation=evaluation
+        stat=latest_run_stats, evaluation=evaluation
     )
-    if len(version_stats) == 1:
-        print(
-            f"{YELLOW}⚠️ No previous versions to compare with.{RESET}"
-        )
+    if len(stats.run_stats) == 1:
+        print(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
         return True

     previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
-        stat=version_stats[-2],
-        evaluation=evaluation
+        stat=stats.run_stats[-2], evaluation=evaluation
     )
-    if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
+    if (
+        evaluator_path in latest_evaluator_stats_by_path
+        and evaluator_path in previous_evaluator_stats_by_path
+    ):
         latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
         previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
         latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
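Taken together, these helpers are meant to gate a CI job on the Evaluation results. A minimal sketch of how a CI step might wire them up follows; the evaluations.get / evaluations.get_stats calls, the EVALUATION_ID environment variable, and the keyword-argument order are assumptions about the surrounding CI script rather than code from this PR.

# Hypothetical CI gate built on the helpers above -- a sketch, not code from this PR.
import os
import sys

from humanloop import Humanloop

from utils import check_evaluation_improvement, check_evaluation_threshold

humanloop = Humanloop(api_key=os.getenv("HUMANLOOP_KEY"))

# Assumed: an earlier CI step exports the ID of the Evaluation it created.
evaluation_id = os.environ["EVALUATION_ID"]
evaluation = humanloop.evaluations.get(id=evaluation_id)
stats = humanloop.evaluations.get_stats(id=evaluation_id)

threshold_ok = check_evaluation_threshold(
    evaluation=evaluation,
    stats=stats,
    evaluator_path="evals_demo/exact_match",
    threshold=0.5,
)
improved = check_evaluation_improvement(
    evaluation=evaluation,
    evaluator_path="evals_demo/levenshtein",
    stats=stats,
)

# Fail the CI job if either check does not pass.
sys.exit(0 if (threshold_ok and improved) else 1)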
24 changes: 19 additions & 5 deletions tutorials/rag/evaluate-rag-flow.ipynb
@@ -90,7 +90,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[34mInstalling dependencies from lock file\u001B[39m\n",
"\u001b[34mInstalling dependencies from lock file\u001b[39m\n",
"\n",
"No dependencies to install or update\n"
]
@@ -552,6 +552,14 @@
"Now we can start to trigger Evaluations on Humanloop using our Dataset and Evaluators:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09ccddff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
@@ -574,14 +582,20 @@
"evaluation = humanloop.evaluations.create(\n",
" name=\"Demo evals 2\",\n",
" file={\"path\":\"evals_demo/medqa-flow\"},\n",
" dataset={\"path\": \"evals_demo/medqa-test\"},\n",
" evaluators=[\n",
" {\"path\": \"evals_demo/exact_match\"},\n",
" {\"path\": \"evals_demo/levenshtein\"},\n",
" {\"path\": \"evals_demo/reasoning\"},\n",
" ],\n",
")\n",
"print(f\"Evaluation created: {evaluation.id}\")\n"
"print(f\"Evaluation created: {evaluation.id}\")\n",
"\n",
"run = humanloop.evaluations.create_run(\n",
" id=evaluation.id,\n",
" dataset={\"path\": \"evals_demo/medqa-test\"},\n",
")\n",
"print(f\"Run created: {run.id}\")\n",
"\n"
]
},
{
@@ -602,7 +616,7 @@
"def populate_evaluation():\n",
" \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n",
" retrieved_dataset = humanloop.datasets.get(\n",
" id=evaluation.dataset.id,\n",
" id=run.dataset.id,\n",
" include_datapoints=True,\n",
" )\n",
" for datapoint in tqdm(retrieved_dataset.datapoints):\n",
@@ -633,7 +647,7 @@
" trace_status=\"complete\",\n",
" # NB: New arguments to link to Evaluation and Dataset\n",
" source_datapoint_id=datapoint.id,\n",
" evaluation_id=evaluation.id,\n",
" run_id=run.id,\n",
")\n"
]
},