From 3792d76047ac2c513db162e5dc1b633cd9a40edf Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 21:51:53 +0000 Subject: [PATCH 1/4] update evaluate_rag_cicd for runs --- tutorials/rag/evaluate_rag_cicd.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tutorials/rag/evaluate_rag_cicd.py b/tutorials/rag/evaluate_rag_cicd.py index 1ea0f37..04f82df 100644 --- a/tutorials/rag/evaluate_rag_cicd.py +++ b/tutorials/rag/evaluate_rag_cicd.py @@ -17,7 +17,6 @@ import os from dotenv import load_dotenv import inspect -import uuid import pandas as pd from chromadb import chromadb from openai import OpenAI @@ -46,7 +45,7 @@ ) # Define your Prompt details in code -model = "gpt-4-turbo" +model = "gpt-4o-mini" temperature = 0 template = [ { @@ -122,6 +121,7 @@ def run_evaluation( pipeline: Callable, flow_id: str, evaluation_id: str, + dataset_id: str, attributes: dict, max_workers: int = 5, ) -> bool: @@ -142,11 +142,15 @@ def run_evaluation( # Pull down your dataset evaluation = humanloop.evaluations.get(id=evaluation_id) dataset = humanloop.datasets.get( - id=evaluation.dataset.id, + id=dataset_id, include_datapoints=True, ) # Add a batch ID for this run so that you get a new column in report, even if your pipeline is the same - batch_id = uuid.uuid4().hex + run = humanloop.evaluations.create_run( + id=evaluation_id, + dataset={"file_id": dataset_id}, + ) + run_id = run.id # Define the function to execute your pipeline in parallel and Log to Humanloop def process_datapoint(datapoint): @@ -161,11 +165,10 @@ def process_datapoint(datapoint): inputs=datapoint.inputs, output=output, source_datapoint_id=datapoint.id, - evaluation_id=evaluation.id, + run_id=run.id, trace_status="complete", start_time=start_time, end_time=datetime.now(), - batch_id=batch_id, ) except Exception as error: @@ -176,11 +179,10 @@ def process_datapoint(datapoint): inputs=datapoint.inputs, error=str(error), source_datapoint_id=datapoint.id, - evaluation_id=evaluation.id, + run_id=run_id, trace_status="complete", start_time=start_time, end_time=datetime.now(), - batch_id=batch_id, ) # Execute your pipeline and send the logs to Humanloop in parallel @@ -197,9 +199,11 @@ def process_datapoint(datapoint): while not complete: stats = humanloop.evaluations.get_stats(id=evaluation.id) print(stats.progress) - complete = stats.status == "completed" + run_stats = next((run for run in stats.run_stats if run.run_id == run_id), None) + complete = run_stats and run_stats.status == "completed" if not complete: time.sleep(10) + assert stats is not None # Print Evaluation results print(stats.report) @@ -234,8 +238,14 @@ def process_datapoint(datapoint): type=str, help="Evaluation ID for the run. 
If not specified, a new one will be created.", ) + parser.add_argument( + "--dataset_id", + type=str, + help="Dataset ID for the run.", + ) args = parser.parse_args() evaluation_id = args.evaluation_id + dataset_id = args.dataset_id # These attributes should represent the configuration of your pipeline attributes = { @@ -264,7 +274,6 @@ def process_datapoint(datapoint): # NB: you can use `path`or `id` for references on Humanloop file={"id": flow.id}, # Assume Evaluators and Datasets already exist - dataset={"path": "evals_demo/medqa-test"}, evaluators=[ {"path": "evals_demo/exact_match"}, {"path": "evals_demo/levenshtein"}, @@ -279,6 +288,7 @@ def process_datapoint(datapoint): pipeline=ask_question, flow_id=flow.id, evaluation_id=evaluation_id, + dataset_id=dataset_id, # attributes specify what version of the pipeline is being evaluated attributes=attributes, ) From 1057d81a2bbda875561a84eb03c754cc176b8587 Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 21:52:24 +0000 Subject: [PATCH 2/4] add cicd_setup script for easy setting up prereqs for evaluate_rag_cicd --- tutorials/rag/cicd/cicd_setup.py | 92 ++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tutorials/rag/cicd/cicd_setup.py diff --git a/tutorials/rag/cicd/cicd_setup.py b/tutorials/rag/cicd/cicd_setup.py new file mode 100644 index 0000000..c2da82f --- /dev/null +++ b/tutorials/rag/cicd/cicd_setup.py @@ -0,0 +1,92 @@ +"""Set up evaluate_rag_cicd.py. + +Uploads the Dataset and Evaluators. +Note that while the Evaluators are similar to the example Evaluators created +automatically when you signed up, these Evaluators contain an additional step +processing the model's output. +""" + +from contextlib import contextmanager +import os +import pandas as pd +from pathlib import Path +from humanloop import Humanloop, UnprocessableEntityError + +# Create a Humanloop client +humanloop = Humanloop( + api_key=os.getenv("HUMANLOOP_KEY"), base_url=os.getenv("HUMANLOOP_BASE_URL") +) + +assets_folder = Path(__file__).parents[3].resolve() / "assets" + + +def upload_dataset(): + df = pd.read_json(assets_folder / "datapoints.jsonl", lines=True) + + datapoints = [row.to_dict() for _i, row in df.iterrows()][0:20] + with ignore_already_committed(): + humanloop.datasets.upsert( + path="evals_demo/medqa-small", + datapoints=datapoints, + commit_message=f"Added {len(datapoints)} datapoints from MedQA test dataset.", + ) + + +def upload_evaluators(): + # Upload Code Evaluators + for evaluator_name, return_type in [ + ("exact_match", "boolean"), + ("levenshtein", "number"), + ]: + with open(assets_folder / f"evaluators/{evaluator_name}.py", "r") as f: + code = f.read() + + with ignore_already_committed(): + humanloop.evaluators.upsert( + path=f"evals_demo/{evaluator_name}", + spec={ + "evaluator_type": "python", + "arguments_type": "target_required", + "return_type": return_type, + "code": code, + }, + commit_message=f"New version from {evaluator_name}.py", + ) + + # Upload an LLM Evaluator + with ignore_already_committed(): + humanloop.evaluators.upsert( + path="evals_demo/reasoning", + spec={ + "evaluator_type": "llm", + "arguments_type": "target_free", + "return_type": "boolean", + "prompt": { + "model": "gpt-4o", + "endpoint": "complete", + "temperature": 0, + "template": 'An answer is shown below. The answer contains 3 sections, separated by "---". The first section is the final answer. The second section is an explanation. 
The third section is a citation.\n\nEvaluate if the final answer follows from the citation and the reasoning in the explanation section. Give a brief explanation/discussion. Do not make your judgment based on factuality, but purely based on the logic presented.\nOn a new line, give a final verdict of "True" or "False".\n\nAnswer:\n{{log.output}}', + }, + }, + commit_message="Initial reasoning evaluator.", + ) + + +@contextmanager +def ignore_already_committed(): + """Context manager to ignore the error where a version has already been committed.""" + try: + yield + except UnprocessableEntityError as e: + try: + if "already been committed" in e.body.detail["description"]: + return + except Exception: + pass + raise e + + +if __name__ == "__main__": + upload_dataset() + upload_evaluators() + print("Datasets and Evaluators uploaded.") From a554e9dce519a310c04794076a4720b90d4f1dd9 Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 21:54:06 +0000 Subject: [PATCH 3/4] update utils to handle eval runs --- tutorials/rag/cicd/utils.py | 62 +++++++++++++------------------------ 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/tutorials/rag/cicd/utils.py b/tutorials/rag/cicd/utils.py index 7003fb1..e8c3acf 100644 --- a/tutorials/rag/cicd/utils.py +++ b/tutorials/rag/cicd/utils.py @@ -1,7 +1,6 @@ -""" Util methods for formatting and checking Evaluation results.""" +"""Util methods for formatting and checking Evaluation results.""" -from prettytable import PrettyTable -from humanloop import EvaluationResponse, EvaluationStats, VersionStatsResponse +from humanloop import EvaluationResponse, EvaluationStats, RunStatsResponse from humanloop import BooleanEvaluatorStatsResponse as BooleanStats from humanloop import NumericEvaluatorStatsResponse as NumericStats @@ -40,35 +39,18 @@ def get_score_from_evaluator_stat(stat: NumericStats | BooleanStats) -> float | return score -def get_sorted_version_stats( - stats: EvaluationStats, - evaluation: EvaluationResponse -) -> list[VersionStatsResponse]: - """Sort the VersionStats by created_at.""" - versions_by_id = { - evaluatee.version["version_id"]: evaluatee - for evaluatee in evaluation.evaluatees - } - sorted_stats = sorted( - stats.version_stats, - key=lambda x: versions_by_id[x.version_id].version["created_at"] - ) - if len(sorted_stats) == 0: - raise ValueError("No VersionStats found.") - - return sorted_stats - - def get_evaluator_stats_by_path( - stat: VersionStatsResponse, evaluation: EvaluationResponse + stat: RunStatsResponse, evaluation: EvaluationResponse ) -> dict[str, NumericStats | BooleanStats]: """Get the Evaluator stats by path.""" evaluators_by_id = { evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators } evaluator_stats_by_path = { - evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat - for evaluator_stat in stat.evaluator_version_stats + evaluators_by_id[ + evaluator_stat.evaluator_version_id + ].version.path: evaluator_stat + for evaluator_stat in stat.evaluator_stats } return evaluator_stats_by_path @@ -81,15 +63,16 @@ def check_evaluation_threshold( ) -> bool: """Checks if the latest version has an average Evaluator result above a threshold.""" - version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation) + latest_run_stats = stats.run_stats[0] evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=version_stats[-1], - evaluation=evaluation + stat=latest_run_stats, evaluation=evaluation ) if evaluator_path in evaluator_stats_by_path: 
evaluator_stat = evaluator_stats_by_path[evaluator_path] score = get_score_from_evaluator_stat(stat=evaluator_stat) + if score is None: + raise ValueError(f"Score not found for evaluator {evaluator_path}.") if score >= threshold: print( f"{GREEN}✅ Latest eval [{score}] above threshold [{threshold}] for evaluator {evaluator_path}.{RESET}" @@ -105,28 +88,25 @@ def check_evaluation_threshold( def check_evaluation_improvement( - evaluation: EvaluationResponse, - evaluator_path: str, - stats: EvaluationStats + evaluation: EvaluationResponse, evaluator_path: str, stats: EvaluationStats ) -> bool: """Check the latest version has improved across for a specific Evaluator.""" - version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation) + latest_run_stats = stats.run_stats[0] latest_evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=version_stats[-1], - evaluation=evaluation + stat=latest_run_stats, evaluation=evaluation ) - if len(version_stats) == 1: - print( - f"{YELLOW}⚠️ No previous versions to compare with.{RESET}" - ) + if len(stats.run_stats) == 1: + print(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}") return True previous_evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=version_stats[-2], - evaluation=evaluation + stat=stats.run_stats[-2], evaluation=evaluation ) - if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path: + if ( + evaluator_path in latest_evaluator_stats_by_path + and evaluator_path in previous_evaluator_stats_by_path + ): latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path] previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path] latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat) From 516859e4feb961f82a388ef1492e1ae4544e7f3d Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 22:02:41 +0000 Subject: [PATCH 4/4] update notebooks --- tutorials/rag/evaluate-rag-flow.ipynb | 24 ++++++++--- tutorials/rag/evaluate-rag.ipynb | 58 ++++++++++++++++----------- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/tutorials/rag/evaluate-rag-flow.ipynb b/tutorials/rag/evaluate-rag-flow.ipynb index d2904c2..c15bd5f 100644 --- a/tutorials/rag/evaluate-rag-flow.ipynb +++ b/tutorials/rag/evaluate-rag-flow.ipynb @@ -90,7 +90,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[34mInstalling dependencies from lock file\u001B[39m\n", + "\u001b[34mInstalling dependencies from lock file\u001b[39m\n", "\n", "No dependencies to install or update\n" ] @@ -552,6 +552,14 @@ "Now we can start to trigger Evaluations on Humanloop using our Dataset and Evaluators:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "09ccddff", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 15, @@ -574,14 +582,20 @@ "evaluation = humanloop.evaluations.create(\n", " name=\"Demo evals 2\",\n", " file={\"path\":\"evals_demo/medqa-flow\"},\n", - " dataset={\"path\": \"evals_demo/medqa-test\"},\n", " evaluators=[\n", " {\"path\": \"evals_demo/exact_match\"},\n", " {\"path\": \"evals_demo/levenshtein\"},\n", " {\"path\": \"evals_demo/reasoning\"},\n", " ],\n", ")\n", - "print(f\"Evaluation created: {evaluation.id}\")\n" + "print(f\"Evaluation created: {evaluation.id}\")\n", + "\n", + "run = humanloop.evaluations.create_run(\n", + " id=evaluation.id,\n", + " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + ")\n", + "print(f\"Run created: {run.id}\")\n", + "\n" ] 
}, { @@ -602,7 +616,7 @@ "def populate_evaluation():\n", " \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n", " retrieved_dataset = humanloop.datasets.get(\n", - " id=evaluation.dataset.id,\n", + " id=run.dataset.id,\n", " include_datapoints=True,\n", " )\n", " for datapoint in tqdm(retrieved_dataset.datapoints):\n", @@ -633,7 +647,7 @@ " trace_status=\"complete\",\n", " # NB: New arguments to link to Evaluation and Dataset\n", " source_datapoint_id=datapoint.id,\n", - " evaluation_id=evaluation.id,\n", + " run_id=run.id,\n", ")\n" ] }, diff --git a/tutorials/rag/evaluate-rag.ipynb b/tutorials/rag/evaluate-rag.ipynb index 670eeae..21f6f03 100644 --- a/tutorials/rag/evaluate-rag.ipynb +++ b/tutorials/rag/evaluate-rag.ipynb @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "47ac94aa", "metadata": {}, "outputs": [], @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "15c5158d1d159535", "metadata": {}, "outputs": [], @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "187af8c5", "metadata": {}, "outputs": [], @@ -228,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "53c95ad9790ade59", "metadata": {}, "outputs": [], @@ -297,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "6c2102bcad49c932", "metadata": {}, "outputs": [], @@ -306,7 +306,7 @@ "from humanloop import Humanloop\n", "\n", "load_dotenv()\n", - "humanloop = Humanloop(api_key=os.getenv(\"HUMANLOOP_KEY\"))" + "humanloop = Humanloop(api_key=os.getenv(\"HUMANLOOP_KEY\"), base_url=os.getenv(\"HUMANLOOP_BASE_URL\"))" ] }, { @@ -402,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "87c70e0c", "metadata": {}, "outputs": [], @@ -420,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "deae78c4", "metadata": {}, "outputs": [], @@ -444,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "0dcb9069", "metadata": {}, "outputs": [], @@ -508,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "12405c95", "metadata": {}, "outputs": [], @@ -519,14 +519,19 @@ "# Create the Evaluation specifying the Dataset and Evaluators to use\n", "evaluation = humanloop.evaluations.create(\n", " name=\"Demo evals\",\n", - " path=\"evals_demo/medqa-answer\",\n", - " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + " file={\"path\": \"evals_demo/medqa-answer\"},\n", " evaluators=[\n", " {\"path\": \"evals_demo/exact_match\"},\n", " {\"path\": \"evals_demo/levenshtein\"},\n", " {\"path\": \"evals_demo/reasoning\"},\n", " ],\n", - ")" + ")\n", + "\n", + "run = humanloop.evaluations.create_run(\n", + " id=evaluation.id,\n", + " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + ")\n", + "print(f\"Run created: {run.id}\")" ] }, { @@ -539,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "9b36091812b178d7", "metadata": {}, "outputs": [], @@ -547,7 +552,7 @@ "def populate_evaluation():\n", " \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n", " retrieved_dataset = humanloop.datasets.get(\n", - " id=evaluation.dataset.id,\n", + " id=run.dataset.id,\n", " include_datapoints=True,\n", " )\n", " for datapoint in 
tqdm(retrieved_dataset.datapoints):\n", @@ -566,7 +571,7 @@ " output_message=chat_completion,\n", " # NB: New arguments to link to Evaluation and Dataset\n", " source_datapoint_id=datapoint.id,\n", - " evaluation_id=evaluation.id,\n", + " run_id==run.id,\n", " )\n" ] }, @@ -624,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "faa5e7cb", "metadata": {}, "outputs": [], @@ -674,7 +679,6 @@ "\n", "# Create a new Evaluation with the external Evaluator\n", "evaluation = humanloop.evaluations.create(\n", - " dataset={\"path\": \"evals_demo/medqa-test\"},\n", " evaluators=[\n", " {\"path\": \"evals_demo/exact_match\"},\n", " {\"path\": \"evals_demo/reasoning\"},\n", @@ -683,7 +687,13 @@ " {\"path\": \"evals_demo/external-levenshtein\", \"orchestrated\": False},\n", " ],\n", ")\n", - "print(f\"Evaluation created: {evaluation.id}\")" + "print(f\"Evaluation created: {evaluation.id}\")\n", + "\n", + "run = humanloop.evaluations.create_run(\n", + " id=evaluation.id,\n", + " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + ")\n", + "print(f\"Run created: {run.id}\")" ] }, { @@ -698,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "0463bc9d", "metadata": {}, "outputs": [], @@ -710,7 +720,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "41ef4712", "metadata": {}, "outputs": [], @@ -718,7 +728,7 @@ "def populate_evaluation():\n", " \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n", " retrieved_dataset = humanloop.datasets.get(\n", - " id=evaluation.dataset.id,\n", + " id=run.dataset.id,\n", " include_datapoints=True,\n", " )\n", " for datapoint in tqdm(retrieved_dataset.datapoints):\n", @@ -737,7 +747,7 @@ " output_message=chat_completion,\n", " # NB: New arguments to link to Evaluation and Dataset\n", " source_datapoint_id=datapoint.id,\n", - " evaluation_id=evaluation.id,\n", + " run_id=run.id,\n", " )\n", "\n", " # 2 additional steps for external Evaluator:\n", @@ -788,7 +798,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" },
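
Taken together, the four patches move the tutorial from batch-scoped logging (a uuid `batch_id` plus `evaluation_id`) to the Runs API: the Dataset is attached to a Run via `evaluations.create_run`, Logs point at the Run through `run_id`, and completion is read from the per-run `run_stats`. The sketch below assembles that flow from the calls shown in the diffs above. It is illustrative only; the file paths, the datapoint ID, and the choice of `flows.log` (rather than `prompts.log`, which the plain RAG notebook uses) are placeholders rather than values taken from the patches.

```python
import os
import time

from humanloop import Humanloop

humanloop = Humanloop(api_key=os.getenv("HUMANLOOP_KEY"))

# The Evaluation is created against the File and Evaluators only; the Dataset
# is no longer passed here (patches 1 and 4 drop the dataset argument).
evaluation = humanloop.evaluations.create(
    name="Demo evals",
    file={"path": "evals_demo/medqa-flow"},  # placeholder path
    evaluators=[
        {"path": "evals_demo/exact_match"},
        {"path": "evals_demo/levenshtein"},
    ],
)

# A Run replaces the old uuid-based batch_id and carries the Dataset.
run = humanloop.evaluations.create_run(
    id=evaluation.id,
    dataset={"path": "evals_demo/medqa-test"},  # placeholder path
)

# Logs reference the Run via run_id instead of evaluation_id/batch_id.
humanloop.flows.log(  # assumed method; the notebooks log Prompts the same way
    path="evals_demo/medqa-flow",  # placeholder path
    inputs={"question": "..."},
    output="...",
    source_datapoint_id="dp_placeholder",  # placeholder datapoint ID
    run_id=run.id,
    trace_status="complete",
)

# Completion is now checked per Run via run_stats rather than a global status.
complete = False
while not complete:
    stats = humanloop.evaluations.get_stats(id=evaluation.id)
    run_stats = next((r for r in stats.run_stats if r.run_id == run.id), None)
    complete = run_stats is not None and run_stats.status == "completed"
    if not complete:
        time.sleep(10)
print(stats.report)
```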