From 3792d76047ac2c513db162e5dc1b633cd9a40edf Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 21:51:53 +0000 Subject: [PATCH 1/4] update evaluate_rag_cicd for runs --- tutorials/rag/evaluate_rag_cicd.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tutorials/rag/evaluate_rag_cicd.py b/tutorials/rag/evaluate_rag_cicd.py index 1ea0f37..04f82df 100644 --- a/tutorials/rag/evaluate_rag_cicd.py +++ b/tutorials/rag/evaluate_rag_cicd.py @@ -17,7 +17,6 @@ import os from dotenv import load_dotenv import inspect -import uuid import pandas as pd from chromadb import chromadb from openai import OpenAI @@ -46,7 +45,7 @@ ) # Define your Prompt details in code -model = "gpt-4-turbo" +model = "gpt-4o-mini" temperature = 0 template = [ { @@ -122,6 +121,7 @@ def run_evaluation( pipeline: Callable, flow_id: str, evaluation_id: str, + dataset_id: str, attributes: dict, max_workers: int = 5, ) -> bool: @@ -142,11 +142,15 @@ def run_evaluation( # Pull down your dataset evaluation = humanloop.evaluations.get(id=evaluation_id) dataset = humanloop.datasets.get( - id=evaluation.dataset.id, + id=dataset_id, include_datapoints=True, ) # Add a batch ID for this run so that you get a new column in report, even if your pipeline is the same - batch_id = uuid.uuid4().hex + run = humanloop.evaluations.create_run( + id=evaluation_id, + dataset={"file_id": dataset_id}, + ) + run_id = run.id # Define the function to execute your pipeline in parallel and Log to Humanloop def process_datapoint(datapoint): @@ -161,11 +165,10 @@ def process_datapoint(datapoint): inputs=datapoint.inputs, output=output, source_datapoint_id=datapoint.id, - evaluation_id=evaluation.id, + run_id=run.id, trace_status="complete", start_time=start_time, end_time=datetime.now(), - batch_id=batch_id, ) except Exception as error: @@ -176,11 +179,10 @@ def process_datapoint(datapoint): inputs=datapoint.inputs, error=str(error), source_datapoint_id=datapoint.id, - evaluation_id=evaluation.id, + run_id=run_id, trace_status="complete", start_time=start_time, end_time=datetime.now(), - batch_id=batch_id, ) # Execute your pipeline and send the logs to Humanloop in parallel @@ -197,9 +199,11 @@ def process_datapoint(datapoint): while not complete: stats = humanloop.evaluations.get_stats(id=evaluation.id) print(stats.progress) - complete = stats.status == "completed" + run_stats = next((run for run in stats.run_stats if run.run_id == run_id), None) + complete = run_stats and run_stats.status == "completed" if not complete: time.sleep(10) + assert stats is not None # Print Evaluation results print(stats.report) @@ -234,8 +238,14 @@ def process_datapoint(datapoint): type=str, help="Evaluation ID for the run. 
If not specified, a new one will be created.", ) + parser.add_argument( + "--dataset_id", + type=str, + help="Dataset ID for the run.", + ) args = parser.parse_args() evaluation_id = args.evaluation_id + dataset_id = args.dataset_id # These attributes should represent the configuration of your pipeline attributes = { @@ -264,7 +274,6 @@ def process_datapoint(datapoint): # NB: you can use `path`or `id` for references on Humanloop file={"id": flow.id}, # Assume Evaluators and Datasets already exist - dataset={"path": "evals_demo/medqa-test"}, evaluators=[ {"path": "evals_demo/exact_match"}, {"path": "evals_demo/levenshtein"}, @@ -279,6 +288,7 @@ def process_datapoint(datapoint): pipeline=ask_question, flow_id=flow.id, evaluation_id=evaluation_id, + dataset_id=dataset_id, # attributes specify what version of the pipeline is being evaluated attributes=attributes, ) From 1057d81a2bbda875561a84eb03c754cc176b8587 Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 21:52:24 +0000 Subject: [PATCH 2/4] add cicd_setup script for easy setting up prereqs for evaluate_rag_cicd --- tutorials/rag/cicd/cicd_setup.py | 92 ++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tutorials/rag/cicd/cicd_setup.py diff --git a/tutorials/rag/cicd/cicd_setup.py b/tutorials/rag/cicd/cicd_setup.py new file mode 100644 index 0000000..c2da82f --- /dev/null +++ b/tutorials/rag/cicd/cicd_setup.py @@ -0,0 +1,92 @@ +"""Set up evaluate_rag_cicd.py. + +Uploads the Dataset and Evaluators. +Note that while the Evaluators are similar to the example Evaluators created +automatically when you signed up, these Evaluators contain an additional step +processing the model's output. +""" + +from contextlib import contextmanager +import os +import pandas as pd +from pathlib import Path +from humanloop import Humanloop, UnprocessableEntityError + +# Create a Humanloop client +humanloop = Humanloop( + api_key=os.getenv("HUMANLOOP_KEY"), base_url=os.getenv("HUMANLOOP_BASE_URL") +) + +assets_folder = Path(__file__).parents[3].resolve() / "assets" + + +def upload_dataset(): + df = pd.read_json(assets_folder / "datapoints.jsonl", lines=True) + + datapoints = [row.to_dict() for _i, row in df.iterrows()][0:20] + with ignore_already_committed(): + humanloop.datasets.upsert( + path="evals_demo/medqa-small", + datapoints=datapoints, + commit_message=f"Added {len(datapoints)} datapoints from MedQA test dataset.", + ) + + +def upload_evaluators(): + # Upload Code Evaluators + for evaluator_name, return_type in [ + ("exact_match", "boolean"), + ("levenshtein", "number"), + ]: + with open(assets_folder / f"evaluators/{evaluator_name}.py", "r") as f: + code = f.read() + + with ignore_already_committed(): + humanloop.evaluators.upsert( + path=f"evals_demo/{evaluator_name}", + spec={ + "evaluator_type": "python", + "arguments_type": "target_required", + "return_type": return_type, + "code": code, + }, + commit_message=f"New version from {evaluator_name}.py", + ) + + # Upload an LLM Evaluator + with ignore_already_committed(): + humanloop.evaluators.upsert( + path="evals_demo/reasoning", + spec={ + "evaluator_type": "llm", + "arguments_type": "target_free", + "return_type": "boolean", + "prompt": { + "model": "gpt-4o", + "endpoint": "complete", + "temperature": 0, + "template": 'An answer is shown below. The answer contains 3 sections, separated by "---". The first section is the final answer. The second section is an explanation. 
The third section is a citation.\n\nEvaluate if the final answer follows from the citation and the reasoning in the explanation section. Give a brief explanation/discussion. Do not make your judgment based on factuality, but purely based on the logic presented.\nOn a new line, give a final verdict of "True" or "False".\n\nAnswer:\n{{log.output}}', + }, + }, + commit_message="Initial reasoning evaluator.", + ) + + +@contextmanager +def ignore_already_committed(): + """Context manager to ignore the error where a version has already been committed.""" + try: + yield + except UnprocessableEntityError as e: + try: + if "already been committed" in e.body.detail["description"]: + return + except Exception: + pass + raise e + + +if __name__ == "__main__": + upload_dataset() + upload_evaluators() + print("Datasets and Evaluators uploaded.") From a554e9dce519a310c04794076a4720b90d4f1dd9 Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 21:54:06 +0000 Subject: [PATCH 3/4] update utils to handle eval runs --- tutorials/rag/cicd/utils.py | 62 +++++++++++++------------------------ 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/tutorials/rag/cicd/utils.py b/tutorials/rag/cicd/utils.py index 7003fb1..e8c3acf 100644 --- a/tutorials/rag/cicd/utils.py +++ b/tutorials/rag/cicd/utils.py @@ -1,7 +1,6 @@ -""" Util methods for formatting and checking Evaluation results.""" +"""Util methods for formatting and checking Evaluation results.""" -from prettytable import PrettyTable -from humanloop import EvaluationResponse, EvaluationStats, VersionStatsResponse +from humanloop import EvaluationResponse, EvaluationStats, RunStatsResponse from humanloop import BooleanEvaluatorStatsResponse as BooleanStats from humanloop import NumericEvaluatorStatsResponse as NumericStats @@ -40,35 +39,18 @@ def get_score_from_evaluator_stat(stat: NumericStats | BooleanStats) -> float | return score -def get_sorted_version_stats( - stats: EvaluationStats, - evaluation: EvaluationResponse -) -> list[VersionStatsResponse]: - """Sort the VersionStats by created_at.""" - versions_by_id = { - evaluatee.version["version_id"]: evaluatee - for evaluatee in evaluation.evaluatees - } - sorted_stats = sorted( - stats.version_stats, - key=lambda x: versions_by_id[x.version_id].version["created_at"] - ) - if len(sorted_stats) == 0: - raise ValueError("No VersionStats found.") - - return sorted_stats - - def get_evaluator_stats_by_path( - stat: VersionStatsResponse, evaluation: EvaluationResponse + stat: RunStatsResponse, evaluation: EvaluationResponse ) -> dict[str, NumericStats | BooleanStats]: """Get the Evaluator stats by path.""" evaluators_by_id = { evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators } evaluator_stats_by_path = { - evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat - for evaluator_stat in stat.evaluator_version_stats + evaluators_by_id[ + evaluator_stat.evaluator_version_id + ].version.path: evaluator_stat + for evaluator_stat in stat.evaluator_stats } return evaluator_stats_by_path @@ -81,15 +63,16 @@ def check_evaluation_threshold( ) -> bool: """Checks if the latest version has an average Evaluator result above a threshold.""" - version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation) + latest_run_stats = stats.run_stats[0] evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=version_stats[-1], - evaluation=evaluation + stat=latest_run_stats, evaluation=evaluation ) if evaluator_path in evaluator_stats_by_path: 
evaluator_stat = evaluator_stats_by_path[evaluator_path] score = get_score_from_evaluator_stat(stat=evaluator_stat) + if score is None: + raise ValueError(f"Score not found for evaluator {evaluator_path}.") if score >= threshold: print( f"{GREEN}✅ Latest eval [{score}] above threshold [{threshold}] for evaluator {evaluator_path}.{RESET}" @@ -105,28 +88,25 @@ def check_evaluation_threshold( def check_evaluation_improvement( - evaluation: EvaluationResponse, - evaluator_path: str, - stats: EvaluationStats + evaluation: EvaluationResponse, evaluator_path: str, stats: EvaluationStats ) -> bool: """Check the latest version has improved across for a specific Evaluator.""" - version_stats = get_sorted_version_stats(stats=stats, evaluation=evaluation) + latest_run_stats = stats.run_stats[0] latest_evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=version_stats[-1], - evaluation=evaluation + stat=latest_run_stats, evaluation=evaluation ) - if len(version_stats) == 1: - print( - f"{YELLOW}⚠️ No previous versions to compare with.{RESET}" - ) + if len(stats.run_stats) == 1: + print(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}") return True previous_evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=version_stats[-2], - evaluation=evaluation + stat=stats.run_stats[-2], evaluation=evaluation ) - if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path: + if ( + evaluator_path in latest_evaluator_stats_by_path + and evaluator_path in previous_evaluator_stats_by_path + ): latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path] previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path] latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat) From 516859e4feb961f82a388ef1492e1ae4544e7f3d Mon Sep 17 00:00:00 2001 From: Harry Xie Date: Wed, 30 Oct 2024 22:02:41 +0000 Subject: [PATCH 4/4] update notebooks --- tutorials/rag/evaluate-rag-flow.ipynb | 24 ++++++++--- tutorials/rag/evaluate-rag.ipynb | 58 ++++++++++++++++----------- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/tutorials/rag/evaluate-rag-flow.ipynb b/tutorials/rag/evaluate-rag-flow.ipynb index d2904c2..c15bd5f 100644 --- a/tutorials/rag/evaluate-rag-flow.ipynb +++ b/tutorials/rag/evaluate-rag-flow.ipynb @@ -90,7 +90,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[34mInstalling dependencies from lock file\u001B[39m\n", + "\u001b[34mInstalling dependencies from lock file\u001b[39m\n", "\n", "No dependencies to install or update\n" ] @@ -552,6 +552,14 @@ "Now we can start to trigger Evaluations on Humanloop using our Dataset and Evaluators:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "09ccddff", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 15, @@ -574,14 +582,20 @@ "evaluation = humanloop.evaluations.create(\n", " name=\"Demo evals 2\",\n", " file={\"path\":\"evals_demo/medqa-flow\"},\n", - " dataset={\"path\": \"evals_demo/medqa-test\"},\n", " evaluators=[\n", " {\"path\": \"evals_demo/exact_match\"},\n", " {\"path\": \"evals_demo/levenshtein\"},\n", " {\"path\": \"evals_demo/reasoning\"},\n", " ],\n", ")\n", - "print(f\"Evaluation created: {evaluation.id}\")\n" + "print(f\"Evaluation created: {evaluation.id}\")\n", + "\n", + "run = humanloop.evaluations.create_run(\n", + " id=evaluation.id,\n", + " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + ")\n", + "print(f\"Run created: {run.id}\")\n", + "\n" ] 
}, { @@ -602,7 +616,7 @@ "def populate_evaluation():\n", " \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n", " retrieved_dataset = humanloop.datasets.get(\n", - " id=evaluation.dataset.id,\n", + " id=run.dataset.id,\n", " include_datapoints=True,\n", " )\n", " for datapoint in tqdm(retrieved_dataset.datapoints):\n", @@ -633,7 +647,7 @@ " trace_status=\"complete\",\n", " # NB: New arguments to link to Evaluation and Dataset\n", " source_datapoint_id=datapoint.id,\n", - " evaluation_id=evaluation.id,\n", + " run_id=run.id,\n", ")\n" ] }, diff --git a/tutorials/rag/evaluate-rag.ipynb b/tutorials/rag/evaluate-rag.ipynb index 670eeae..21f6f03 100644 --- a/tutorials/rag/evaluate-rag.ipynb +++ b/tutorials/rag/evaluate-rag.ipynb @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "47ac94aa", "metadata": {}, "outputs": [], @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "15c5158d1d159535", "metadata": {}, "outputs": [], @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "187af8c5", "metadata": {}, "outputs": [], @@ -228,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "53c95ad9790ade59", "metadata": {}, "outputs": [], @@ -297,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "6c2102bcad49c932", "metadata": {}, "outputs": [], @@ -306,7 +306,7 @@ "from humanloop import Humanloop\n", "\n", "load_dotenv()\n", - "humanloop = Humanloop(api_key=os.getenv(\"HUMANLOOP_KEY\"))" + "humanloop = Humanloop(api_key=os.getenv(\"HUMANLOOP_KEY\"), base_url=os.getenv(\"HUMANLOOP_BASE_URL\"))" ] }, { @@ -402,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "87c70e0c", "metadata": {}, "outputs": [], @@ -420,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "deae78c4", "metadata": {}, "outputs": [], @@ -444,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "0dcb9069", "metadata": {}, "outputs": [], @@ -508,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "12405c95", "metadata": {}, "outputs": [], @@ -519,14 +519,19 @@ "# Create the Evaluation specifying the Dataset and Evaluators to use\n", "evaluation = humanloop.evaluations.create(\n", " name=\"Demo evals\",\n", - " path=\"evals_demo/medqa-answer\",\n", - " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + " file={\"path\": \"evals_demo/medqa-answer\"},\n", " evaluators=[\n", " {\"path\": \"evals_demo/exact_match\"},\n", " {\"path\": \"evals_demo/levenshtein\"},\n", " {\"path\": \"evals_demo/reasoning\"},\n", " ],\n", - ")" + ")\n", + "\n", + "run = humanloop.evaluations.create_run(\n", + " id=evaluation.id,\n", + " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + ")\n", + "print(f\"Run created: {run.id}\")" ] }, { @@ -539,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "9b36091812b178d7", "metadata": {}, "outputs": [], @@ -547,7 +552,7 @@ "def populate_evaluation():\n", " \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n", " retrieved_dataset = humanloop.datasets.get(\n", - " id=evaluation.dataset.id,\n", + " id=run.dataset.id,\n", " include_datapoints=True,\n", " )\n", " for datapoint in 
tqdm(retrieved_dataset.datapoints):\n", @@ -566,7 +571,7 @@ " output_message=chat_completion,\n", " # NB: New arguments to link to Evaluation and Dataset\n", " source_datapoint_id=datapoint.id,\n", - " evaluation_id=evaluation.id,\n", + " run_id==run.id,\n", " )\n" ] }, @@ -624,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "faa5e7cb", "metadata": {}, "outputs": [], @@ -674,7 +679,6 @@ "\n", "# Create a new Evaluation with the external Evaluator\n", "evaluation = humanloop.evaluations.create(\n", - " dataset={\"path\": \"evals_demo/medqa-test\"},\n", " evaluators=[\n", " {\"path\": \"evals_demo/exact_match\"},\n", " {\"path\": \"evals_demo/reasoning\"},\n", @@ -683,7 +687,13 @@ " {\"path\": \"evals_demo/external-levenshtein\", \"orchestrated\": False},\n", " ],\n", ")\n", - "print(f\"Evaluation created: {evaluation.id}\")" + "print(f\"Evaluation created: {evaluation.id}\")\n", + "\n", + "run = humanloop.evaluations.create_run(\n", + " id=evaluation.id,\n", + " dataset={\"path\": \"evals_demo/medqa-test\"},\n", + ")\n", + "print(f\"Run created: {run.id}\")" ] }, { @@ -698,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "0463bc9d", "metadata": {}, "outputs": [], @@ -710,7 +720,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "41ef4712", "metadata": {}, "outputs": [], @@ -718,7 +728,7 @@ "def populate_evaluation():\n", " \"\"\"Run a variation of your Pipeline over the Dataset to populate results\"\"\"\n", " retrieved_dataset = humanloop.datasets.get(\n", - " id=evaluation.dataset.id,\n", + " id=run.dataset.id,\n", " include_datapoints=True,\n", " )\n", " for datapoint in tqdm(retrieved_dataset.datapoints):\n", @@ -737,7 +747,7 @@ " output_message=chat_completion,\n", " # NB: New arguments to link to Evaluation and Dataset\n", " source_datapoint_id=datapoint.id,\n", - " evaluation_id=evaluation.id,\n", + " run_id=run.id,\n", " )\n", "\n", " # 2 additional steps for external Evaluator:\n", @@ -788,7 +798,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "venv", "language": "python", "name": "python3" },
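
Taken together, the four patches move the tutorial from batch-scoped logging (a uuid `batch_id` plus `evaluation_id`) to the Runs API: the Dataset is attached to a Run via `evaluations.create_run`, Logs point at the Run through `run_id`, and completion is read from the per-run `run_stats`. The sketch below assembles that flow from the calls shown in the diffs above. It is illustrative only; the file paths, the datapoint ID, and the choice of `flows.log` (rather than `prompts.log`, which the plain RAG notebook uses) are placeholders rather than values taken from the patches.

```python
import os
import time

from humanloop import Humanloop

humanloop = Humanloop(api_key=os.getenv("HUMANLOOP_KEY"))

# The Evaluation is created against the File and Evaluators only; the Dataset
# is no longer passed here (patches 1 and 4 drop the dataset argument).
evaluation = humanloop.evaluations.create(
    name="Demo evals",
    file={"path": "evals_demo/medqa-flow"},  # placeholder path
    evaluators=[
        {"path": "evals_demo/exact_match"},
        {"path": "evals_demo/levenshtein"},
    ],
)

# A Run replaces the old uuid-based batch_id and carries the Dataset.
run = humanloop.evaluations.create_run(
    id=evaluation.id,
    dataset={"path": "evals_demo/medqa-test"},  # placeholder path
)

# Logs reference the Run via run_id instead of evaluation_id/batch_id.
humanloop.flows.log(  # assumed method; the notebooks log Prompts the same way
    path="evals_demo/medqa-flow",  # placeholder path
    inputs={"question": "..."},
    output="...",
    source_datapoint_id="dp_placeholder",  # placeholder datapoint ID
    run_id=run.id,
    trace_status="complete",
)

# Completion is now checked per Run via run_stats rather than a global status.
complete = False
while not complete:
    stats = humanloop.evaluations.get_stats(id=evaluation.id)
    run_stats = next((r for r in stats.run_stats if r.run_id == run.id), None)
    complete = run_stats is not None and run_stats.status == "completed"
    if not complete:
        time.sleep(10)
print(stats.report)
```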