From 90bb419b3846c50d2b7d4745c3357a6c7d1e7969 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 25 Jul 2025 14:58:34 -0700 Subject: [PATCH] Add agent evaluation notebook samples --- .../Evaluate_SK_AzureAIAgent.ipynb | 360 +++++++++++++++++ ...valuate_SK_AzureOpenAIAssistantAgent.ipynb | 366 ++++++++++++++++++ 2 files changed, 726 insertions(+) create mode 100644 python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb create mode 100644 python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb diff --git a/python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb b/python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb new file mode 100644 index 000000000000..12cf364838af --- /dev/null +++ b/python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf5280e2", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# Evaluate Semantic Kernel Azure AI Agents in Azure AI Foundry" + ] + }, + { + "cell_type": "markdown", + "id": "0330c099", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This sample demonstrates how to evaluate an AI agent (Azure AI Agent Service) on these important aspects of your agentic workflow:\n", + "\n", + "- Intent Resolution: Measures how well the agent identifies the user’s request, including how well it scopes the user’s intent, asks clarifying questions, and reminds end users of its scope of capabilities.\n", + "- Tool Call Accuracy: Evaluates the agent's ability to select the appropriate tools, and process correct parameters from previous steps.\n", + "- Task Adherence: Measures how well the agent’s response adheres to its assigned tasks, according to its system message and prior steps." + ] + }, + { + "cell_type": "markdown", + "id": "b364c694", + "metadata": {}, + "source": [ + "## Time\n", + "You can expect to complete this sample in approximately 20 minutes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "919c6017", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "### Packages\n", + "- `semantic-kernel` installed (`pip install semantic-kernel`)\n", + "- `azure-ai-evaluation` SDK installed\n", + "- An Azure OpenAI resource with a deployment configured\n", + "\n", + "### Environment Variables\n", + "- For AzureChatService:\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_CHAT_DEPLOYMENT_NAME`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_VERSION=\"2025-04-01-preview\"`\n", + "- For evaluating agents:\n", + " - `PROJECT_CONNECTION_STRING`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_API_VERSION`\n", + " - `MODEL_DEPLOYMENT_NAME`\n", + "- For Azure AI Foundry (Bonus):\n", + " - `AZURE_SUBSCRIPTION_ID`\n", + " - `PROJECT_NAME`\n", + " - `RESOURCE_GROUP_NAME`" + ] + }, + { + "cell_type": "markdown", + "id": "ba1d6576", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Create an AzureAIAgent with a plugin - [reference](https://learn.microsoft.com/en-us/semantic-kernel/frameworks/agent/agent-types/azure-ai-agent?pivots=programming-language-python)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7dc6ce40", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Annotated\n", + "\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "from semantic_kernel.agents import AzureAIAgent, AzureAIAgentSettings\n", + "from semantic_kernel.functions import kernel_function\n", + "\n", + "\n", + "# Define a sample plugin for the sample\n", + "class MenuPlugin:\n", + " \"\"\"A sample Menu Plugin used for the concept sample.\"\"\"\n", + "\n", + " @kernel_function(description=\"Provides a list of specials from the menu.\")\n", + " def get_specials(self) -> Annotated[str, \"Returns the specials from the menu.\"]:\n", + " return \"\"\"\n", + " Special Soup: Clam
Chowder\n", + " Special Salad: Cobb Salad\n", + " Special Drink: Chai Tea\n", + " \"\"\"\n", + "\n", + " @kernel_function(description=\"Provides the price of the requested menu item.\")\n", + " def get_item_price(\n", + " self, menu_item: Annotated[str, \"The name of the menu item.\"]\n", + " ) -> Annotated[str, \"Returns the price of the menu item.\"]:\n", + " return \"$9.99\"\n", + "\n", + "\n", + "# Create an agent\n", + "creds = DefaultAzureCredential()\n", + "project_client = AzureAIAgent.create_client(credential=creds)\n", + "\n", + "agent_definition = await project_client.agents.create_agent(\n", + " model=AzureAIAgentSettings().model_deployment_name,\n", + " name=\"Host\",\n", + " instructions=\"Answer questions about the menu.\",\n", + ")\n", + "agent = AzureAIAgent(\n", + " client=project_client,\n", + " definition=agent_definition,\n", + " plugins=[MenuPlugin()],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ca0a35a0", + "metadata": {}, + "source": [ + "### Invoke the agent" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3b7b9ba3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## User: Hello\n", + "## Host: Hello! How can I assist you today?\n", + "## User: What is the special soup?\n", + "## Host: The special soup is Clam Chowder. If you need more information or want to know about other specials, just let me know!\n", + "## User: What is the special drink?\n", + "## Host: The special drink is Chai Tea. If you have any more questions or need further assistance, feel free to ask!\n", + "## User: How much is it?\n", + "## Host: The special drink, Chai Tea, is priced at $9.99. If you have any more questions or need further assistance, just let me know!\n", + "## User: Thank you\n", + "## Host: You're welcome! If you need anything else, feel free to ask. 
Have a great day!\n" + ] + } + ], + "source": [ + "USER_INPUTS = [\n", + " \"Hello\",\n", + " \"What is the special soup?\",\n", + " \"What is the special drink?\",\n", + " \"How much is it?\",\n", + " \"Thank you\",\n", + "]\n", + "\n", + "thread = None\n", + "for user_input in USER_INPUTS:\n", + " print(f\"## User: {user_input}\")\n", + " response = await agent.get_response(messages=user_input, thread=thread)\n", + " print(f\"## {response.name}: {response.content}\")\n", + " thread = response.thread" + ] + }, + { + "cell_type": "markdown", + "id": "2586d3e5", + "metadata": {}, + "source": [ + "### Converter: Get data from agent" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7813b5eb", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "'AsyncItemPaged' object is not iterable", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m file_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevaluation_data.jsonl\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# Save the agent thread data to a JSONL file (all turns)\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m evaluation_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[43mconverter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_evaluation_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mthread\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# print(json.dumps(evaluation_data, 
indent=4))\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28mlen\u001b[39m(evaluation_data) \u001b[38;5;66;03m# number of turns in the thread\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:574\u001b[0m, in \u001b[0;36mAIAgentConverter.prepare_evaluation_data\u001b[1;34m(self, thread_ids, filename)\u001b[0m\n\u001b[0;32m 569\u001b[0m futures \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 570\u001b[0m executor\u001b[38;5;241m.\u001b[39msubmit(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_single_thread_evaluation_data, \u001b[38;5;28mstr\u001b[39m(thread_id), \u001b[38;5;28;01mNone\u001b[39;00m): thread_id\n\u001b[0;32m 571\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m thread_id \u001b[38;5;129;01min\u001b[39;00m thread_ids\n\u001b[0;32m 572\u001b[0m }\n\u001b[0;32m 573\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m future \u001b[38;5;129;01min\u001b[39;00m as_completed(futures):\n\u001b[1;32m--> 574\u001b[0m evaluations\u001b[38;5;241m.\u001b[39mextend(\u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 576\u001b[0m \u001b[38;5;66;03m# So, if we have the filename, we can write it to the file, which is expected to be a JSONL file.\u001b[39;00m\n\u001b[0;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m filename:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[0;32m 448\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m--> 449\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[0;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[0;32m 400\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 401\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[0;32m 402\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 403\u001b[0m \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[0;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 58\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:475\u001b[0m, in \u001b[0;36mAIAgentConverter._prepare_single_thread_evaluation_data\u001b[1;34m(self, thread_id, filename)\u001b[0m\n\u001b[0;32m 472\u001b[0m list_of_run_evaluations: List[\u001b[38;5;28mdict\u001b[39m] \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 474\u001b[0m \u001b[38;5;66;03m# These are all the run IDs.\u001b[39;00m\n\u001b[1;32m--> 475\u001b[0m run_ids \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data_retriever\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_list_run_ids_chronological\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthread_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 477\u001b[0m \u001b[38;5;66;03m# If there were no messages in the thread, we can return an empty list.\u001b[39;00m\n\u001b[0;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(run_ids) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:846\u001b[0m, in \u001b[0;36mFDPAgentDataRetriever._list_run_ids_chronological\u001b[1;34m(self, thread_id)\u001b[0m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21m_list_run_ids_chronological\u001b[39m(\u001b[38;5;28mself\u001b[39m, thread_id: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m 845\u001b[0m runs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproject_client\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mruns\u001b[38;5;241m.\u001b[39mlist(thread_id\u001b[38;5;241m=\u001b[39mthread_id, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124masc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mrun\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mruns\u001b[49m\u001b[43m]\u001b[49m\n", + "\u001b[1;31mTypeError\u001b[0m: 'AsyncItemPaged' object is not iterable" + ] + } + ], + "source": [ + "from azure.ai.evaluation import AIAgentConverter\n", + "\n", + "converter = AIAgentConverter(project_client)\n", + "\n", + "file_name = \"evaluation_data.jsonl\"\n", + "# Save the agent thread data to a JSONL file (all turns)\n", + "evaluation_data = await converter.prepare_evaluation_data([thread.id], filename=file_name)\n", + "# print(json.dumps(evaluation_data, indent=4))\n", + "len(evaluation_data) # number of turns in the thread" + ] + }, + { + "cell_type": "markdown", + "id": "8bf87cab", + "metadata": {}, + "source": [ + "### Setting up evaluator\n", + "\n", + "We will select the following evaluators to assess the different aspects relevant for agent quality: \n", + "\n", + "- [Intent resolution](https://aka.ms/intentresolution-sample): measures the extent of which an agent identifies the correct intent from a user query. Scale: integer 1-5. 
Higher is better.\n", + "- [Tool call accuracy](https://aka.ms/toolcallaccuracy-sample): evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps. Scale: float 0-1. Higher is better.\n", + "- [Task adherence](https://aka.ms/taskadherence-sample): measures the extent of which an agent’s final response adheres to the task based on its system message and a user query. Scale: integer 1-5. Higher is better.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ee09df", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", + "Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. 
Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + } + ], + "source": [ + "import os\n", + "from pprint import pprint\n", + "\n", + "from azure.ai.evaluation import (\n", + " AzureOpenAIModelConfiguration,\n", + " IntentResolutionEvaluator,\n", + " TaskAdherenceEvaluator,\n", + " ToolCallAccuracyEvaluator,\n", + ")\n", + "\n", + "model_config = AzureOpenAIModelConfiguration(\n", + " azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n", + " api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", + " api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n", + " azure_deployment=os.environ[\"MODEL_DEPLOYMENT_NAME\"],\n", + ")\n", + "\n", + "intent_resolution = IntentResolutionEvaluator(model_config=model_config)\n", + "\n", + "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)\n", + "\n", + "task_adherence = TaskAdherenceEvaluator(model_config=model_config)" + ] + }, + { + "cell_type": "markdown", + "id": "a7a3d235", + "metadata": {}, + "source": [ + "### Run Evaluator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31eb7ecb", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.evaluation import evaluate\n", + "\n", + "response = evaluate(\n", + "    data=file_name,\n", + "    evaluators={\n", + "        \"tool_call_accuracy\": tool_call_accuracy,\n", + "        \"intent_resolution\": intent_resolution,\n", + "        \"task_adherence\": task_adherence,\n", + "    },\n", + "    azure_ai_project={\n", + "        \"subscription_id\": os.environ[\"AZURE_SUBSCRIPTION_ID\"],\n", + "        \"project_name\": os.environ[\"PROJECT_NAME\"],\n", + "        \"resource_group_name\": os.environ[\"RESOURCE_GROUP_NAME\"],\n", + "    },\n", + ")\n", + "pprint(f\"AI Foundry URL: {response.get('studio_url')}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac38d924", + "metadata": {}, + "source": [ + "## Inspect results on Azure AI Foundry\n", + "\n", + "Go to AI Foundry URL for rich Azure AI Foundry data visualization to inspect the evaluation scores
and reasoning to quickly identify bugs and issues of your agent to fix and improve." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225ae69a", + "metadata": {}, + "outputs": [], + "source": [ + "# alternatively, you can use the following to get the evaluation results in memory\n", + "\n", + "# average scores across all runs\n", + "pprint(response[\"metrics\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb b/python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb new file mode 100644 index 000000000000..07b0af451cac --- /dev/null +++ b/python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf5280e2", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# Evaluate Semantic Kernel Azure OpenAI Assistant Agents in Azure AI Foundry" + ] + }, + { + "cell_type": "markdown", + "id": "0330c099", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This sample demonstrates how to evaluate a Semantic Kernel Azure OpenAI Assistant agent in Azure AI Foundry. It provides a step-by-step guide to set up the environment, create an agent, and evaluate its performance." + ] + }, + { + "cell_type": "markdown", + "id": "b364c694", + "metadata": {}, + "source": [ + "## Time\n", + "You can expect to complete this sample in approximately 20 minutes."
+ ] + }, + { + "cell_type": "markdown", + "id": "919c6017", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "### Packages\n", + "- `semantic-kernel` installed (`pip install semantic-kernel`)\n", + "- `azure-ai-evaluation` SDK installed\n", + "- An Azure OpenAI resource with a deployment configured\n", + "\n", + "### Environment Variables\n", + "- For AzureChatService:\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_CHAT_DEPLOYMENT_NAME`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_VERSION=\"2025-04-01-preview\"`\n", + "- For evaluating agents:\n", + " - `PROJECT_CONNECTION_STRING`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_API_VERSION`\n", + " - `MODEL_DEPLOYMENT_NAME`\n", + "- For Azure AI Foundry (Bonus):\n", + " - `AZURE_SUBSCRIPTION_ID`\n", + " - `PROJECT_NAME`\n", + " - `RESOURCE_GROUP_NAME`" + ] + }, + { + "cell_type": "markdown", + "id": "ba1d6576", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Create an AzureAssistantAgent with a plugin - [reference](https://learn.microsoft.com/en-us/semantic-kernel/frameworks/agent/agent-types/assistant-agent?pivots=programming-language-python)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7dc6ce40", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Annotated\n", + "\n", + "from semantic_kernel.agents import AzureAssistantAgent\n", + "from semantic_kernel.connectors.ai.open_ai import AzureOpenAISettings\n", + "from semantic_kernel.functions import kernel_function\n", + "\n", + "\n", + "# Define a sample plugin for the sample\n", + "class MenuPlugin:\n", + " \"\"\"A sample Menu Plugin used for the concept sample.\"\"\"\n", + "\n", + " @kernel_function(description=\"Provides a list of specials from the menu.\")\n", + " def get_specials(self) -> Annotated[str, \"Returns the specials from the menu.\"]:\n", + " return \"\"\"\n", + " Special Soup: Clam
Chowder\n", + " Special Salad: Cobb Salad\n", + " Special Drink: Chai Tea\n", + " \"\"\"\n", + "\n", + " @kernel_function(description=\"Provides the price of the requested menu item.\")\n", + " def get_item_price(\n", + " self, menu_item: Annotated[str, \"The name of the menu item.\"]\n", + " ) -> Annotated[str, \"Returns the price of the menu item.\"]:\n", + " return \"$9.99\"\n", + "\n", + "\n", + "# Create an agent\n", + "client = AzureAssistantAgent.create_client()\n", + "definition = await client.beta.assistants.create(\n", + " model=AzureOpenAISettings().chat_deployment_name,\n", + " instructions=\"Answer questions about the menu.\",\n", + " name=\"Assistant\",\n", + ")\n", + "agent = AzureAssistantAgent(\n", + " client=client,\n", + " definition=definition,\n", + " plugins=[MenuPlugin()],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ca0a35a0", + "metadata": {}, + "source": [ + "### Invoke the agent" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3b7b9ba3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## User: Hello\n", + "## Assistant: Hello! How can I assist you today?\n", + "## User: What is the special soup?\n", + "## Assistant: The special soup is Clam Chowder. Would you like to know more about it or any other special items?\n", + "## User: What is the special drink?\n", + "## Assistant: The special drink is Chai Tea. Would you like more details or information about anything else on the menu?\n", + "## User: How much is it?\n", + "## Assistant: The price of the Chai Tea is $9.99. Is there anything else you would like to know?\n", + "## User: Thank you\n", + "## Assistant: You're welcome! If you have any more questions in the future, feel free to ask. 
Have a great day!\n" + ] + } + ], + "source": [ + "USER_INPUTS = [\n", + " \"Hello\",\n", + " \"What is the special soup?\",\n", + " \"What is the special drink?\",\n", + " \"How much is it?\",\n", + " \"Thank you\",\n", + "]\n", + "\n", + "thread = None\n", + "for user_input in USER_INPUTS:\n", + " print(f\"## User: {user_input}\")\n", + " response = await agent.get_response(messages=user_input, thread=thread)\n", + " print(f\"## {response.name}: {response.content}\")\n", + " thread = response.thread" + ] + }, + { + "cell_type": "markdown", + "id": "2586d3e5", + "metadata": {}, + "source": [ + "### Converter: Get data from agent" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7813b5eb", + "metadata": {}, + "outputs": [ + { + "ename": "ResourceNotFoundError", + "evalue": "(None) No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'.\nCode: None\nMessage: No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mResourceNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 15\u001b[0m\n\u001b[0;32m 13\u001b[0m file_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevaluation_data.jsonl\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;66;03m# Save the agent thread data to a JSONL file (all turns)\u001b[39;00m\n\u001b[1;32m---> 15\u001b[0m evaluation_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[43mconverter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_evaluation_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mthread\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# print(json.dumps(evaluation_data, indent=4))\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28mlen\u001b[39m(evaluation_data) \u001b[38;5;66;03m# number of turns in the thread\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:574\u001b[0m, in \u001b[0;36mAIAgentConverter.prepare_evaluation_data\u001b[1;34m(self, thread_ids, filename)\u001b[0m\n\u001b[0;32m 569\u001b[0m futures \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 570\u001b[0m executor\u001b[38;5;241m.\u001b[39msubmit(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_single_thread_evaluation_data, \u001b[38;5;28mstr\u001b[39m(thread_id), \u001b[38;5;28;01mNone\u001b[39;00m): thread_id\n\u001b[0;32m 571\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m thread_id \u001b[38;5;129;01min\u001b[39;00m thread_ids\n\u001b[0;32m 572\u001b[0m }\n\u001b[0;32m 573\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m future \u001b[38;5;129;01min\u001b[39;00m as_completed(futures):\n\u001b[1;32m--> 574\u001b[0m evaluations\u001b[38;5;241m.\u001b[39mextend(\u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 576\u001b[0m \u001b[38;5;66;03m# So, if we have the filename, we can write it to the file, which is expected to be a JSONL file.\u001b[39;00m\n\u001b[0;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m filename:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[0;32m 448\u001b[0m 
\u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m--> 449\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[0;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[0;32m 400\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 401\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[0;32m 402\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 403\u001b[0m \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[0;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 58\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:475\u001b[0m, in \u001b[0;36mAIAgentConverter._prepare_single_thread_evaluation_data\u001b[1;34m(self, thread_id, filename)\u001b[0m\n\u001b[0;32m 472\u001b[0m list_of_run_evaluations: List[\u001b[38;5;28mdict\u001b[39m] \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 474\u001b[0m \u001b[38;5;66;03m# These are all the run IDs.\u001b[39;00m\n\u001b[1;32m--> 475\u001b[0m run_ids \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data_retriever\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_list_run_ids_chronological\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthread_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 477\u001b[0m \u001b[38;5;66;03m# If there were no messages in the thread, we can return an empty list.\u001b[39;00m\n\u001b[0;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(run_ids) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File 
\u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:846\u001b[0m, in \u001b[0;36mFDPAgentDataRetriever._list_run_ids_chronological\u001b[1;34m(self, thread_id)\u001b[0m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_list_run_ids_chronological\u001b[39m(\u001b[38;5;28mself\u001b[39m, thread_id: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m 845\u001b[0m runs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproject_client\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mruns\u001b[38;5;241m.\u001b[39mlist(thread_id\u001b[38;5;241m=\u001b[39mthread_id, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124masc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mrun\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mruns\u001b[49m\u001b[43m]\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:846\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_list_run_ids_chronological\u001b[39m(\u001b[38;5;28mself\u001b[39m, thread_id: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m 845\u001b[0m runs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproject_client\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mruns\u001b[38;5;241m.\u001b[39mlist(thread_id\u001b[38;5;241m=\u001b[39mthread_id, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124masc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mrun\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mruns\u001b[49m\u001b[43m]\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\core\\paging.py:136\u001b[0m, in \u001b[0;36mItemPaged.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 134\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_page_iterator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_page_iterator \u001b[38;5;241m=\u001b[39m itertools\u001b[38;5;241m.\u001b[39mchain\u001b[38;5;241m.\u001b[39mfrom_iterable(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mby_page())\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_page_iterator\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\core\\paging.py:82\u001b[0m, in \u001b[0;36mPageIterator.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mStopIteration\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnd of paging\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 81\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 82\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_next\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontinuation_token\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 83\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m AzureError \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[0;32m 84\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m error\u001b[38;5;241m.\u001b[39mcontinuation_token:\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\agents\\operations\\_operations.py:2742\u001b[0m, in \u001b[0;36mRunsOperations.list..get_next\u001b[1;34m(_continuation_token)\u001b[0m\n\u001b[0;32m 2739\u001b[0m response \u001b[38;5;241m=\u001b[39m pipeline_response\u001b[38;5;241m.\u001b[39mhttp_response\n\u001b[0;32m 2741\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;241m200\u001b[39m]:\n\u001b[1;32m-> 2742\u001b[0m \u001b[43mmap_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstatus_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstatus_code\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_map\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2743\u001b[0m error 
\u001b[38;5;241m=\u001b[39m _failsafe_deserialize(_models\u001b[38;5;241m.\u001b[39mAgentV1Error, response\u001b[38;5;241m.\u001b[39mjson())\n\u001b[0;32m   2744\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m HttpResponseError(response\u001b[38;5;241m=\u001b[39mresponse, model\u001b[38;5;241m=\u001b[39merror)\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\core\\exceptions.py:163\u001b[0m, in \u001b[0;36mmap_error\u001b[1;34m(status_code, response, error_map)\u001b[0m\n\u001b[0;32m    161\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m    162\u001b[0m error \u001b[38;5;241m=\u001b[39m error_type(response\u001b[38;5;241m=\u001b[39mresponse)\n\u001b[1;32m--> 163\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\n", + "\u001b[1;31mResourceNotFoundError\u001b[0m: (None) No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'.\nCode: None\nMessage: No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'." + ] + } + ], + "source": [ + "import os\n", + "\n", + "from azure.ai.evaluation import AIAgentConverter\n", + "from azure.ai.projects import AIProjectClient\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "project_client = AIProjectClient(\n", + "    endpoint=os.environ[\"PROJECT_CONNECTION_STRING\"],\n", + "    credential=DefaultAzureCredential(),\n", + ")\n", + "converter = AIAgentConverter(project_client)\n", + "\n", + "file_name = \"evaluation_data.jsonl\"\n", + "# Save the agent thread data to a JSONL file (all turns)\n", + "evaluation_data = await converter.prepare_evaluation_data([thread.id], filename=file_name)\n", + "# print(json.dumps(evaluation_data, indent=4))\n", + "len(evaluation_data)  # number of turns in the thread" + ] + }, + { + "cell_type": "markdown", + "id": "8bf87cab", + "metadata": {}, + "source": [ + "### Setting up evaluator\n", + "\n", + "We will select the following evaluators to assess the different 
aspects relevant for agent quality: \n", + "\n", + "- [Intent resolution](https://aka.ms/intentresolution-sample): measures the extent to which an agent identifies the correct intent from a user query. Scale: integer 1-5. Higher is better.\n", + "- [Tool call accuracy](https://aka.ms/toolcallaccuracy-sample): evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps. Scale: float 0-1. Higher is better.\n", + "- [Task adherence](https://aka.ms/taskadherence-sample): measures the extent to which an agent’s final response adheres to the task based on its system message and a user query. Scale: integer 1-5. Higher is better.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ee09df", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", + "Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. 
Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + } + ], + "source": [ + "import os\n", + "from pprint import pprint\n", + "\n", + "from azure.ai.evaluation import (\n", + "    AzureOpenAIModelConfiguration,\n", + "    IntentResolutionEvaluator,\n", + "    TaskAdherenceEvaluator,\n", + "    ToolCallAccuracyEvaluator,\n", + ")\n", + "\n", + "model_config = AzureOpenAIModelConfiguration(\n", + "    azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n", + "    api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", + "    api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n", + "    azure_deployment=os.environ[\"MODEL_DEPLOYMENT_NAME\"],\n", + ")\n", + "\n", + "intent_resolution = IntentResolutionEvaluator(model_config=model_config)\n", + "\n", + "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)\n", + "\n", + "task_adherence = TaskAdherenceEvaluator(model_config=model_config)" + ] + }, + { + "cell_type": "markdown", + "id": "a7a3d235", + "metadata": {}, + "source": [ + "### Run Evaluator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31eb7ecb", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.evaluation import evaluate\n", + "\n", + "response = evaluate(\n", + "    data=file_name,\n", + "    evaluators={\n", + "        \"tool_call_accuracy\": tool_call_accuracy,\n", + "        \"intent_resolution\": intent_resolution,\n", + "        \"task_adherence\": task_adherence,\n", + "    },\n", + "    azure_ai_project={\n", + "        \"subscription_id\": os.environ[\"AZURE_SUBSCRIPTION_ID\"],\n", + "        \"project_name\": os.environ[\"PROJECT_NAME\"],\n", + "        \"resource_group_name\": os.environ[\"RESOURCE_GROUP_NAME\"],\n", + "    },\n", + ")\n", + "pprint(f\"AI Foundry URL: {response.get('studio_url')}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac38d924", + "metadata": {}, + "source": [ + "## Inspect results on Azure AI Foundry\n", + "\n", + "Go to AI Foundry URL for rich Azure AI Foundry data visualization to inspect the evaluation scores 
and reasoning to quickly identify bugs and issues of your agent to fix and improve." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225ae69a", + "metadata": {}, + "outputs": [], + "source": [ + "# alternatively, you can use the following to get the evaluation results in memory\n", + "\n", + "# average scores across all runs\n", + "pprint(response[\"metrics\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}