From 90bb419b3846c50d2b7d4745c3357a6c7d1e7969 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 25 Jul 2025 14:58:34 -0700 Subject: [PATCH] Add agent evaluation notebook samples --- .../Evaluate_SK_AzureAIAgent.ipynb | 360 +++++++++++++++++ ...valuate_SK_AzureOpenAIAssistantAgent.ipynb | 366 ++++++++++++++++++ 2 files changed, 726 insertions(+) create mode 100644 python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb create mode 100644 python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb diff --git a/python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb b/python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb new file mode 100644 index 000000000000..12cf364838af --- /dev/null +++ b/python/samples/getting_started/Evaluate_SK_AzureAIAgent.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf5280e2", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# Evaluate Semantic Kernel Azure AI Agents in Azure AI Foundry" + ] + }, + { + "cell_type": "markdown", + "id": "0330c099", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This sample demonstrates how to evaluate an AI agent (Azure AI Agent Service) on these important aspects of your agentic workflow:\n", + "\n", + "- Intent Resolution: Measures how well the agent identifies the user’s request, including how well it scopes the user’s intent, asks clarifying questions, and reminds end users of its scope of capabilities.\n", + "- Tool Call Accuracy: Evaluates the agent's ability to select the appropriate tools, and process correct parameters from previous steps.\n", + "- Task Adherence: Measures how well the agent’s response adheres to its assigned tasks, according to its system message and prior steps." + ] + }, + { + "cell_type": "markdown", + "id": "b364c694", + "metadata": {}, + "source": [ + "## Time\n", + "You can expect to complete this sample in approximately 20 minutes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "919c6017", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "### Packages\n", + "- `semantic-kernel` installed (`pip install semantic-kernel`)\n", + "- `azure-ai-evaluation` SDK installed\n", + "- An Azure OpenAI resource with a deployment configured\n", + "\n", + "### Environment Variables\n", + "- For AzureChatService:\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_CHAT_DEPLOYMENT_NAME`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_VERSION=\"2025-04-01-preview\"`\n", + "- For evaluating agents:\n", + " - `PROJECT_CONNECTION_STRING`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_API_VERSION`\n", + " - `MODEL_DEPLOYMENT_NAME`\n", + "- For Azure AI Foundry (Bonus):\n", + " - `AZURE_SUBSCRIPTION_ID`\n", + " - `PROJECT_NAME`\n", + " - `RESOURCE_GROUP_NAME`" + ] + }, + { + "cell_type": "markdown", + "id": "ba1d6576", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Create an AzureAIAgent with a plugin - [reference](https://learn.microsoft.com/en-us/semantic-kernel/frameworks/agent/agent-types/azure-ai-agent?pivots=programming-language-python)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7dc6ce40", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Annotated\n", + "\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "from semantic_kernel.agents import AzureAIAgent, AzureAIAgentSettings\n", + "from semantic_kernel.functions import kernel_function\n", + "\n", + "\n", + "# Define a sample plugin for the sample\n", + "class MenuPlugin:\n", + " \"\"\"A sample Menu Plugin used for the concept sample.\"\"\"\n", + "\n", + " @kernel_function(description=\"Provides a list of specials from the menu.\")\n", + " def get_specials(self) -> Annotated[str, \"Returns the specials from the menu.\"]:\n", + " return \"\"\"\n", + " Special Soup: Clam
Chowder\n", + " Special Salad: Cobb Salad\n", + " Special Drink: Chai Tea\n", + " \"\"\"\n", + "\n", + " @kernel_function(description=\"Provides the price of the requested menu item.\")\n", + " def get_item_price(\n", + " self, menu_item: Annotated[str, \"The name of the menu item.\"]\n", + " ) -> Annotated[str, \"Returns the price of the menu item.\"]:\n", + " return \"$9.99\"\n", + "\n", + "\n", + "# Create an agent\n", + "creds = DefaultAzureCredential()\n", + "project_client = AzureAIAgent.create_client(credential=creds)\n", + "\n", + "agent_definition = await project_client.agents.create_agent(\n", + " model=AzureAIAgentSettings().model_deployment_name,\n", + " name=\"Host\",\n", + " instructions=\"Answer questions about the menu.\",\n", + ")\n", + "agent = AzureAIAgent(\n", + " client=project_client,\n", + " definition=agent_definition,\n", + " plugins=[MenuPlugin()],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ca0a35a0", + "metadata": {}, + "source": [ + "### Invoke the agent" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3b7b9ba3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## User: Hello\n", + "## Host: Hello! How can I assist you today?\n", + "## User: What is the special soup?\n", + "## Host: The special soup is Clam Chowder. If you need more information or want to know about other specials, just let me know!\n", + "## User: What is the special drink?\n", + "## Host: The special drink is Chai Tea. If you have any more questions or need further assistance, feel free to ask!\n", + "## User: How much is it?\n", + "## Host: The special drink, Chai Tea, is priced at $9.99. If you have any more questions or need further assistance, just let me know!\n", + "## User: Thank you\n", + "## Host: You're welcome! If you need anything else, feel free to ask. 
Have a great day!\n" + ] + } + ], + "source": [ + "USER_INPUTS = [\n", + " \"Hello\",\n", + " \"What is the special soup?\",\n", + " \"What is the special drink?\",\n", + " \"How much is it?\",\n", + " \"Thank you\",\n", + "]\n", + "\n", + "thread = None\n", + "for user_input in USER_INPUTS:\n", + " print(f\"## User: {user_input}\")\n", + " response = await agent.get_response(messages=user_input, thread=thread)\n", + " print(f\"## {response.name}: {response.content}\")\n", + " thread = response.thread" + ] + }, + { + "cell_type": "markdown", + "id": "2586d3e5", + "metadata": {}, + "source": [ + "### Converter: Get data from agent" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7813b5eb", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "'AsyncItemPaged' object is not iterable", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m file_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevaluation_data.jsonl\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# Save the agent thread data to a JSONL file (all turns)\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m evaluation_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[43mconverter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_evaluation_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mthread\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# print(json.dumps(evaluation_data, 
indent=4))\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28mlen\u001b[39m(evaluation_data) \u001b[38;5;66;03m# number of turns in the thread\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:574\u001b[0m, in \u001b[0;36mAIAgentConverter.prepare_evaluation_data\u001b[1;34m(self, thread_ids, filename)\u001b[0m\n\u001b[0;32m 569\u001b[0m futures \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 570\u001b[0m executor\u001b[38;5;241m.\u001b[39msubmit(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_single_thread_evaluation_data, \u001b[38;5;28mstr\u001b[39m(thread_id), \u001b[38;5;28;01mNone\u001b[39;00m): thread_id\n\u001b[0;32m 571\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m thread_id \u001b[38;5;129;01min\u001b[39;00m thread_ids\n\u001b[0;32m 572\u001b[0m }\n\u001b[0;32m 573\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m future \u001b[38;5;129;01min\u001b[39;00m as_completed(futures):\n\u001b[1;32m--> 574\u001b[0m evaluations\u001b[38;5;241m.\u001b[39mextend(\u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 576\u001b[0m \u001b[38;5;66;03m# So, if we have the filename, we can write it to the file, which is expected to be a JSONL file.\u001b[39;00m\n\u001b[0;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m filename:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[0;32m 448\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m--> 449\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[0;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[0;32m 400\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 401\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[0;32m 402\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 403\u001b[0m \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[0;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 58\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:475\u001b[0m, in \u001b[0;36mAIAgentConverter._prepare_single_thread_evaluation_data\u001b[1;34m(self, thread_id, filename)\u001b[0m\n\u001b[0;32m 472\u001b[0m list_of_run_evaluations: List[\u001b[38;5;28mdict\u001b[39m] \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 474\u001b[0m \u001b[38;5;66;03m# These are all the run IDs.\u001b[39;00m\n\u001b[1;32m--> 475\u001b[0m run_ids \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data_retriever\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_list_run_ids_chronological\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthread_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 477\u001b[0m \u001b[38;5;66;03m# If there were no messages in the thread, we can return an empty list.\u001b[39;00m\n\u001b[0;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(run_ids) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:846\u001b[0m, in \u001b[0;36mFDPAgentDataRetriever._list_run_ids_chronological\u001b[1;34m(self, thread_id)\u001b[0m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21m_list_run_ids_chronological\u001b[39m(\u001b[38;5;28mself\u001b[39m, thread_id: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m 845\u001b[0m runs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproject_client\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mruns\u001b[38;5;241m.\u001b[39mlist(thread_id\u001b[38;5;241m=\u001b[39mthread_id, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124masc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mrun\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mruns\u001b[49m\u001b[43m]\u001b[49m\n", + "\u001b[1;31mTypeError\u001b[0m: 'AsyncItemPaged' object is not iterable" + ] + } + ], + "source": [ + "from azure.ai.evaluation import AIAgentConverter\n", + "\n", + "converter = AIAgentConverter(project_client)\n", + "\n", + "file_name = \"evaluation_data.jsonl\"\n", + "# Save the agent thread data to a JSONL file (all turns)\n", + "evaluation_data = await converter.prepare_evaluation_data([thread.id], filename=file_name)\n", + "# print(json.dumps(evaluation_data, indent=4))\n", + "len(evaluation_data) # number of turns in the thread" + ] + }, + { + "cell_type": "markdown", + "id": "8bf87cab", + "metadata": {}, + "source": [ + "### Setting up evaluator\n", + "\n", + "We will select the following evaluators to assess the different aspects relevant for agent quality: \n", + "\n", + "- [Intent resolution](https://aka.ms/intentresolution-sample): measures the extent of which an agent identifies the correct intent from a user query. Scale: integer 1-5. 
Higher is better.\n", + "- [Tool call accuracy](https://aka.ms/toolcallaccuracy-sample): evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps. Scale: float 0-1. Higher is better.\n", + "- [Task adherence](https://aka.ms/taskadherence-sample): measures the extent of which an agent’s final response adheres to the task based on its system message and a user query. Scale: integer 1-5. Higher is better.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ee09df", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", + "Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. 
Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + } + ], + "source": [ + "import os\n", + "from pprint import pprint\n", + "\n", + "from azure.ai.evaluation import (\n", + " AzureOpenAIModelConfiguration,\n", + " IntentResolutionEvaluator,\n", + " TaskAdherenceEvaluator,\n", + " ToolCallAccuracyEvaluator,\n", + ")\n", + "\n", + "model_config = AzureOpenAIModelConfiguration(\n", + " azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n", + " api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", + " api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n", + " azure_deployment=os.environ[\"MODEL_DEPLOYMENT_NAME\"],\n", + ")\n", + "\n", + "intent_resolution = IntentResolutionEvaluator(model_config=model_config)\n", + "\n", + "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)\n", + "\n", + "task_adherence = TaskAdherenceEvaluator(model_config=model_config)" + ] + }, + { + "cell_type": "markdown", + "id": "a7a3d235", + "metadata": {}, + "source": [ + "### Run Evaluator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31eb7ecb", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.evaluation import evaluate\n", + "\n", + "response = evaluate(\n", + "    data=file_name,\n", + "    evaluators={\n", + "        \"tool_call_accuracy\": tool_call_accuracy,\n", + "        \"intent_resolution\": intent_resolution,\n", + "        \"task_adherence\": task_adherence,\n", + "    },\n", + "    azure_ai_project={\n", + "        \"subscription_id\": os.environ[\"AZURE_SUBSCRIPTION_ID\"],\n", + "        \"project_name\": os.environ[\"PROJECT_NAME\"],\n", + "        \"resource_group_name\": os.environ[\"RESOURCE_GROUP_NAME\"],\n", + "    },\n", + ")\n", + "pprint(f\"AI Foundry URL: {response.get('studio_url')}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac38d924", + "metadata": {}, + "source": [ + "## Inspect results on Azure AI Foundry\n", + "\n", + "Go to AI Foundry URL for rich Azure AI Foundry data visualization to inspect the evaluation scores
and reasoning to quickly identify bugs and issues of your agent to fix and improve." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225ae69a", + "metadata": {}, + "outputs": [], + "source": [ + "# alternatively, you can use the following to get the evaluation results in memory\n", + "\n", + "# average scores across all runs\n", + "pprint(response[\"metrics\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb b/python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb new file mode 100644 index 000000000000..07b0af451cac --- /dev/null +++ b/python/samples/getting_started/Evaluate_SK_AzureOpenAIAssistantAgent.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf5280e2", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# Evaluate Semantic Kernel Azure OpenAI Assistant Agents in Azure AI Foundry" + ] + }, + { + "cell_type": "markdown", + "id": "0330c099", + "metadata": {}, + "source": [ + "## Objective\n", + "\n", + "This sample demonstrates how to evaluate a Semantic Kernel Azure OpenAI Assistant agent in Azure AI Foundry. It provides a step-by-step guide to set up the environment, create an agent, and evaluate its performance." + ] + }, + { + "cell_type": "markdown", + "id": "b364c694", + "metadata": {}, + "source": [ + "## Time\n", + "You can expect to complete this sample in approximately 20 minutes."
+ ] + }, + { + "cell_type": "markdown", + "id": "919c6017", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "### Packages\n", + "- `semantic-kernel` installed (`pip install semantic-kernel`)\n", + "- `azure-ai-evaluation` SDK installed\n", + "- An Azure OpenAI resource with a deployment configured\n", + "\n", + "### Environment Variables\n", + "- For AzureChatService:\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_CHAT_DEPLOYMENT_NAME`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_VERSION=\"2025-04-01-preview\"`\n", + "- For evaluating agents:\n", + " - `PROJECT_CONNECTION_STRING`\n", + " - `AZURE_OPENAI_ENDPOINT`\n", + " - `AZURE_OPENAI_API_KEY`\n", + " - `AZURE_OPENAI_API_VERSION`\n", + " - `MODEL_DEPLOYMENT_NAME`\n", + "- For Azure AI Foundry (Bonus):\n", + " - `AZURE_SUBSCRIPTION_ID`\n", + " - `PROJECT_NAME`\n", + " - `RESOURCE_GROUP_NAME`" + ] + }, + { + "cell_type": "markdown", + "id": "ba1d6576", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Create an AzureAssistantAgent with a plugin - [reference](https://learn.microsoft.com/en-us/semantic-kernel/frameworks/agent/agent-types/assistant-agent?pivots=programming-language-python)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7dc6ce40", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Annotated\n", + "\n", + "from semantic_kernel.agents import AzureAssistantAgent\n", + "from semantic_kernel.connectors.ai.open_ai import AzureOpenAISettings\n", + "from semantic_kernel.functions import kernel_function\n", + "\n", + "\n", + "# Define a sample plugin for the sample\n", + "class MenuPlugin:\n", + " \"\"\"A sample Menu Plugin used for the concept sample.\"\"\"\n", + "\n", + " @kernel_function(description=\"Provides a list of specials from the menu.\")\n", + " def get_specials(self) -> Annotated[str, \"Returns the specials from the menu.\"]:\n", + " return \"\"\"\n", + " Special Soup: Clam
Chowder\n", + " Special Salad: Cobb Salad\n", + " Special Drink: Chai Tea\n", + " \"\"\"\n", + "\n", + " @kernel_function(description=\"Provides the price of the requested menu item.\")\n", + " def get_item_price(\n", + " self, menu_item: Annotated[str, \"The name of the menu item.\"]\n", + " ) -> Annotated[str, \"Returns the price of the menu item.\"]:\n", + " return \"$9.99\"\n", + "\n", + "\n", + "# Create an agent\n", + "client = AzureAssistantAgent.create_client()\n", + "definition = await client.beta.assistants.create(\n", + " model=AzureOpenAISettings().chat_deployment_name,\n", + " instructions=\"Answer questions about the menu.\",\n", + " name=\"Assistant\",\n", + ")\n", + "agent = AzureAssistantAgent(\n", + " client=client,\n", + " definition=definition,\n", + " plugins=[MenuPlugin()],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ca0a35a0", + "metadata": {}, + "source": [ + "### Invoke the agent" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3b7b9ba3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## User: Hello\n", + "## Assistant: Hello! How can I assist you today?\n", + "## User: What is the special soup?\n", + "## Assistant: The special soup is Clam Chowder. Would you like to know more about it or any other special items?\n", + "## User: What is the special drink?\n", + "## Assistant: The special drink is Chai Tea. Would you like more details or information about anything else on the menu?\n", + "## User: How much is it?\n", + "## Assistant: The price of the Chai Tea is $9.99. Is there anything else you would like to know?\n", + "## User: Thank you\n", + "## Assistant: You're welcome! If you have any more questions in the future, feel free to ask. 
Have a great day!\n" + ] + } + ], + "source": [ + "USER_INPUTS = [\n", + " \"Hello\",\n", + " \"What is the special soup?\",\n", + " \"What is the special drink?\",\n", + " \"How much is it?\",\n", + " \"Thank you\",\n", + "]\n", + "\n", + "thread = None\n", + "for user_input in USER_INPUTS:\n", + " print(f\"## User: {user_input}\")\n", + " response = await agent.get_response(messages=user_input, thread=thread)\n", + " print(f\"## {response.name}: {response.content}\")\n", + " thread = response.thread" + ] + }, + { + "cell_type": "markdown", + "id": "2586d3e5", + "metadata": {}, + "source": [ + "### Converter: Get data from agent" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7813b5eb", + "metadata": {}, + "outputs": [ + { + "ename": "ResourceNotFoundError", + "evalue": "(None) No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'.\nCode: None\nMessage: No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mResourceNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 15\u001b[0m\n\u001b[0;32m 13\u001b[0m file_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mevaluation_data.jsonl\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;66;03m# Save the agent thread data to a JSONL file (all turns)\u001b[39;00m\n\u001b[1;32m---> 15\u001b[0m evaluation_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[43mconverter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_evaluation_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mthread\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# print(json.dumps(evaluation_data, indent=4))\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28mlen\u001b[39m(evaluation_data) \u001b[38;5;66;03m# number of turns in the thread\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:574\u001b[0m, in \u001b[0;36mAIAgentConverter.prepare_evaluation_data\u001b[1;34m(self, thread_ids, filename)\u001b[0m\n\u001b[0;32m 569\u001b[0m futures \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 570\u001b[0m executor\u001b[38;5;241m.\u001b[39msubmit(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_single_thread_evaluation_data, \u001b[38;5;28mstr\u001b[39m(thread_id), \u001b[38;5;28;01mNone\u001b[39;00m): thread_id\n\u001b[0;32m 571\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m thread_id \u001b[38;5;129;01min\u001b[39;00m thread_ids\n\u001b[0;32m 572\u001b[0m }\n\u001b[0;32m 573\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m future \u001b[38;5;129;01min\u001b[39;00m as_completed(futures):\n\u001b[1;32m--> 574\u001b[0m evaluations\u001b[38;5;241m.\u001b[39mextend(\u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 576\u001b[0m \u001b[38;5;66;03m# So, if we have the filename, we can write it to the file, which is expected to be a JSONL file.\u001b[39;00m\n\u001b[0;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m filename:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:449\u001b[0m, in \u001b[0;36mFuture.result\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[0;32m 448\u001b[0m 
\u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m--> 449\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[0;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\_base.py:401\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[0;32m 400\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 401\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[0;32m 402\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 403\u001b[0m \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[0;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Roaming\\uv\\python\\cpython-3.11.10-windows-x86_64-none\\Lib\\concurrent\\futures\\thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 58\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:475\u001b[0m, in \u001b[0;36mAIAgentConverter._prepare_single_thread_evaluation_data\u001b[1;34m(self, thread_id, filename)\u001b[0m\n\u001b[0;32m 472\u001b[0m list_of_run_evaluations: List[\u001b[38;5;28mdict\u001b[39m] \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 474\u001b[0m \u001b[38;5;66;03m# These are all the run IDs.\u001b[39;00m\n\u001b[1;32m--> 475\u001b[0m run_ids \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data_retriever\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_list_run_ids_chronological\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthread_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 477\u001b[0m \u001b[38;5;66;03m# If there were no messages in the thread, we can return an empty list.\u001b[39;00m\n\u001b[0;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(run_ids) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File 
\u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:846\u001b[0m, in \u001b[0;36mFDPAgentDataRetriever._list_run_ids_chronological\u001b[1;34m(self, thread_id)\u001b[0m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_list_run_ids_chronological\u001b[39m(\u001b[38;5;28mself\u001b[39m, thread_id: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m 845\u001b[0m runs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproject_client\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mruns\u001b[38;5;241m.\u001b[39mlist(thread_id\u001b[38;5;241m=\u001b[39mthread_id, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124masc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mrun\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mruns\u001b[49m\u001b[43m]\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\evaluation\\_converters\\_ai_services.py:846\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_list_run_ids_chronological\u001b[39m(\u001b[38;5;28mself\u001b[39m, thread_id: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m 845\u001b[0m runs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproject_client\u001b[38;5;241m.\u001b[39magents\u001b[38;5;241m.\u001b[39mruns\u001b[38;5;241m.\u001b[39mlist(thread_id\u001b[38;5;241m=\u001b[39mthread_id, order\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124masc\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 846\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mrun\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mruns\u001b[49m\u001b[43m]\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\core\\paging.py:136\u001b[0m, in \u001b[0;36mItemPaged.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 134\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_page_iterator \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_page_iterator \u001b[38;5;241m=\u001b[39m itertools\u001b[38;5;241m.\u001b[39mchain\u001b[38;5;241m.\u001b[39mfrom_iterable(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mby_page())\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_page_iterator\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\core\\paging.py:82\u001b[0m, in \u001b[0;36mPageIterator.__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mStopIteration\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnd of paging\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 81\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 82\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_next\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontinuation_token\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 83\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m AzureError \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[0;32m 84\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m error\u001b[38;5;241m.\u001b[39mcontinuation_token:\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\ai\\agents\\operations\\_operations.py:2742\u001b[0m, in \u001b[0;36mRunsOperations.list..get_next\u001b[1;34m(_continuation_token)\u001b[0m\n\u001b[0;32m 2739\u001b[0m response \u001b[38;5;241m=\u001b[39m pipeline_response\u001b[38;5;241m.\u001b[39mhttp_response\n\u001b[0;32m 2741\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;241m200\u001b[39m]:\n\u001b[1;32m-> 2742\u001b[0m \u001b[43mmap_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstatus_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstatus_code\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_map\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2743\u001b[0m error 
\u001b[38;5;241m=\u001b[39m _failsafe_deserialize(_models\u001b[38;5;241m.\u001b[39mAgentV1Error, response\u001b[38;5;241m.\u001b[39mjson())\n\u001b[0;32m   2744\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m HttpResponseError(response\u001b[38;5;241m=\u001b[39mresponse, model\u001b[38;5;241m=\u001b[39merror)\n", + "File \u001b[1;32mc:\\Users\\taochen\\Projects\\semantic-kernel\\python\\.venv\\Lib\\site-packages\\azure\\core\\exceptions.py:163\u001b[0m, in \u001b[0;36mmap_error\u001b[1;34m(status_code, response, error_map)\u001b[0m\n\u001b[0;32m    161\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m    162\u001b[0m error \u001b[38;5;241m=\u001b[39m error_type(response\u001b[38;5;241m=\u001b[39mresponse)\n\u001b[1;32m--> 163\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\n", + "\u001b[1;31mResourceNotFoundError\u001b[0m: (None) No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'.\nCode: None\nMessage: No thread found with id 'thread_PEshQVS2Kqv0t0uRRzmLvESb'." + ] + } + ], + "source": [ + "import os\n", + "\n", + "from azure.ai.evaluation import AIAgentConverter\n", + "from azure.ai.projects import AIProjectClient\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "project_client = AIProjectClient(\n", + "    endpoint=os.environ[\"PROJECT_CONNECTION_STRING\"],\n", + "    credential=DefaultAzureCredential(),\n", + ")\n", + "converter = AIAgentConverter(project_client)\n", + "\n", + "file_name = \"evaluation_data.jsonl\"\n", + "# Save the agent thread data to a JSONL file (all turns)\n", + "evaluation_data = await converter.prepare_evaluation_data([thread.id], filename=file_name)\n", + "# print(json.dumps(evaluation_data, indent=4))\n", + "len(evaluation_data)  # number of turns in the thread" + ] + }, + { + "cell_type": "markdown", + "id": "8bf87cab", + "metadata": {}, + "source": [ + "### Setting up evaluator\n", + "\n", + "We will select the following evaluators to assess the different 
aspects relevant for agent quality: \n", + "\n", + "- [Intent resolution](https://aka.ms/intentresolution-sample): measures the extent to which an agent identifies the correct intent from a user query. Scale: integer 1-5. Higher is better.\n", + "- [Tool call accuracy](https://aka.ms/toolcallaccuracy-sample): evaluates the agent’s ability to select the appropriate tools, and process correct parameters from previous steps. Scale: float 0-1. Higher is better.\n", + "- [Task adherence](https://aka.ms/taskadherence-sample): measures the extent to which an agent’s final response adheres to the task based on its system message and a user query. Scale: integer 1-5. Higher is better.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ee09df", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", + "Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. 
Please see https://aka.ms/azuremlexperimental for more information.\n" + ] + } + ], + "source": [ + "import os\n", + "from pprint import pprint\n", + "\n", + "from azure.ai.evaluation import (\n", + "    AzureOpenAIModelConfiguration,\n", + "    IntentResolutionEvaluator,\n", + "    TaskAdherenceEvaluator,\n", + "    ToolCallAccuracyEvaluator,\n", + ")\n", + "\n", + "model_config = AzureOpenAIModelConfiguration(\n", + "    azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n", + "    api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", + "    api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n", + "    azure_deployment=os.environ[\"MODEL_DEPLOYMENT_NAME\"],\n", + ")\n", + "\n", + "intent_resolution = IntentResolutionEvaluator(model_config=model_config)\n", + "\n", + "tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)\n", + "\n", + "task_adherence = TaskAdherenceEvaluator(model_config=model_config)" + ] + }, + { + "cell_type": "markdown", + "id": "a7a3d235", + "metadata": {}, + "source": [ + "### Run Evaluator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31eb7ecb", + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.evaluation import evaluate\n", + "\n", + "response = evaluate(\n", + "    data=file_name,\n", + "    evaluators={\n", + "        \"tool_call_accuracy\": tool_call_accuracy,\n", + "        \"intent_resolution\": intent_resolution,\n", + "        \"task_adherence\": task_adherence,\n", + "    },\n", + "    azure_ai_project={\n", + "        \"subscription_id\": os.environ[\"AZURE_SUBSCRIPTION_ID\"],\n", + "        \"project_name\": os.environ[\"PROJECT_NAME\"],\n", + "        \"resource_group_name\": os.environ[\"RESOURCE_GROUP_NAME\"],\n", + "    },\n", + ")\n", + "pprint(f\"AI Foundry URL: {response.get('studio_url')}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac38d924", + "metadata": {}, + "source": [ + "## Inspect results on Azure AI Foundry\n", + "\n", + "Go to AI Foundry URL for rich Azure AI Foundry data visualization to inspect the evaluation scores 
and reasoning to quickly identify bugs and issues of your agent to fix and improve." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225ae69a", + "metadata": {}, + "outputs": [], + "source": [ + "# alternatively, you can use the following to get the evaluation results in memory\n", + "\n", + "# average scores across all runs\n", + "pprint(response[\"metrics\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}