diff --git a/src/Infrastructure/BotSharp.Abstraction/Conversations/Enums/MessageTypeName.cs b/src/Infrastructure/BotSharp.Abstraction/Conversations/Enums/MessageTypeName.cs index c4e73d695..c13f26a81 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Conversations/Enums/MessageTypeName.cs +++ b/src/Infrastructure/BotSharp.Abstraction/Conversations/Enums/MessageTypeName.cs @@ -4,4 +4,6 @@ public static class MessageTypeName { public const string Plain = "plain"; public const string Notification = "notification"; + public const string FunctionCall = "function"; + public const string Audio = "audio"; } diff --git a/src/Infrastructure/BotSharp.Abstraction/Infrastructures/Enums/StateConst.cs b/src/Infrastructure/BotSharp.Abstraction/Infrastructures/Enums/StateConst.cs index c452b7abb..466893d3c 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Infrastructures/Enums/StateConst.cs +++ b/src/Infrastructure/BotSharp.Abstraction/Infrastructures/Enums/StateConst.cs @@ -10,4 +10,6 @@ public class StateConst public const string AGENT_REDIRECTION_REASON = "agent_redirection_reason"; public const string LANGUAGE = "language"; + + public const string SUB_CONVERSATION_ID = "sub_conversation_id"; } diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs index 5eebb5b39..c7134693b 100644 --- a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs +++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs @@ -5,8 +5,25 @@ namespace BotSharp.Abstraction.MLTasks; public interface IRealTimeCompletion { string Provider { get; } + string Model { get; } void SetModelName(string model); + Task Connect(RealtimeHubConnection conn, + Action onModelReady, + Action onModelAudioDeltaReceived, + Action onModelAudioResponseDone, + Action onAudioTranscriptDone, + Action onModelResponseDone, + Action onUserInterrupted); + Task AppenAudioBuffer(string message); + + Task SendEventToModel(object message); + Task Disconnect(); + Task CreateSession(Agent agent, List conversations); + Task UpdateInitialSession(RealtimeHubConnection conn); + Task InsertConversationItem(RoleDialogModel message); + Task TriggerModelInference(string? instructions = null); + Task> OnResponsedDone(RealtimeHubConnection conn, string response); } diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/IRealtimeHub.cs b/src/Infrastructure/BotSharp.Abstraction/Realtime/IRealtimeHub.cs new file mode 100644 index 000000000..67d0f18c3 --- /dev/null +++ b/src/Infrastructure/BotSharp.Abstraction/Realtime/IRealtimeHub.cs @@ -0,0 +1,12 @@ +using BotSharp.Abstraction.Realtime.Models; +using System.Net.WebSockets; + +namespace BotSharp.Abstraction.Realtime; + +/// +/// Realtime hub interface. Manage the WebSocket connection include User, Agent and Model. +/// +public interface IRealtimeHub +{ + Task Listen(WebSocket userWebSocket, Func onUserMessageReceived); +} diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs new file mode 100644 index 000000000..3e4a1f73e --- /dev/null +++ b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs @@ -0,0 +1,13 @@ +namespace BotSharp.Abstraction.Realtime.Models; + +public class RealtimeHubConnection +{ + public string Event { get; set; } = null!; + public string StreamId { get; set; } = null!; + public string ConversationId { get; set; } = null!; + public string Data { get; set; } = string.Empty; + public string Model { get; set; } = null!; + public Func OnModelMessageReceived { get; set; } = null!; + public Func OnModelAudioResponseDone { get; set; } = null!; + public Func OnModelUserInterrupted { get; set; } = null!; +} diff --git a/src/Infrastructure/BotSharp.Core/BotSharpCoreExtensions.cs b/src/Infrastructure/BotSharp.Core/BotSharpCoreExtensions.cs index d801e943b..742b39e9b 100644 --- a/src/Infrastructure/BotSharp.Core/BotSharpCoreExtensions.cs +++ b/src/Infrastructure/BotSharp.Core/BotSharpCoreExtensions.cs @@ -15,6 +15,8 @@ using BotSharp.Abstraction.Templating; using BotSharp.Core.Templating; using BotSharp.Abstraction.Infrastructures.Enums; +using BotSharp.Abstraction.Realtime; +using BotSharp.Core.Realtime; namespace BotSharp.Core; @@ -171,5 +173,7 @@ public static void RegisterPlugins(IServiceCollection services, IConfiguration c }); services.AddSingleton(loader); + + services.AddScoped(); } } diff --git a/src/Infrastructure/BotSharp.Core/Conversations/Services/ConversationService.cs b/src/Infrastructure/BotSharp.Core/Conversations/Services/ConversationService.cs index 0c02918e2..02bcab6e1 100644 --- a/src/Infrastructure/BotSharp.Core/Conversations/Services/ConversationService.cs +++ b/src/Infrastructure/BotSharp.Core/Conversations/Services/ConversationService.cs @@ -107,7 +107,7 @@ public async Task NewConversation(Conversation sess) record.Id = sess.Id.IfNullOrEmptyAs(Guid.NewGuid().ToString()); record.UserId = sess.UserId.IfNullOrEmptyAs(foundUserId); record.Tags = sess.Tags; - record.Title = "New Conversation"; + record.Title = string.IsNullOrEmpty(record.Title) ? "New Conversation" : record.Title; db.CreateNewConversation(record); diff --git a/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs b/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs new file mode 100644 index 000000000..d311fd25b --- /dev/null +++ b/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs @@ -0,0 +1,155 @@ +using BotSharp.Abstraction.Realtime; +using System.Net.WebSockets; +using System; +using BotSharp.Abstraction.Realtime.Models; +using BotSharp.Abstraction.MLTasks; +using BotSharp.Abstraction.Agents.Models; + +namespace BotSharp.Core.Realtime; + +public class RealtimeHub : IRealtimeHub +{ + private readonly IServiceProvider _services; + private readonly ILogger _logger; + public RealtimeHub(IServiceProvider services, ILogger logger) + { + _services = services; + _logger = logger; + } + + public async Task Listen(WebSocket userWebSocket, + Func onUserMessageReceived) + { + var buffer = new byte[1024 * 4]; + WebSocketReceiveResult result; + + var llmProviderService = _services.GetRequiredService(); + var model = llmProviderService.GetProviderModel("openai", "gpt-4", + realTime: true).Name; + + var completer = _services.GetServices().First(x => x.Provider == "openai"); + completer.SetModelName(model); + + do + { + result = await userWebSocket.ReceiveAsync(new ArraySegment(buffer), CancellationToken.None); + string receivedText = Encoding.UTF8.GetString(buffer, 0, result.Count); + _logger.LogDebug($"Received from user: {receivedText}"); + if (string.IsNullOrEmpty(receivedText)) + { + continue; + } + + var conn = onUserMessageReceived(receivedText); + conn.Model = model; + + if (conn.Event == "user_connected") + { + await ConnectToModel(completer, userWebSocket, conn); + } + else if (conn.Event == "user_data_received") + { + await completer.AppenAudioBuffer(conn.Data); + } + else if (conn.Event == "user_disconnected") + { + await completer.Disconnect(); + } + } while (!result.CloseStatus.HasValue); + + await userWebSocket.CloseAsync(result.CloseStatus.Value, result.CloseStatusDescription, CancellationToken.None); + } + + private async Task ConnectToModel(IRealTimeCompletion completer, WebSocket userWebSocket, RealtimeHubConnection conn) + { + var hookProvider = _services.GetRequiredService(); + var storage = _services.GetRequiredService(); + var convService = _services.GetRequiredService(); + convService.SetConversationId(conn.ConversationId, []); + var conversation = await convService.GetConversation(conn.ConversationId); + var agentService = _services.GetRequiredService(); + var agent = await agentService.LoadAgent(conversation.AgentId); + var routing = _services.GetRequiredService(); + var dialogs = convService.GetDialogHistory(); + routing.Context.SetDialogs(dialogs); + + await completer.Connect(conn, + onModelReady: async () => + { + // Control initial session + var data = await completer.UpdateInitialSession(conn); + await completer.SendEventToModel(data); + + // Add dialog history + foreach (var item in dialogs) + { + var dialogItem = await completer.InsertConversationItem(item); + await completer.SendEventToModel(data); + } + + if (dialogs.LastOrDefault()?.Role == AgentRole.Assistant) + { + await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}"); + } + else + { + await completer.TriggerModelInference("Reply based on the conversation context."); + } + }, + onModelAudioDeltaReceived: async audioDeltaData => + { + var data = conn.OnModelMessageReceived(audioDeltaData); + await SendEventToUser(userWebSocket, data); + }, + onModelAudioResponseDone: async () => + { + var data = conn.OnModelAudioResponseDone(); + await SendEventToUser(userWebSocket, data); + }, + onAudioTranscriptDone: async transcript => + { + var message = new RoleDialogModel(AgentRole.Assistant, transcript); + + // append transcript to conversation + storage.Append(conn.ConversationId, message); + + foreach (var hook in hookProvider.HooksOrderByPriority) + { + hook.SetAgent(agent) + .SetConversation(conversation); + + if (!string.IsNullOrEmpty(transcript)) + { + await hook.OnMessageReceived(message); + } + } + }, + onModelResponseDone: async response => + { + var messages = await completer.OnResponsedDone(conn, response); + foreach (var message in messages) + { + // Invoke function + if (message.FunctionName != null) + { + await routing.InvokeFunction(message.FunctionName, message); + var data = await completer.InsertConversationItem(message); + await completer.SendEventToModel(data); + await completer.TriggerModelInference("Reply based on the function's output."); + } + } + }, + onUserInterrupted: async () => + { + var data = conn.OnModelUserInterrupted(); + await SendEventToUser(userWebSocket, data); + }); + } + + private async Task SendEventToUser(WebSocket webSocket, object message) + { + var data = JsonSerializer.Serialize(message); + var buffer = Encoding.UTF8.GetBytes(data); + await webSocket.SendAsync(new ArraySegment(buffer), WebSocketMessageType.Text, true, CancellationToken.None); + } +} diff --git a/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs b/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs index fb074dfc4..4d4cd9a89 100644 --- a/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs +++ b/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs @@ -49,7 +49,7 @@ public async Task InvokeFunction(string name, RoleDialogModel message) } // Set result to original message - message.Role = clonedMessage.Role; + message.Role = AgentRole.Function; message.PostbackFunctionName = clonedMessage.PostbackFunctionName; message.CurrentAgentId = clonedMessage.CurrentAgentId; message.Content = clonedMessage.Content; diff --git a/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj b/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj index ef06b4d55..305a4197f 100644 --- a/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj +++ b/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj @@ -1,4 +1,4 @@ - + $(TargetFramework) @@ -47,6 +47,8 @@ + + diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/RealtimeSessionRequest.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs similarity index 59% rename from src/Plugins/BotSharp.Plugin.OpenAI/Models/RealtimeSessionRequest.cs rename to src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs index ce249e702..a5ede20f9 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Models/RealtimeSessionRequest.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs @@ -1,22 +1,39 @@ using BotSharp.Abstraction.Functions.Models; -using System.Text.Json.Serialization; -namespace BotSharp.Plugin.OpenAI.Models; +namespace BotSharp.Plugin.OpenAI.Models.Realtime; -public class RealtimeSessionRequest +public class RealtimeSessionBody { + [JsonPropertyName("id")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string Id { get; set; } = null!; + + [JsonPropertyName("object")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string Object { get; set; } = null!; + [JsonPropertyName("model")] - public string Model { get; set; } = "gpt-4o-mini-realtime-preview-2024-12-17"; + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string Model { get; set; } = null!; [JsonPropertyName("temperature")] - public float temperature { get; set; } = 0.8f; + public float Temperature { get; set; } = 0.8f; [JsonPropertyName("modalities")] public string[] Modalities { get; set; } = ["audio", "text"]; + [JsonPropertyName("input_audio_format")] + public string InputAudioFormat { get; set; } = "pcm16"; + + [JsonPropertyName("output_audio_format")] + public string OutputAudioFormat { get; set; } = "pcm16"; + [JsonPropertyName("instructions")] public string Instructions { get; set; } = "You are a friendly assistant."; + [JsonPropertyName("voice")] + public string Voice { get; set; } = "sage"; + [JsonPropertyName("max_response_output_tokens")] public int MaxResponseOutputTokens { get; set; } = 512; diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionRequest.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionRequest.cs new file mode 100644 index 000000000..b2125e996 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionRequest.cs @@ -0,0 +1,14 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class RealtimeSessionCreationRequest : RealtimeSessionBody +{ + +} + +/// +/// https://platform.openai.com/docs/api-reference/realtime-client-events/session/update +/// +public class RealtimeSessionUpdateRequest : RealtimeSessionBody +{ + +} \ No newline at end of file diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeSessionUpdate.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionUpdate.cs similarity index 76% rename from src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeSessionUpdate.cs rename to src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionUpdate.cs index 0d0afa1a6..a89928d83 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeSessionUpdate.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionUpdate.cs @@ -1,4 +1,6 @@ -namespace BotSharp.Abstraction.Realtime.Models; +using BotSharp.Abstraction.Realtime.Models; + +namespace BotSharp.Plugin.OpenAI.Models.Realtime; public class RealtimeSessionUpdate { diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseAudioDelta.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseAudioDelta.cs new file mode 100644 index 000000000..dbf299b25 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseAudioDelta.cs @@ -0,0 +1,19 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class ResponseAudioDelta : ServerEventResponse +{ + [JsonPropertyName("response_id")] + public string ResponseId { get; set; } = null!; + + [JsonPropertyName("item_id")] + public string ItemId { get; set; } = null!; + + [JsonPropertyName("output_index")] + public int OutputIndex { get; set; } + + [JsonPropertyName("content_index")] + public int ContentIndex { get; set; } + + [JsonPropertyName("delta")] + public string? Delta { get; set; } +} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseAudioTranscript.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseAudioTranscript.cs new file mode 100644 index 000000000..3f83ef1be --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseAudioTranscript.cs @@ -0,0 +1,19 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class ResponseAudioTranscript : ServerEventResponse +{ + [JsonPropertyName("response_id")] + public string ResponseId { get; set; } = null!; + + [JsonPropertyName("item_id")] + public string ItemId { get; set; } = null!; + + [JsonPropertyName("output_index")] + public int OutputIndex { get; set; } + + [JsonPropertyName("content_index")] + public int ContentIndex { get; set; } + + [JsonPropertyName("transcript")] + public string? Transcript { get; set; } +} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseDone.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseDone.cs new file mode 100644 index 000000000..ae3db58df --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ResponseDone.cs @@ -0,0 +1,102 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class ResponseDone : ServerEventResponse +{ + [JsonPropertyName("response")] + public ResponseDoneBody Body { get; set; } = new(); +} + +public class ResponseDoneBody +{ + [JsonPropertyName("id")] + public string Id { get; set; } = null!; + + [JsonPropertyName("object")] + public string Object { get; set; } = null!; + + [JsonPropertyName("status")] + public string Status { get; set; } = null!; + + [JsonPropertyName("status_details")] + public ResponseDoneStatusDetail StatusDetails { get; set; } = new(); + + [JsonPropertyName("conversation_id")] + public string ConversationId { get; set; } = null!; + + [JsonPropertyName("usage")] + public ModelTokenUsage Usage { get; set; } = new(); + + [JsonPropertyName("modalities")] + public string[] Modalities { get; set; } = []; + + [JsonPropertyName("temperature")] + public float Temperature { get; set; } + + [JsonPropertyName("output_audio_format")] + public string OutputAudioFormat { get; set; } = null!; + + [JsonPropertyName("voice")] + public string Voice { get; set; } = null!; + + [JsonPropertyName("output")] + public ModelResponseDoneOutput[] Outputs { get; set; } = []; +} + +public class ModelTokenUsage +{ + [JsonPropertyName("total_tokens")] + public int TotalTokens { get; set; } + + [JsonPropertyName("input_tokens")] + public int InputTokens { get; set; } + + [JsonPropertyName("output_tokens")] + public int OutputTokens { get; set; } +} + +public class ModelResponseDoneOutput +{ + [JsonPropertyName("id")] + public string Id { get; set; } = null!; + [JsonPropertyName("object")] + public string Object { get; set; } = null!; + + [JsonPropertyName("type")] + public string Type { get; set; } = null!; + + [JsonPropertyName("status")] + public string Status { get; set; } = null!; + + [JsonPropertyName("role")] + public string Role { get; set; } = null!; + + [JsonPropertyName("name")] + public string Name { get; set; } = null!; + + [JsonPropertyName("call_id")] + public string CallId { get; set; } = null!; + + [JsonPropertyName("arguments")] + public string Arguments { get; set; } = null!; + + [JsonPropertyName("content")] + public ResponseDoneOutputContent[] Content { get; set; } = []; +} + +public class ResponseDoneStatusDetail +{ + [JsonPropertyName("type")] + public string Type { get; set; } = null!; + + [JsonPropertyName("reason")] + public string Reason { get; set; } = null!; +} + +public class ResponseDoneOutputContent +{ + [JsonPropertyName("type")] + public string Type { get; set; } = null!; + + [JsonPropertyName("transcript")] + public string Transcript { get; set; } = null!; +} \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ServerEventErrorResponse.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ServerEventErrorResponse.cs new file mode 100644 index 000000000..f14b54376 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ServerEventErrorResponse.cs @@ -0,0 +1,19 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class ServerEventErrorResponse : ServerEventResponse +{ + [JsonPropertyName("error")] + public ServerEventErrorBody Body { get; set; } = new(); +} + +public class ServerEventErrorBody +{ + [JsonPropertyName("type")] + public string Type { get; set; } = null!; + + [JsonPropertyName("code")] + public string Code { get; set; } = null!; + + [JsonPropertyName("message")] + public string? Message { get; set; } +} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ServerEventResponse.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ServerEventResponse.cs new file mode 100644 index 000000000..921c4c562 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ServerEventResponse.cs @@ -0,0 +1,10 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class ServerEventResponse +{ + [JsonPropertyName("event_id")] + public string EventId { get; set; } = null!; + + [JsonPropertyName("type")] + public string Type { get; set; } = null!; +} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/SessionServerEventResponse.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/SessionServerEventResponse.cs new file mode 100644 index 000000000..fb1d08f63 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/SessionServerEventResponse.cs @@ -0,0 +1,7 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class SessionServerEventResponse : ServerEventResponse +{ + [JsonPropertyName("session")] + public RealtimeSessionBody Session { get; set; } = null!; +} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/IOpenAiRealtimeApi.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/IOpenAiRealtimeApi.cs index c26ce46d9..68382dff9 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/IOpenAiRealtimeApi.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/IOpenAiRealtimeApi.cs @@ -1,4 +1,5 @@ using BotSharp.Abstraction.Realtime.Models; +using BotSharp.Plugin.OpenAI.Models.Realtime; using Refit; namespace BotSharp.Plugin.OpenAI.Providers.Realtime; @@ -6,5 +7,5 @@ namespace BotSharp.Plugin.OpenAI.Providers.Realtime; public interface IOpenAiRealtimeApi { [Post("/v1/realtime/sessions")] - Task GetSessionAsync(RealtimeSessionRequest model, [Authorize("Bearer")] string token); + Task GetSessionAsync(RealtimeSessionCreationRequest model, [Authorize("Bearer")] string token); } diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs index eb678c5d8..2a59b4420 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs @@ -1,20 +1,29 @@ using BotSharp.Abstraction.Files.Utilities; using BotSharp.Abstraction.Functions.Models; using BotSharp.Abstraction.Realtime.Models; +using BotSharp.Plugin.OpenAI.Models.Realtime; using OpenAI.Chat; +using System.Net.WebSockets; +using System.Text; using System.Text.Json; +using System.Threading; namespace BotSharp.Plugin.OpenAI.Providers.Realtime; +/// +/// Reference to https://platform.openai.com/docs/api-reference/realtime-server-events +/// public class RealTimeCompletionProvider : IRealTimeCompletion { public string Provider => "openai"; + public string Model => _model; protected readonly OpenAiSettings _settings; protected readonly IServiceProvider _services; protected readonly ILogger _logger; protected string _model = "gpt-4o-mini-realtime-preview-2024-12-17"; + private ClientWebSocket _webSocket; public RealTimeCompletionProvider( OpenAiSettings settings, @@ -26,6 +35,163 @@ public RealTimeCompletionProvider( _services = services; } + public async Task Connect(RealtimeHubConnection conn, + Action onModelReady, + Action onModelAudioDeltaReceived, + Action onModelAudioResponseDone, + Action onAudioTranscriptDone, + Action onModelResponseDone, + Action onUserInterrupted) + { + var settingsService = _services.GetRequiredService(); + var settings = settingsService.GetSetting(provider: "openai", conn.Model); + + _webSocket = new ClientWebSocket(); + _webSocket.Options.SetRequestHeader("Authorization", $"Bearer {settings.ApiKey}"); + _webSocket.Options.SetRequestHeader("OpenAI-Beta", "realtime=v1"); + + await _webSocket.ConnectAsync(new Uri($"wss://api.openai.com/v1/realtime?model={conn.Model}"), CancellationToken.None); + + if (_webSocket.State == WebSocketState.Open) + { + onModelReady(); + + // Receive a message + _ = ReceiveMessage(onModelAudioDeltaReceived, + onModelAudioResponseDone, + onAudioTranscriptDone, + onModelResponseDone, + onUserInterrupted); + } + } + + public async Task Disconnect() + { + await _webSocket.CloseAsync(WebSocketCloseStatus.Empty, null, CancellationToken.None); + } + + public async Task AppenAudioBuffer(string message) + { + var audioAppend = new + { + type = "input_audio_buffer.append", + audio = message + }; + + await SendEventToModel(audioAppend); + } + + public async Task TriggerModelInference(string? instructions = null) + { + // Triggering model inference + await SendEventToModel(new + { + type = "response.create", + response = new + { + instructions + } + }); + } + + private async Task ReceiveMessage(Action onModelAudioDeltaReceived, + Action onModelAudioResponseDone, + Action onAudioTranscriptDone, + Action onModelResponseDone, + Action onUserInterrupted) + { + var buffer = new byte[1024 * 1024 * 1]; + WebSocketReceiveResult result; + string lastAssistantItem = ""; + do + { + result = await _webSocket.ReceiveAsync( + new ArraySegment(buffer), CancellationToken.None); + + // Convert received data to text/audio (Twilio sends Base64-encoded audio) + string receivedText = Encoding.UTF8.GetString(buffer, 0, result.Count); + if (string.IsNullOrEmpty(receivedText)) + { + continue; + } + _logger.LogDebug($"{nameof(RealTimeCompletionProvider)} received: {receivedText}"); + var response = JsonSerializer.Deserialize(receivedText); + + if (response.Type == "error") + { + var error = JsonSerializer.Deserialize(receivedText); + _logger.LogError($"Error: {error.Body.Message}"); + } + else if (response.Type == "session.created") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + } + else if (response.Type == "session.updated") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + } + else if (response.Type == "response.audio_transcript.delta") + { + + } + else if (response.Type == "response.audio_transcript.done") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + var data = JsonSerializer.Deserialize(receivedText); + onAudioTranscriptDone(data.Transcript); + } + else if (response.Type == "response.audio.delta") + { + var audio = JsonSerializer.Deserialize(receivedText); + lastAssistantItem = audio?.ItemId ?? ""; + + if (audio != null && audio.Delta != null) + { + onModelAudioDeltaReceived(audio.Delta); + } + } + else if (response.Type == "response.audio.done") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + onModelAudioResponseDone(); + } + else if (response.Type == "response.done") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + onModelResponseDone(receivedText); + } + else if (response.Type == "input_audio_buffer.speech_started") + { + // var elapsedTime = latestMediaTimestamp - responseStartTimestampTwilio; + // handle use interuption + var truncateEvent = new + { + type = "conversation.item.truncate", + item_id = lastAssistantItem, + content_index = 0, + audio_end_ms = 100 + }; + + await SendEventToModel(truncateEvent); + onUserInterrupted(); + } + + } while (!result.CloseStatus.HasValue); + + await _webSocket.CloseAsync(result.CloseStatus.Value, result.CloseStatusDescription, CancellationToken.None); + } + + public async Task SendEventToModel(object message) + { + if (message is not string data) + { + data = JsonSerializer.Serialize(message); + } + + var buffer = Encoding.UTF8.GetBytes(data); + await _webSocket.SendAsync(new ArraySegment(buffer), WebSocketMessageType.Text, true, CancellationToken.None); + } + public async Task CreateSession(Agent agent, List conversations) { var contentHooks = _services.GetServices().ToList(); @@ -34,9 +200,11 @@ public async Task CreateSession(Agent agent, List { @@ -58,6 +226,93 @@ public async Task CreateSession(Agent agent, List UpdateInitialSession(RealtimeHubConnection conn) + { + var convService = _services.GetRequiredService(); + var conv = await convService.GetConversation(conn.ConversationId); + + var agentService = _services.GetRequiredService(); + var agent = await agentService.LoadAgent(conv.AgentId); + + var client = ProviderHelper.GetClient(Provider, _model, _services); + var chatClient = client.GetChatClient(_model); + var (prompt, messages, options) = PrepareOptions(agent, []); + + var instruction = messages.FirstOrDefault()?.Content.FirstOrDefault()?.Text ?? agent.Description; + + var sessionUpdate = new + { + type = "session.update", + session = new RealtimeSessionUpdateRequest + { + InputAudioFormat = "g711_ulaw", + OutputAudioFormat = "g711_ulaw", + Voice = "alloy", + Instructions = instruction, + ToolChoice = "auto", + Tools = options.Tools.Select(x => + { + var fn = new FunctionDef + { + Name = x.FunctionName, + Description = x.FunctionDescription + }; + fn.Parameters = JsonSerializer.Deserialize(x.FunctionParameters); + return fn; + }).ToArray(), + Modalities = [ "text", "audio" ], + Temperature = Math.Max(options.Temperature ?? 0f, 0.6f) + } + }; + + return JsonSerializer.Serialize(sessionUpdate); + } + + public async Task InsertConversationItem(RoleDialogModel message) + { + if (message.Role == AgentRole.Function) + { + var functionConversationItem = new + { + type = "conversation.item.create", + item = new + { + call_id = message.ToolCallId, + type = "function_call_output", + output = message.Content + } + }; + return JsonSerializer.Serialize(functionConversationItem); + } + else if (message.Role == AgentRole.User || + message.Role == AgentRole.Assistant) + { + var conversationItem = new + { + type = "conversation.item.create", + item = new + { + type = "message", + role = message.Role, + content = new object[] + { + new + { + type = "text", + text = message.Content + } + } + } + }; + + return JsonSerializer.Serialize(conversationItem); + } + else + { + throw new NotImplementedException(""); + } + } + protected (string, IEnumerable, ChatCompletionOptions) PrepareOptions(Agent agent, List conversations) { var agentService = _services.GetRequiredService(); @@ -171,7 +426,6 @@ public async Task CreateSession(Agent agent, List messages, ChatCompletionOptions options) { var prompt = string.Empty; @@ -243,4 +497,26 @@ public void SetModelName(string model) { _model = model; } + + public async Task> OnResponsedDone(RealtimeHubConnection conn, string response) + { + var outputs = new List(); + + var data = JsonSerializer.Deserialize(response).Body; + foreach (var output in data.Outputs) + { + if (output.Type == "function_call") + { + outputs.Add(new RoleDialogModel(AgentRole.Assistant, output.Arguments) + { + FunctionName = output.Name, + FunctionArgs = output.Arguments, + MessageType = output.Type, + ToolCallId = output.CallId + }); + } + } + + return outputs; + } } diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Using.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Using.cs index ac1022a23..e8e458c2b 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Using.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Using.cs @@ -3,8 +3,11 @@ global using System.Linq; global using System.IO; global using System.Threading.Tasks; +global using System.Text.Json.Serialization; + global using Microsoft.Extensions.DependencyInjection; global using Microsoft.Extensions.Logging; + global using BotSharp.Abstraction.Agents.Enums; global using BotSharp.Abstraction.Agents.Constants; global using BotSharp.Abstraction.Agents.Models; @@ -17,4 +20,4 @@ global using BotSharp.Abstraction.Files.Models; global using BotSharp.Abstraction.Utilities; global using BotSharp.Plugin.OpenAI.Models; -global using BotSharp.Plugin.OpenAI.Settings; \ No newline at end of file +global using BotSharp.Plugin.OpenAI.Settings; diff --git a/src/Plugins/BotSharp.Plugin.Twilio/BotSharp.Plugin.Twilio.csproj b/src/Plugins/BotSharp.Plugin.Twilio/BotSharp.Plugin.Twilio.csproj index aff07e81b..91c81e90b 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/BotSharp.Plugin.Twilio.csproj +++ b/src/Plugins/BotSharp.Plugin.Twilio/BotSharp.Plugin.Twilio.csproj @@ -25,8 +25,8 @@ - - + + diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs new file mode 100644 index 000000000..e398ff722 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs @@ -0,0 +1,108 @@ +using BotSharp.Abstraction.Infrastructures; +using BotSharp.Core.Infrastructures; +using BotSharp.Plugin.Twilio.Interfaces; +using BotSharp.Plugin.Twilio.Models; +using BotSharp.Plugin.Twilio.Services; +using Microsoft.AspNetCore.Http; +using Microsoft.AspNetCore.Mvc; +using Twilio.TwiML.Voice; +using Conversation = BotSharp.Abstraction.Conversations.Models.Conversation; +using Task = System.Threading.Tasks.Task; + +namespace BotSharp.Plugin.Twilio.Controllers; + +public class TwilioStreamController : TwilioController +{ + private readonly TwilioSetting _settings; + private readonly IServiceProvider _services; + private readonly IHttpContextAccessor _context; + private readonly ILogger _logger; + + public TwilioStreamController(TwilioSetting settings, IServiceProvider services, IHttpContextAccessor context, ILogger logger) + { + _settings = settings; + _services = services; + _context = context; + _logger = logger; + } + + [ValidateRequest] + [HttpPost("twilio/stream")] + public async Task InitiateStreamConversation(ConversationalVoiceRequest request) + { + var text = JsonSerializer.Serialize(request); + if (request?.CallSid == null) + { + throw new ArgumentNullException(nameof(VoiceRequest.CallSid)); + } + + VoiceResponse response = null; + var instruction = new ConversationalVoiceResponse + { + // SpeechPaths = ["twilio/welcome.mp3"], + ActionOnEmptyResult = true + }; + + if (_context.HttpContext.Request.Query.ContainsKey("conversation_id")) + { + request.ConversationId = _context.HttpContext.Request.Query["conversation_id"]; + } + else + { + request.ConversationId = request.CallSid; + } + + await HookEmitter.Emit(_services, async hook => + { + await hook.OnSessionCreating(request, instruction); + }, new HookEmitOption + { + OnlyOnce = true + }); + + await InitConversation(request); + + var twilio = _services.GetRequiredService(); + + response = twilio.ReturnBidirectionalMediaStreamsInstructions(request.ConversationId, instruction); + + await HookEmitter.Emit(_services, async hook => + { + await hook.OnSessionCreated(request); + }, new HookEmitOption + { + OnlyOnce = true + }); + + return TwiML(response); + } + + private async Task InitConversation(ConversationalVoiceRequest request) + { + var convService = _services.GetRequiredService(); + var conversation = await convService.GetConversation(request.ConversationId); + if (conversation != null) + { + return; + } + + var states = new List + { + new("channel", ConversationChannel.Phone), + new("calling_phone", request.From) + }; + + var conv = new Conversation + { + Id = request.CallSid, + AgentId = _settings.AgentId, + Channel = ConversationChannel.Phone, + Title = $"Phone call from {request.From}", + Tags = [], + }; + + conv = await convService.NewConversation(conv); + convService.SetConversationId(conv.Id, states); + convService.SaveStates(); + } +} diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs index 5ef088e39..191446731 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs @@ -38,6 +38,13 @@ public TwilioVoiceController(TwilioSetting settings, IServiceProvider services, [HttpPost("twilio/voice/welcome")] public async Task InitiateConversation(ConversationalVoiceRequest request) { + foreach(var header in Request.Headers) + { + _logger.LogWarning($"{header.Key}: {header.Value}"); + } + + _logger.LogWarning($"{Request.Path}{Request.QueryString}"); + var text = JsonSerializer.Serialize(request); if (request?.CallSid == null) { @@ -101,7 +108,7 @@ await HookEmitter.Emit(_services, async hook => /// /// /// - [ValidateRequest] + // [ValidateRequest] [HttpPost("twilio/voice/{conversationId}/receive/{seqNum}")] public async Task ReceiveCallerMessage(ConversationalVoiceRequest request) { @@ -195,7 +202,7 @@ await HookEmitter.Emit(_services, async hook => /// /// /// - [ValidateRequest] + // [ValidateRequest] [HttpPost("twilio/voice/{conversationId}/reply/{seqNum}")] public async Task ReplyCallerMessage(ConversationalVoiceRequest request) { @@ -360,7 +367,7 @@ await HookEmitter.Emit(_services, async hook => return TwiML(response); } - [ValidateRequest] + // [ValidateRequest] [HttpPost("twilio/voice/init-call")] public TwiMLResult InitiateOutboundCall(VoiceRequest request, [Required][FromQuery] string conversationId) { @@ -381,7 +388,7 @@ public TwiMLResult InitiateOutboundCall(VoiceRequest request, [Required][FromQue return TwiML(response); } - [ValidateRequest] + // [ValidateRequest] [HttpGet("twilio/voice/speeches/{conversationId}/{fileName}")] public async Task GetSpeechFile([FromRoute] string conversationId, [FromRoute] string fileName) { diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioSessionHook.cs b/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioSessionHook.cs index 775d4b76e..29d4bc7c8 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioSessionHook.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioSessionHook.cs @@ -1,3 +1,4 @@ +using BotSharp.Abstraction.Realtime.Models; using BotSharp.Plugin.Twilio.Models; using Task = System.Threading.Tasks.Task; @@ -23,6 +24,9 @@ Task OnSessionCreating(ConversationalVoiceRequest request, ConversationalVoiceRe Task OnSessionCreated(ConversationalVoiceRequest request) => Task.CompletedTask; + Task OnStreamingStarted(RealtimeHubConnection conn) + => Task.CompletedTask; + /// /// On received user message /// diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventMediaResponse.cs b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventMediaResponse.cs new file mode 100644 index 000000000..5ad291b75 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventMediaResponse.cs @@ -0,0 +1,24 @@ +using System.Text.Json.Serialization; + +namespace BotSharp.Plugin.Twilio.Models.Stream; + +public class StreamEventMediaResponse : StreamEventResponse +{ + [JsonPropertyName("media")] + public StreamEventMediaBody Body { get; set; } +} + +public class StreamEventMediaBody +{ + [JsonPropertyName("track")] + public string Track { get; set; } + + [JsonPropertyName("chunk")] + public string Chunk { get; set; } + + [JsonPropertyName("timestamp")] + public string Timestamp { get; set; } + + [JsonPropertyName("payload")] + public string Payload { get; set; } +} \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventResponse.cs b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventResponse.cs new file mode 100644 index 000000000..5be7aa606 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventResponse.cs @@ -0,0 +1,18 @@ +using System.Text.Json.Serialization; + +namespace BotSharp.Plugin.Twilio.Models.Stream; + +public class StreamEventResponse +{ + /// + /// connected, start, media, stop + /// + [JsonPropertyName("event")] + public string Event { get; set; } + + [JsonPropertyName("sequenceNumber")] + public string SequenceNumber { get; set; } + + [JsonPropertyName("streamSid")] + public string StreamSid { get; set; } +} diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventStartResponse.cs b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventStartResponse.cs new file mode 100644 index 000000000..28b818d28 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventStartResponse.cs @@ -0,0 +1,24 @@ +using System.Text.Json.Serialization; + +namespace BotSharp.Plugin.Twilio.Models.Stream; + +public class StreamEventStartResponse : StreamEventResponse +{ + [JsonPropertyName("start")] + public StreamEventStartBody Body { get; set; } +} + +public class StreamEventStartBody +{ + [JsonPropertyName("accountSid")] + public string AccountSid { get; set; } + + [JsonPropertyName("callSid")] + public string CallSid { get; set; } + + [JsonPropertyName("tracks")] + public string[] Tracks { get; set; } + + [JsonPropertyName("customParameters")] + public JsonDocument CustomParameters { get; set; } +} diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventStopResponse.cs b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventStopResponse.cs new file mode 100644 index 000000000..7ff6f69a2 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/StreamEventStopResponse.cs @@ -0,0 +1,24 @@ +using System.Text.Json.Serialization; + +namespace BotSharp.Plugin.Twilio.Models.Stream; + +public class StreamEventStopResponse : StreamEventResponse +{ + [JsonPropertyName("sequenceNumber")] + public string SequenceNumber { get; set; } + + [JsonPropertyName("streamSid")] + public string StreamSid { get; set; } + + [JsonPropertyName("stop")] + public StreamEventStopBody Body { get; set; } +} + +public class StreamEventStopBody +{ + [JsonPropertyName("accountSid")] + public string AccountSid { get; set; } + + [JsonPropertyName("callSid")] + public string CallSid { get; set; } +} \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/TwilioHubCallerContext.cs b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/TwilioHubCallerContext.cs new file mode 100644 index 000000000..11e9c774f --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Models/Stream/TwilioHubCallerContext.cs @@ -0,0 +1,37 @@ +using Microsoft.AspNetCore.Http.Features; +using Microsoft.AspNetCore.SignalR; +using System.Security.Claims; +using System.Threading; + +namespace BotSharp.Plugin.Twilio.Models.Stream; + +public class TwilioHubCallerContext : HubCallerContext +{ + private readonly HubConnectionContext _connection; + + public TwilioHubCallerContext(HubConnectionContext connection) + { + _connection = connection; + } + + /// + public override string ConnectionId => _connection.ConnectionId; + + /// + public override string? UserIdentifier => _connection.UserIdentifier; + + /// + public override ClaimsPrincipal? User => _connection.User; + + /// + public override IDictionary Items => _connection.Items; + + /// + public override IFeatureCollection Features => _connection.Features; + + /// + public override CancellationToken ConnectionAborted => _connection.ConnectionAborted; + + /// + public override void Abort() => _connection.Abort(); +} diff --git a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs index b032d8edf..ea9b08a1f 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs @@ -1,4 +1,5 @@ using BotSharp.Abstraction.Files; +using BotSharp.Abstraction.Infrastructures.Enums; using BotSharp.Abstraction.Options; using BotSharp.Abstraction.Routing; using BotSharp.Core.Infrastructures; @@ -55,6 +56,7 @@ public async Task Execute(RoleDialogModel message) var routing = _services.GetRequiredService(); var fileStorage = _services.GetRequiredService(); var sessionManager = _services.GetRequiredService(); + var states = _services.GetRequiredService(); // Fork conversation var entryAgentId = routing.EntryAgentId; @@ -75,9 +77,10 @@ public async Task Execute(RoleDialogModel message) CurrentAgentId = entryAgentId } }); + states.SetState(StateConst.SUB_CONVERSATION_ID, conversationId); // Generate audio - var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); + /*var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); var data = await completion.GenerateAudioFromTextAsync(args.InitialMessage); var fileName = $"intial.mp3"; fileStorage.SaveSpeechFile(conversationId, fileName, data); @@ -87,16 +90,17 @@ public async Task Execute(RoleDialogModel message) { Content = args.InitialMessage, SpeechFileName = fileName - }); + });*/ var call = await CallResource.CreateAsync( - url: new Uri($"{_twilioSetting.CallbackHost}/twilio/voice/init-call?conversationId={conversationId}"), + // url: new Uri($"{_twilioSetting.CallbackHost}/twilio/voice/init-call?conversationId={conversationId}"), + url: new Uri($"{_twilioSetting.CallbackHost}/twilio/stream?conversation_id={conversationId}"), to: new PhoneNumber(args.PhoneNumber), from: new PhoneNumber(_twilioSetting.PhoneNumber), asyncAmd: "true", machineDetection: "DetectMessageEnd"); - message.Content = $"The generated phone message: {args.InitialMessage}. \r\n[Conversation ID: {conversationId}]" ?? message.Content; + message.Content = $"The generated phone message: {args.InitialMessage}." ?? message.Content; message.StopCompletion = true; return true; } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Services/Stream/TwilioStreamMiddleware.cs b/src/Plugins/BotSharp.Plugin.Twilio/Services/Stream/TwilioStreamMiddleware.cs new file mode 100644 index 000000000..492b5dcf7 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Services/Stream/TwilioStreamMiddleware.cs @@ -0,0 +1,114 @@ +using BotSharp.Abstraction.Realtime; +using BotSharp.Abstraction.Realtime.Models; +using BotSharp.Core.Infrastructures; +using BotSharp.Plugin.Twilio.Interfaces; +using BotSharp.Plugin.Twilio.Models.Stream; +using Microsoft.AspNetCore.Http; +using System.Net.WebSockets; +using Task = System.Threading.Tasks.Task; + +namespace BotSharp.Plugin.Twilio.Services.Stream; + +/// +/// Refrence to https://github.com/twilio-samples/speech-assistant-openai-realtime-api-node/blob/main/index.js +/// +public class TwilioStreamMiddleware +{ + private readonly RequestDelegate _next; + + public TwilioStreamMiddleware(RequestDelegate next) + { + _next = next; + } + + public async Task Invoke(HttpContext httpContext) + { + var request = httpContext.Request; + + if (request.Path.StartsWithSegments("/twilio/stream")) + { + if (httpContext.WebSockets.IsWebSocketRequest) + { + var services = httpContext.RequestServices; + var conversationId = request.Path.Value.Split("/").Last(); + using WebSocket webSocket = await httpContext.WebSockets.AcceptWebSocketAsync(); + await HandleWebSocket(services, conversationId, webSocket); + return; + } + } + + await _next(httpContext); + } + + private async Task HandleWebSocket(IServiceProvider services, string conversationId, WebSocket webSocket) + { + var hub = services.GetRequiredService(); + + var conn = new RealtimeHubConnection + { + ConversationId = conversationId + }; + + // load conversation and state + var convService = services.GetRequiredService(); + convService.SetConversationId(conversationId, []); + var hooks = services.GetServices(); + foreach (var hook in hooks) + { + await hook.OnStreamingStarted(conn); + } + convService.States.Save(); + + await hub.Listen(webSocket, (receivedText) => + { + var response = JsonSerializer.Deserialize(receivedText); + conn.StreamId = response.StreamSid; + conn.Event = response.Event switch + { + "start" => "user_connected", + "media" => "user_data_received", + "stop" => "user_disconnected", + _ => response.Event + }; + + if (string.IsNullOrEmpty(conn.Event)) + { + return conn; + } + + conn.OnModelMessageReceived = message => + new + { + @event = "media", + streamSid = response.StreamSid, + media = new { payload = message } + }; + conn.OnModelAudioResponseDone = () => + new + { + @event = "mark", + streamSid = response.StreamSid, + mark = new { name = "responsePart" } + }; + conn.OnModelUserInterrupted = () => + new + { + @event = "clear", + streamSid = response.StreamSid + }; + + if (response.Event == "start") + { + var startResponse = JsonSerializer.Deserialize(receivedText); + conn.Data = JsonSerializer.Serialize(startResponse.Body.CustomParameters); + } + else if (response.Event == "media") + { + var mediaResponse = JsonSerializer.Deserialize(receivedText); + conn.Data = mediaResponse.Body.Payload; + } + + return conn; + }); + } +} diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs index 5d407b23d..99595b5f0 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs @@ -1,6 +1,7 @@ using BotSharp.Abstraction.Utilities; using BotSharp.Plugin.Twilio.Models; using Twilio.Jwt.AccessToken; +using Twilio.TwiML.Messaging; using Token = Twilio.Jwt.AccessToken.Token; namespace BotSharp.Plugin.Twilio.Services; @@ -175,4 +176,27 @@ public VoiceResponse HoldOn(int interval, string message = null) response.Append(gather); return response; } + + /// + /// Bidirectional Media Streams + /// + /// + /// + public VoiceResponse ReturnBidirectionalMediaStreamsInstructions(string conversationId, ConversationalVoiceResponse conversationalVoiceResponse) + { + var response = new VoiceResponse(); + if (conversationalVoiceResponse.SpeechPaths != null && conversationalVoiceResponse.SpeechPaths.Any()) + { + foreach (var speechPath in conversationalVoiceResponse.SpeechPaths) + { + response.Play(new Uri($"{_settings.CallbackHost}/{speechPath}")); + } + } + var connect = new Connect(); + var host = _settings.CallbackHost.Split("://").Last(); + connect.Stream(url: $"wss://{host}/twilio/stream/{conversationId}"); + response.Append(connect); + + return response; + } } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/TwilioPlugin.cs b/src/Plugins/BotSharp.Plugin.Twilio/TwilioPlugin.cs index d78489ad7..a31dbb027 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/TwilioPlugin.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/TwilioPlugin.cs @@ -1,7 +1,9 @@ +using BotSharp.Abstraction.Realtime; using BotSharp.Abstraction.Settings; using BotSharp.Plugin.Twilio.Interfaces; using BotSharp.Plugin.Twilio.OutboundPhoneCallHandler.Hooks; using BotSharp.Plugin.Twilio.Services; +using BotSharp.Plugin.Twilio.Services.Stream; using StackExchange.Redis; using Twilio;