diff --git a/OpenAI/Packages/com.openai.unity/Documentation~/README.md b/OpenAI/Packages/com.openai.unity/Documentation~/README.md index 72e8262a..3729dff6 100644 --- a/OpenAI/Packages/com.openai.unity/Documentation~/README.md +++ b/OpenAI/Packages/com.openai.unity/Documentation~/README.md @@ -61,12 +61,12 @@ The recommended installation method is though the unity package manager and [Ope - [List Models](#list-models) - [Retrieve Models](#retrieve-model) - [Delete Fine Tuned Model](#delete-fine-tuned-model) -- [Realtime](#realtime) :new: - - [Create Realtime Session](#create-realtime-session) :new: - - [Client Events](#client-events) :new: - - [Sending Client Events](#sending-client-events) :new: - - [Server Events](#server-events) :new: - - [Receiving Server Events](#receiving-server-events) :new: +- [Realtime](#realtime) + - [Create Realtime Session](#create-realtime-session) + - [Client Events](#client-events) + - [Sending Client Events](#sending-client-events) + - [Server Events](#server-events) + - [Receiving Server Events](#receiving-server-events) - [Assistants](#assistants) - [List Assistants](#list-assistants) - [Create Assistant](#create-assistant) @@ -118,7 +118,7 @@ The recommended installation method is though the unity package manager and [Ope - [Streaming](#chat-streaming) - [Tools](#chat-tools) - [Vision](#chat-vision) - - [Audio](#chat-audio) :new: + - [Audio](#chat-audio) - [Structured Outputs](#chat-structured-outputs) - [Json Mode](#chat-json-mode) - [Audio](#audio) @@ -1555,6 +1555,7 @@ Debug.Log($"{result.FirstChoice.Message.Role}: {result.FirstChoice} | Finish Rea #### [Chat Audio](https://platform.openai.com/docs/guides/audio) ```csharp +var api = new OpenAIClient(); var messages = new List { new Message(Role.System, "You are a helpful assistant."), @@ -1662,9 +1663,9 @@ Generates audio from the input text. ```csharp var api = new OpenAIClient(); var request = new SpeechRequest("Hello world!"); -var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request); -audioSource.PlayOneShot(clip); -Debug.Log(path); +var speechClip = await api.AudioEndpoint.CreateSpeechAsync(request); +audioSource.PlayOneShot(speechClip); +Debug.Log(speechClip); ``` ##### [Stream Speech] @@ -1673,11 +1674,17 @@ Generate streamed audio from the input text. ```csharp var api = new OpenAIClient(); -var request = new SpeechRequest("Hello world!"); -var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip)); -Debug.Log(path); +var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM); +var speechClip = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => +{ + audioSource.PlayOneShot(partialClip); +}); +Debug.Log(speechClip); ``` +> [!NOTE] +> Checkout any of the demo scenes for best practices on how to handle playback with `OnAudioFilterRead`. + #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription) Transcribes audio into the input language. diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs index 82a622ad..c9c4b73b 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs @@ -7,7 +7,6 @@ using System.Threading; using System.Threading.Tasks; using UnityEngine; -using UnityEngine.Networking; using Utilities.WebRequestRest; namespace OpenAI.Audio @@ -27,25 +26,29 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { } private static readonly object mutex = new(); - /// - /// Generates audio from the input text. - /// - /// . - /// Optional, . - /// and the cached path. - [Function("Generates audio from the input text.")] + [Obsolete("use GetSpeechAsync")] public async Task> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default) => await CreateSpeechStreamAsync(request, null, cancellationToken); + [Obsolete("use GetSpeechAsync")] + public async Task> CreateSpeechStreamAsync(SpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) + { + var result = await GetSpeechAsync(request, speechClip => + { + partialClipCallback.Invoke(speechClip.AudioClip); + }, cancellationToken); + return Tuple.Create(result.CachePath, result.AudioClip); + } + /// - /// Generates streaming audio from the input text. + /// Generates audio from the input text. /// /// . - /// Optional, partial callback used to stream audio. + /// Optional, partial callback used to stream audio. /// Optional, . - /// and the cached path. - [Function("Generates streaming audio from the input text.")] - public async Task> CreateSpeechStreamAsync(SpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) + /// + [Function("Generates audio from the input text.")] + public async Task GetSpeechAsync(SpeechRequest request, Action partialClipCallback = null, CancellationToken cancellationToken = default) { if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM) { @@ -70,52 +73,16 @@ public async Task> CreateSpeechStreamAsync(SpeechReques Rest.TryGetDownloadCacheItem(clipName, out var cachedPath); - if (request.ResponseFormat == SpeechResponseFormat.PCM) - { - var part = 0; - var response = await Rest.PostAsync( - GetUrl("/speech"), - payload, - StreamCallback, - eventChunkSize: 8192, - new RestParameters(client.DefaultRequestHeaders), - cancellationToken); - response.Validate(EnableDebug); - var samples = Utilities.Audio.PCMEncoder.Decode(response.Data); - await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true); - return new Tuple(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false)); - - void StreamCallback(Response partialResponse) - { - var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data); - var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false); - - if (!partialClip.SetData(chunk, 0)) - { - Debug.LogError("Failed to set pcm data to partial clip."); - return; - } - - partialClipCallback?.Invoke(partialClip); - } - } + var part = 0; + var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, StreamCallback, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken); + pcmResponse.Validate(EnableDebug); + await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true); + return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory(pcmResponse.Data)); - var audioFormat = request.ResponseFormat switch + void StreamCallback(Response partialResponse) { - SpeechResponseFormat.MP3 => AudioType.MPEG, - SpeechResponseFormat.WAV => AudioType.WAV, - _ => throw new NotSupportedException(request.ResponseFormat.ToString()) - }; - - var clip = await Rest.DownloadAudioClipAsync( - GetUrl("/speech"), - audioFormat, - UnityWebRequest.kHttpVerbPOST, - clipName, - payload, - parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug), - cancellationToken: cancellationToken); - return new Tuple(cachedPath, clip); + partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data)); + } } /// diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs new file mode 100644 index 00000000..ef774dd3 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs @@ -0,0 +1,56 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using System; +using UnityEngine; +using UnityEngine.Scripting; +using Utilities.Audio; + +namespace OpenAI.Audio +{ + [Preserve] + public sealed class SpeechClip + { + [Preserve] + internal SpeechClip(string name, string cachePath, ReadOnlyMemory audioData, int sampleRate = 24000) + { + Name = name; + CachePath = cachePath; + AudioData = audioData; + SampleRate = sampleRate; + } + + [Preserve] + public string Name { get; } + + [Preserve] + public string CachePath { get; } + + [Preserve] + public ReadOnlyMemory AudioData { get; } + + [Preserve] + public float[] AudioSamples + => PCMEncoder.Resample(PCMEncoder.Decode(AudioData.ToArray()), SampleRate, 44100); + + [Preserve] + public int SampleRate { get; } + + [Preserve] + public AudioClip AudioClip + { + get + { + var samples = AudioSamples; + var clip = AudioClip.Create(Name, samples.Length, 1, 44100, false); + clip.SetData(samples, 0); + return clip; + } + } + + [Preserve] + public static implicit operator AudioClip(SpeechClip clip) => clip?.AudioClip; + + [Preserve] + public static implicit operator string(SpeechClip clip) => clip?.CachePath; + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs.meta new file mode 100644 index 00000000..dc74396e --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 6ece7c46c60b14641ac6673ddbed795c +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Realtime/ResponseAudioResponse.cs b/OpenAI/Packages/com.openai.unity/Runtime/Realtime/ResponseAudioResponse.cs index 13e58f9d..a4f5e974 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Realtime/ResponseAudioResponse.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Realtime/ResponseAudioResponse.cs @@ -1,6 +1,7 @@ // Licensed under the MIT License. See LICENSE in the project root for license information. using Newtonsoft.Json; +using System; using UnityEngine; using UnityEngine.Scripting; using Utilities.Audio; @@ -72,6 +73,11 @@ internal ResponseAudioResponse( [JsonProperty("delta")] public string Delta { get; } + [Preserve] + [JsonIgnore] + public float[] AudioSamples + => PCMEncoder.Resample(PCMEncoder.Decode(Convert.FromBase64String(Delta)), 24000, 44100); + [Preserve] [JsonIgnore] public bool IsDelta => Type.EndsWith("delta"); @@ -83,8 +89,8 @@ internal ResponseAudioResponse( [Preserve] public static implicit operator AudioClip(ResponseAudioResponse response) { - var audioSamples = PCMEncoder.Decode(System.Convert.FromBase64String(response.Delta)); - var audioClip = AudioClip.Create($"{response.ItemId}_{response.OutputIndex}_delta", audioSamples.Length, 1, 24000, false); + var audioSamples = response.AudioSamples; + var audioClip = AudioClip.Create($"{response.ItemId}_{response.OutputIndex}_delta", audioSamples.Length, 1, 44100, false); audioClip.SetData(audioSamples, 0); return audioClip; } diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Assistant/AssistantBehaviour.cs b/OpenAI/Packages/com.openai.unity/Samples~/Assistant/AssistantBehaviour.cs index 36322542..e3a1e698 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Assistant/AssistantBehaviour.cs +++ b/OpenAI/Packages/com.openai.unity/Samples~/Assistant/AssistantBehaviour.cs @@ -6,6 +6,7 @@ using OpenAI.Models; using OpenAI.Threads; using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading; @@ -22,6 +23,7 @@ namespace OpenAI.Samples.Assistant { + [RequireComponent(typeof(AudioSource))] public class AssistantBehaviour : MonoBehaviour { [SerializeField] @@ -59,6 +61,7 @@ public class AssistantBehaviour : MonoBehaviour private OpenAIClient openAI; private AssistantResponse assistant; private ThreadResponse thread; + private readonly ConcurrentQueue sampleQueue = new(); #if !UNITY_2022_3_OR_NEWER private readonly CancellationTokenSource lifetimeCts = new(); @@ -73,7 +76,11 @@ private void OnValidate() contentArea.Validate(); submitButton.Validate(); recordButton.Validate(); - audioSource.Validate(); + + if (audioSource == null) + { + audioSource = GetComponent(); + } } private async void Awake() @@ -155,6 +162,22 @@ private async void Awake() } } + private void OnAudioFilterRead(float[] data, int channels) + { + if (sampleQueue.IsEmpty) { return; } + + for (var i = 0; i < data.Length; i += channels) + { + if (sampleQueue.TryDequeue(out var sample)) + { + for (var j = 0; j < channels; j++) + { + data[i + j] = sample; + } + } + } + } + private void OnDestroy() { #if !UNITY_2022_3_OR_NEWER @@ -308,69 +331,21 @@ private async Task GenerateSpeechAsync(string text, CancellationToken cancellati #pragma warning disable CS0612 // Type or member is obsolete var request = new SpeechRequest(text, Model.TTS_1, voice, SpeechResponseFormat.PCM); #pragma warning restore CS0612 // Type or member is obsolete - var streamClipQueue = new Queue(); - var streamTcs = new TaskCompletionSource(); - var audioPlaybackTask = PlayStreamQueueAsync(streamTcs.Task); - var (clipPath, fullClip) = await openAI.AudioEndpoint.CreateSpeechStreamAsync(request, clip => streamClipQueue.Enqueue(clip), cancellationToken); - streamTcs.SetResult(true); + var speechClip = await openAI.AudioEndpoint.GetSpeechAsync(request, partialClip => + { + foreach (var sample in partialClip.AudioSamples) + { + sampleQueue.Enqueue(sample); + } + }, cancellationToken); if (enableDebug) { - Debug.Log(clipPath); + Debug.Log(speechClip.CachePath); } - await audioPlaybackTask; - audioSource.clip = fullClip; - - async Task PlayStreamQueueAsync(Task streamTask) - { - try - { - bool IsStreamTaskDone() - => streamTask.IsCompleted || destroyCancellationToken.IsCancellationRequested; - - await new WaitUntil(() => streamClipQueue.Count > 0 || IsStreamTaskDone()); - if (IsStreamTaskDone()) { return; } - var endOfFrame = new WaitForEndOfFrame(); - - do - { - if (!audioSource.isPlaying && - streamClipQueue.TryDequeue(out var clip)) - { - if (enableDebug) - { - Debug.Log($"playing partial clip: {clip.name} | ({streamClipQueue.Count} remaining)"); - } - - audioSource.PlayOneShot(clip); - // ReSharper disable once MethodSupportsCancellation - await Task.Delay(TimeSpan.FromSeconds(clip.length), cancellationToken).ConfigureAwait(true); - } - else - { - await endOfFrame; - } - - if (streamTask.IsCompleted && !audioSource.isPlaying && streamClipQueue.Count == 0) - { - return; - } - } while (!cancellationToken.IsCancellationRequested); - } - catch (Exception e) - { - switch (e) - { - case TaskCanceledException: - case OperationCanceledException: - break; - default: - Debug.LogError(e); - break; - } - } - } + await new WaitUntil(() => sampleQueue.IsEmpty || cancellationToken.IsCancellationRequested); + audioSource.clip = speechClip.AudioClip; } finally { diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Assistant/OpenAIAssistantSample.unity b/OpenAI/Packages/com.openai.unity/Samples~/Assistant/OpenAIAssistantSample.unity index e141627d..eeeee746 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Assistant/OpenAIAssistantSample.unity +++ b/OpenAI/Packages/com.openai.unity/Samples~/Assistant/OpenAIAssistantSample.unity @@ -578,6 +578,166 @@ CanvasRenderer: m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 422726882} m_CullTransparentMesh: 1 +--- !u!1 &445122396 +GameObject: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + serializedVersion: 6 + m_Component: + - component: {fileID: 445122399} + - component: {fileID: 445122398} + - component: {fileID: 445122397} + m_Layer: 0 + m_Name: AssistantBehaviour + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!114 &445122397 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 445122396} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: 3d9d46a39446f3744bffcbb493079564, type: 3} + m_Name: + m_EditorClassIdentifier: + configuration: {fileID: 0} + enableDebug: 1 + submitButton: {fileID: 1094024334} + recordButton: {fileID: 1143678156} + inputField: {fileID: 1377121433} + contentArea: {fileID: 250955499} + scrollView: {fileID: 1974642466} + audioSource: {fileID: 445122398} + voice: 0 + systemPrompt: 'You are a helpful assistant. + + - If an image is requested then + use "![Image](output.jpg)" to display it. + + - When performing function calls, + use the defaults unless explicitly told to use a specific value. + + - Images + should always be generated in base64.' +--- !u!82 &445122398 +AudioSource: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 445122396} + m_Enabled: 1 + serializedVersion: 4 + OutputAudioMixerGroup: {fileID: 0} + m_audioClip: {fileID: 0} + m_PlayOnAwake: 1 + m_Volume: 1 + m_Pitch: 1 + Loop: 0 + Mute: 0 + Spatialize: 0 + SpatializePostEffects: 0 + Priority: 128 + DopplerLevel: 1 + MinDistance: 1 + MaxDistance: 500 + Pan2D: 0 + rolloffMode: 0 + BypassEffects: 0 + BypassListenerEffects: 0 + BypassReverbZones: 0 + rolloffCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 1 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + - serializedVersion: 3 + time: 1 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + panLevelCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + spreadCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + reverbZoneMixCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 1 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 +--- !u!4 &445122399 +Transform: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 445122396} + m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 1, y: 1, z: 1} + m_ConstrainProportionsScale: 1 + m_Children: [] + m_Father: {fileID: 0} + m_RootOrder: 4 + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} --- !u!1 &530667792 GameObject: m_ObjectHideFlags: 0 @@ -1487,7 +1647,7 @@ RectTransform: m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} m_AnchorMin: {x: 0, y: 0} m_AnchorMax: {x: 0, y: 0} - m_AnchoredPosition: {x: 477.276, y: 0} + m_AnchoredPosition: {x: 480.06848, y: 0} m_SizeDelta: {x: 64, y: 64} m_Pivot: {x: 0.5, y: 0.5} --- !u!114 &1143678155 @@ -2078,8 +2238,6 @@ GameObject: - component: {fileID: 1711080859} - component: {fileID: 1711080858} - component: {fileID: 1711080857} - - component: {fileID: 1711080862} - - component: {fileID: 1711080863} m_Layer: 5 m_Name: Canvas m_TagString: Untagged @@ -2170,133 +2328,6 @@ RectTransform: m_AnchoredPosition: {x: 0, y: 0} m_SizeDelta: {x: 0, y: 0} m_Pivot: {x: 0, y: 0} ---- !u!82 &1711080862 -AudioSource: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 1711080856} - m_Enabled: 1 - serializedVersion: 4 - OutputAudioMixerGroup: {fileID: 0} - m_audioClip: {fileID: 0} - m_PlayOnAwake: 0 - m_Volume: 1 - m_Pitch: 1 - Loop: 0 - Mute: 0 - Spatialize: 0 - SpatializePostEffects: 0 - Priority: 128 - DopplerLevel: 1 - MinDistance: 1 - MaxDistance: 500 - Pan2D: 0 - rolloffMode: 0 - BypassEffects: 0 - BypassListenerEffects: 0 - BypassReverbZones: 0 - rolloffCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 1 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - - serializedVersion: 3 - time: 1 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - panLevelCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - spreadCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - reverbZoneMixCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 1 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 ---- !u!114 &1711080863 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 1711080856} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: 3d9d46a39446f3744bffcbb493079564, type: 3} - m_Name: - m_EditorClassIdentifier: - configuration: {fileID: 0} - enableDebug: 1 - submitButton: {fileID: 1094024334} - recordButton: {fileID: 1143678156} - inputField: {fileID: 1377121433} - contentArea: {fileID: 250955499} - scrollView: {fileID: 1974642466} - audioSource: {fileID: 1711080862} - voice: 0 - systemPrompt: 'You are a helpful assistant. - - - If an image is requested then - use "![Image](output.jpg)" to display it. - - - When performing function calls, - use the defaults unless explicitly told to use a specific value. - - - Images - should always be generated in base64.' --- !u!1 &1819767325 GameObject: m_ObjectHideFlags: 0 diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs b/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs index 69c4b1ea..0d3d5b7e 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs +++ b/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs @@ -5,6 +5,7 @@ using OpenAI.Images; using OpenAI.Models; using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; @@ -20,6 +21,7 @@ namespace OpenAI.Samples.Chat { + [RequireComponent(typeof(AudioSource))] public class ChatBehaviour : MonoBehaviour { [SerializeField] @@ -59,6 +61,7 @@ public class ChatBehaviour : MonoBehaviour private readonly Conversation conversation = new(); private readonly List assistantTools = new(); + private readonly ConcurrentQueue sampleQueue = new(); #if !UNITY_2022_3_OR_NEWER private readonly CancellationTokenSource lifetimeCts = new(); @@ -72,7 +75,11 @@ private void OnValidate() contentArea.Validate(); submitButton.Validate(); recordButton.Validate(); - audioSource.Validate(); + + if (audioSource == null) + { + audioSource = GetComponent(); + } } private void Awake() @@ -89,6 +96,22 @@ private void Awake() recordButton.onClick.AddListener(ToggleRecording); } + private void OnAudioFilterRead(float[] data, int channels) + { + if (sampleQueue.Count <= 0) { return; } + + for (var i = 0; i < data.Length; i += channels) + { + if (sampleQueue.TryDequeue(out var sample)) + { + for (var j = 0; j < channels; j++) + { + data[i + j] = sample; + } + } + } + } + #if !UNITY_2022_3_OR_NEWER private void OnDestroy() { @@ -238,69 +261,21 @@ private async Task GenerateSpeechAsync(string text, CancellationToken cancellati #pragma warning disable CS0612 // Type or member is obsolete var request = new SpeechRequest(text, Model.TTS_1, voice, SpeechResponseFormat.PCM); #pragma warning restore CS0612 // Type or member is obsolete - var streamClipQueue = new Queue(); - var streamTcs = new TaskCompletionSource(); - var audioPlaybackTask = PlayStreamQueueAsync(streamTcs.Task); - var (clipPath, fullClip) = await openAI.AudioEndpoint.CreateSpeechStreamAsync(request, clip => streamClipQueue.Enqueue(clip), destroyCancellationToken); - streamTcs.SetResult(true); + var speechClip = await openAI.AudioEndpoint.GetSpeechAsync(request, partialCLip => + { + foreach (var sample in partialCLip.AudioSamples) + { + sampleQueue.Enqueue(sample); + } + }, destroyCancellationToken); if (enableDebug) { - Debug.Log(clipPath); + Debug.Log(speechClip.CachePath); } - await audioPlaybackTask; - audioSource.clip = fullClip; - - async Task PlayStreamQueueAsync(Task streamTask) - { - try - { - bool IsStreamTaskDone() - => streamTask.IsCompleted || destroyCancellationToken.IsCancellationRequested; - - await new WaitUntil(() => streamClipQueue.Count > 0 || IsStreamTaskDone()); - if (IsStreamTaskDone()) { return; } - var endOfFrame = new WaitForEndOfFrame(); - - do - { - if (!audioSource.isPlaying && - streamClipQueue.TryDequeue(out var clip)) - { - if (enableDebug) - { - Debug.Log($"playing partial clip: {clip.name} | ({streamClipQueue.Count} remaining)"); - } - - audioSource.PlayOneShot(clip); - // ReSharper disable once MethodSupportsCancellation - await Task.Delay(TimeSpan.FromSeconds(clip.length)).ConfigureAwait(true); - } - else - { - await endOfFrame; - } - - if (streamTask.IsCompleted && !audioSource.isPlaying && streamClipQueue.Count == 0) - { - return; - } - } while (!cancellationToken.IsCancellationRequested); - } - catch (Exception e) - { - switch (e) - { - case TaskCanceledException: - case OperationCanceledException: - break; - default: - Debug.LogError(e); - break; - } - } - } + await new WaitUntil(() => sampleQueue.IsEmpty || cancellationToken.IsCancellationRequested); + audioSource.clip = speechClip.AudioClip; } private TextMeshProUGUI AddNewTextMessageContent(Role role) diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Chat/OpenAIChatSample.unity b/OpenAI/Packages/com.openai.unity/Samples~/Chat/OpenAIChatSample.unity index b3a7dcd7..336bc920 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Chat/OpenAIChatSample.unity +++ b/OpenAI/Packages/com.openai.unity/Samples~/Chat/OpenAIChatSample.unity @@ -1487,7 +1487,7 @@ RectTransform: m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} m_AnchorMin: {x: 0, y: 0} m_AnchorMax: {x: 0, y: 0} - m_AnchoredPosition: {x: 517.1232, y: 0} + m_AnchoredPosition: {x: 503.20578, y: 0} m_SizeDelta: {x: 64, y: 64} m_Pivot: {x: 0.5, y: 0.5} --- !u!114 &1143678155 @@ -2029,6 +2029,166 @@ CanvasRenderer: m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 1377121430} m_CullTransparentMesh: 1 +--- !u!1 &1411251220 +GameObject: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + serializedVersion: 6 + m_Component: + - component: {fileID: 1411251223} + - component: {fileID: 1411251222} + - component: {fileID: 1411251221} + m_Layer: 0 + m_Name: ChatBehaviour + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!114 &1411251221 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 1411251220} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: a891710bf1466924297c3b3b6f1b6e51, type: 3} + m_Name: + m_EditorClassIdentifier: + configuration: {fileID: 0} + enableDebug: 1 + submitButton: {fileID: 1094024334} + recordButton: {fileID: 1143678156} + inputField: {fileID: 1377121433} + contentArea: {fileID: 250955499} + scrollView: {fileID: 1974642466} + audioSource: {fileID: 0} + voice: 0 + systemPrompt: 'You are a helpful assistant. + + - If an image is requested then + use "![Image](output.jpg)" to display it. + + - When performing function calls, + use the defaults unless explicitly told to use a specific value. + + - Images + should always be generated in base64.' +--- !u!82 &1411251222 +AudioSource: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 1411251220} + m_Enabled: 1 + serializedVersion: 4 + OutputAudioMixerGroup: {fileID: 0} + m_audioClip: {fileID: 0} + m_PlayOnAwake: 1 + m_Volume: 1 + m_Pitch: 1 + Loop: 0 + Mute: 0 + Spatialize: 0 + SpatializePostEffects: 0 + Priority: 128 + DopplerLevel: 1 + MinDistance: 1 + MaxDistance: 500 + Pan2D: 0 + rolloffMode: 0 + BypassEffects: 0 + BypassListenerEffects: 0 + BypassReverbZones: 0 + rolloffCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 1 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + - serializedVersion: 3 + time: 1 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + panLevelCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + spreadCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + reverbZoneMixCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 1 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 +--- !u!4 &1411251223 +Transform: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 1411251220} + m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 1, y: 1, z: 1} + m_ConstrainProportionsScale: 1 + m_Children: [] + m_Father: {fileID: 0} + m_RootOrder: 4 + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} --- !u!1 &1466169038 GameObject: m_ObjectHideFlags: 0 @@ -2078,8 +2238,6 @@ GameObject: - component: {fileID: 1711080859} - component: {fileID: 1711080858} - component: {fileID: 1711080857} - - component: {fileID: 1711080861} - - component: {fileID: 1711080862} m_Layer: 5 m_Name: Canvas m_TagString: Untagged @@ -2170,133 +2328,6 @@ RectTransform: m_AnchoredPosition: {x: 0, y: 0} m_SizeDelta: {x: 0, y: 0} m_Pivot: {x: 0, y: 0} ---- !u!114 &1711080861 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 1711080856} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: a891710bf1466924297c3b3b6f1b6e51, type: 3} - m_Name: - m_EditorClassIdentifier: - configuration: {fileID: 0} - enableDebug: 1 - submitButton: {fileID: 1094024334} - recordButton: {fileID: 1143678156} - inputField: {fileID: 1377121433} - contentArea: {fileID: 250955499} - scrollView: {fileID: 1974642466} - audioSource: {fileID: 1711080862} - voice: 0 - systemPrompt: 'You are a helpful assistant. - - - If an image is requested then - use "![Image](output.jpg)" to display it. - - - When performing function calls, - use the defaults unless explicitly told to use a specific value. - - - Images - should always be generated in base64.' ---- !u!82 &1711080862 -AudioSource: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 1711080856} - m_Enabled: 1 - serializedVersion: 4 - OutputAudioMixerGroup: {fileID: 0} - m_audioClip: {fileID: 0} - m_PlayOnAwake: 0 - m_Volume: 1 - m_Pitch: 1 - Loop: 0 - Mute: 0 - Spatialize: 0 - SpatializePostEffects: 0 - Priority: 128 - DopplerLevel: 1 - MinDistance: 1 - MaxDistance: 500 - Pan2D: 0 - rolloffMode: 0 - BypassEffects: 0 - BypassListenerEffects: 0 - BypassReverbZones: 0 - rolloffCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 1 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - - serializedVersion: 3 - time: 1 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - panLevelCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - spreadCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - reverbZoneMixCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 1 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 --- !u!1 &1819767325 GameObject: m_ObjectHideFlags: 0 @@ -2380,7 +2411,7 @@ MonoBehaviour: m_TargetGraphic: {fileID: 800336258} m_HandleRect: {fileID: 800336257} m_Direction: 0 - m_Value: 1 + m_Value: 0 m_Size: 1 m_NumberOfSteps: 0 m_OnValueChanged: diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Realtime/OpenAIRealtimeSample.unity b/OpenAI/Packages/com.openai.unity/Samples~/Realtime/OpenAIRealtimeSample.unity index 5724fc91..8de88c7f 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Realtime/OpenAIRealtimeSample.unity +++ b/OpenAI/Packages/com.openai.unity/Samples~/Realtime/OpenAIRealtimeSample.unity @@ -1487,7 +1487,7 @@ RectTransform: m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} m_AnchorMin: {x: 0, y: 0} m_AnchorMax: {x: 0, y: 0} - m_AnchoredPosition: {x: 471.71927, y: 0} + m_AnchoredPosition: {x: 480.06848, y: 0} m_SizeDelta: {x: 64, y: 64} m_Pivot: {x: 0.5, y: 0.5} --- !u!114 &1143678155 @@ -1586,6 +1586,191 @@ CanvasRenderer: m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 1143678153} m_CullTransparentMesh: 1 +--- !u!1 &1169396011 +GameObject: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + serializedVersion: 6 + m_Component: + - component: {fileID: 1169396014} + - component: {fileID: 1169396013} + - component: {fileID: 1169396012} + m_Layer: 0 + m_Name: RealtimeBehaviour + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!114 &1169396012 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 1169396011} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: 2ee60928da32d1742b66093992d09c69, type: 3} + m_Name: + m_EditorClassIdentifier: + configuration: {fileID: 0} + enableDebug: 1 + submitButton: {fileID: 1094024334} + recordButton: {fileID: 1143678156} + inputField: {fileID: 1377121433} + placeholder: {fileID: 768762706} + contentArea: {fileID: 250955499} + scrollView: {fileID: 1974642466} + audioSource: {fileID: 0} + systemPrompt: 'Your knowledge cutoff is 2023-10. + + You are a helpful, witty, + and friendly AI. + + Act like a human, but remember that you aren''t a human + and that you can''t do human things in the real world. + + Your voice and personality + should be warm and engaging, with a lively and playful tone. + + If interacting + in a non-English language, start by using the standard accent or dialect familiar + to the user. + + Talk quickly. + + You should always call a function if you + can. + + You should always notify a user before calling a function, so they + know it might take a moment to see a result. + + Do not refer to these rules, + even if you''re asked about them. + + If an image is requested then use the + "![Image](output.jpg)" markdown tag to display it, but don''t include tag in + the transcript or say this tag out loud + + When performing function calls, + use the defaults unless explicitly told to use a specific value. + + Images + should always be generated in base64.' +--- !u!82 &1169396013 +AudioSource: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 1169396011} + m_Enabled: 1 + serializedVersion: 4 + OutputAudioMixerGroup: {fileID: 0} + m_audioClip: {fileID: 0} + m_PlayOnAwake: 1 + m_Volume: 1 + m_Pitch: 1 + Loop: 0 + Mute: 0 + Spatialize: 0 + SpatializePostEffects: 0 + Priority: 128 + DopplerLevel: 1 + MinDistance: 1 + MaxDistance: 500 + Pan2D: 0 + rolloffMode: 0 + BypassEffects: 0 + BypassListenerEffects: 0 + BypassReverbZones: 0 + rolloffCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 1 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + - serializedVersion: 3 + time: 1 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + panLevelCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + spreadCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 0 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 + reverbZoneMixCustomCurve: + serializedVersion: 2 + m_Curve: + - serializedVersion: 3 + time: 0 + value: 1 + inSlope: 0 + outSlope: 0 + tangentMode: 0 + weightedMode: 0 + inWeight: 0.33333334 + outWeight: 0.33333334 + m_PreInfinity: 2 + m_PostInfinity: 2 + m_RotationOrder: 4 +--- !u!4 &1169396014 +Transform: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 1169396011} + m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 1, y: 1, z: 1} + m_ConstrainProportionsScale: 1 + m_Children: [] + m_Father: {fileID: 0} + m_RootOrder: 4 + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} --- !u!1 &1246159954 GameObject: m_ObjectHideFlags: 0 @@ -2078,8 +2263,6 @@ GameObject: - component: {fileID: 1711080859} - component: {fileID: 1711080858} - component: {fileID: 1711080857} - - component: {fileID: 1711080862} - - component: {fileID: 1711080863} m_Layer: 5 m_Name: Canvas m_TagString: Untagged @@ -2170,158 +2353,6 @@ RectTransform: m_AnchoredPosition: {x: 0, y: 0} m_SizeDelta: {x: 0, y: 0} m_Pivot: {x: 0, y: 0} ---- !u!82 &1711080862 -AudioSource: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 1711080856} - m_Enabled: 1 - serializedVersion: 4 - OutputAudioMixerGroup: {fileID: 0} - m_audioClip: {fileID: 0} - m_PlayOnAwake: 0 - m_Volume: 1 - m_Pitch: 1 - Loop: 0 - Mute: 0 - Spatialize: 0 - SpatializePostEffects: 0 - Priority: 128 - DopplerLevel: 1 - MinDistance: 1 - MaxDistance: 500 - Pan2D: 0 - rolloffMode: 0 - BypassEffects: 0 - BypassListenerEffects: 0 - BypassReverbZones: 0 - rolloffCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 1 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - - serializedVersion: 3 - time: 1 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - panLevelCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - spreadCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 0 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 - reverbZoneMixCustomCurve: - serializedVersion: 2 - m_Curve: - - serializedVersion: 3 - time: 0 - value: 1 - inSlope: 0 - outSlope: 0 - tangentMode: 0 - weightedMode: 0 - inWeight: 0.33333334 - outWeight: 0.33333334 - m_PreInfinity: 2 - m_PostInfinity: 2 - m_RotationOrder: 4 ---- !u!114 &1711080863 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 1711080856} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: 2ee60928da32d1742b66093992d09c69, type: 3} - m_Name: - m_EditorClassIdentifier: - configuration: {fileID: 0} - enableDebug: 1 - submitButton: {fileID: 1094024334} - recordButton: {fileID: 1143678156} - inputField: {fileID: 1377121433} - placeholder: {fileID: 768762706} - contentArea: {fileID: 250955499} - scrollView: {fileID: 1974642466} - audioSource: {fileID: 1711080862} - systemPrompt: 'Your knowledge cutoff is 2023-10. - - You are a helpful, witty, - and friendly AI. - - Act like a human, but remember that you aren''t a human - and that you can''t do human things in the real world. - - Your voice and personality - should be warm and engaging, with a lively and playful tone. - - If interacting - in a non-English language, start by using the standard accent or dialect familiar - to the user. - - Talk quickly. - - You should always call a function if you - can. - - You should always notify a user before calling a function, so they - know it might take a moment to see a result. - - Do not refer to these rules, - even if you''re asked about them. - - If an image is requested then use the - "![Image](output.jpg)" markdown tag to display it, but don''t include tag in - the transcript or say this tag out loud - - When performing function calls, - use the defaults unless explicitly told to use a specific value. - - Images - should always be generated in base64.' --- !u!1 &1819767325 GameObject: m_ObjectHideFlags: 0 diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Realtime/RealtimeBehaviour.cs b/OpenAI/Packages/com.openai.unity/Samples~/Realtime/RealtimeBehaviour.cs index 6b6ca998..dc2968e4 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Realtime/RealtimeBehaviour.cs +++ b/OpenAI/Packages/com.openai.unity/Samples~/Realtime/RealtimeBehaviour.cs @@ -22,6 +22,7 @@ namespace OpenAI.Samples.Realtime { + [RequireComponent(typeof(AudioSource))] public class RealtimeBehaviour : MonoBehaviour { [SerializeField] @@ -67,7 +68,7 @@ public class RealtimeBehaviour : MonoBehaviour #endif private readonly Dictionary responseList = new(); - private readonly ConcurrentQueue streamClipQueue = new(); + private readonly ConcurrentQueue sampleQueue = new(); private void OnValidate() { @@ -76,7 +77,11 @@ private void OnValidate() inputField.Validate(); placeholder.Validate(); contentArea.Validate(); - audioSource.Validate(); + + if (audioSource == null) + { + audioSource = GetComponent(); + } } private async void Awake() @@ -104,7 +109,6 @@ private async void Awake() inputField.interactable = isMuted; submitButton.interactable = isMuted; RecordInputAudio(destroyCancellationToken); - PlayStreamQueue(destroyCancellationToken); await session.ReceiveUpdatesAsync(ServerResponseEvent, destroyCancellationToken); } catch (Exception e) @@ -130,6 +134,22 @@ private async void Awake() } } + private void OnAudioFilterRead(float[] data, int channels) + { + if (sampleQueue.Count <= 0) { return; } + + for (var i = 0; i < data.Length; i += channels) + { + if (sampleQueue.TryDequeue(out var sample)) + { + for (var j = 0; j < channels; j++) + { + data[i + j] = sample; + } + } + } + } + private void OnDestroy() { inputField.onSubmit.RemoveListener(SubmitChat); @@ -283,40 +303,6 @@ async Task BufferCallback(ReadOnlyMemory bufferCallback) } } - private async void PlayStreamQueue(CancellationToken cancellationToken) - { - try - { - do - { - if (!audioSource.isPlaying && - streamClipQueue.TryDequeue(out var clip)) - { - Log($"playing partial clip: {clip.name} | ({streamClipQueue.Count} remaining)"); - audioSource.PlayOneShot(clip); - // ReSharper disable once MethodSupportsCancellation - await Task.Delay(TimeSpan.FromSeconds(clip.length)).ConfigureAwait(true); - } - else - { - await Task.Yield(); - } - } while (!cancellationToken.IsCancellationRequested); - } - catch (Exception e) - { - switch (e) - { - case TaskCanceledException: - case OperationCanceledException: - break; - default: - Debug.LogError(e); - break; - } - } - } - private void ServerResponseEvent(IServerEvent serverEvent) { switch (serverEvent) @@ -324,7 +310,10 @@ private void ServerResponseEvent(IServerEvent serverEvent) case ResponseAudioResponse audioResponse: if (audioResponse.IsDelta) { - streamClipQueue.Enqueue(audioResponse); + foreach (var sample in audioResponse.AudioSamples) + { + sampleQueue.Enqueue(sample); + } } break; case ResponseAudioTranscriptResponse transcriptResponse: diff --git a/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs b/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs index 40843567..65881cd3 100644 --- a/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs +++ b/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs @@ -2,7 +2,6 @@ using NUnit.Framework; using OpenAI.Audio; -using System; using System.Collections.Concurrent; using System.IO; using System.Threading.Tasks; @@ -94,9 +93,10 @@ public async Task Test_03_01_Speech() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var request = new SpeechRequest("Hello world!"); - var (path, clip) = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request); - Debug.Log(path); - Assert.IsNotNull(clip); + var speechClip = await OpenAIClient.AudioEndpoint.GetSpeechAsync(request); + Debug.Log(speechClip.CachePath); + Assert.IsNotEmpty(speechClip.AudioSamples); + Assert.IsNotNull(speechClip.AudioClip); } [Test] @@ -104,11 +104,12 @@ public async Task Test_03_02_Speech_Streaming() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM); - var clipQueue = new ConcurrentQueue(); - var (path, clip) = await OpenAIClient.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => clipQueue.Enqueue(partialClip)); - Debug.Log(path); - Assert.IsNotNull(clip); - Assert.IsTrue(clipQueue.Count > 0); + var clipQueue = new ConcurrentQueue(); + var speechClip = await OpenAIClient.AudioEndpoint.GetSpeechAsync(request, partialClip => clipQueue.Enqueue(partialClip)); + Debug.Log(speechClip.CachePath); + Assert.IsNotEmpty(speechClip.AudioSamples); + Assert.IsNotNull(speechClip.AudioClip); + Assert.IsFalse(clipQueue.IsEmpty); } } } diff --git a/OpenAI/Packages/com.openai.unity/package.json b/OpenAI/Packages/com.openai.unity/package.json index 01446dad..281222aa 100644 --- a/OpenAI/Packages/com.openai.unity/package.json +++ b/OpenAI/Packages/com.openai.unity/package.json @@ -3,7 +3,7 @@ "displayName": "OpenAI", "description": "A OpenAI package for the Unity Game Engine to use GPT-4, GPT-3.5, GPT-3 and Dall-E though their RESTful API (currently in beta).\n\nIndependently developed, this is not an official library and I am not affiliated with OpenAI.\n\nAn OpenAI API account is required.", "keywords": [], - "version": "8.4.3", + "version": "8.4.4", "unity": "2021.3", "documentationUrl": "https://github.com/RageAgainstThePixel/com.openai.unity#documentation", "changelogUrl": "https://github.com/RageAgainstThePixel/com.openai.unity/releases", @@ -17,7 +17,7 @@ "url": "https://github.com/StephenHodgson" }, "dependencies": { - "com.utilities.encoder.wav": "2.0.1", + "com.utilities.encoder.wav": "2.0.2", "com.utilities.rest": "3.3.0", "com.utilities.websockets": "1.0.1" }, diff --git a/OpenAI/Packages/manifest.json b/OpenAI/Packages/manifest.json index 40b5ec76..3650ee2e 100644 --- a/OpenAI/Packages/manifest.json +++ b/OpenAI/Packages/manifest.json @@ -4,7 +4,7 @@ "com.unity.ide.visualstudio": "2.0.22", "com.unity.textmeshpro": "3.0.9", "com.unity.ugui": "1.0.0", - "com.utilities.buildpipeline": "1.5.6" + "com.utilities.buildpipeline": "1.5.7" }, "scopedRegistries": [ { diff --git a/README.md b/README.md index 6fcad771..0346470c 100644 --- a/README.md +++ b/README.md @@ -61,12 +61,12 @@ The recommended installation method is though the unity package manager and [Ope - [List Models](#list-models) - [Retrieve Models](#retrieve-model) - [Delete Fine Tuned Model](#delete-fine-tuned-model) -- [Realtime](#realtime) :new: - - [Create Realtime Session](#create-realtime-session) :new: - - [Client Events](#client-events) :new: - - [Sending Client Events](#sending-client-events) :new: - - [Server Events](#server-events) :new: - - [Receiving Server Events](#receiving-server-events) :new: +- [Realtime](#realtime) + - [Create Realtime Session](#create-realtime-session) + - [Client Events](#client-events) + - [Sending Client Events](#sending-client-events) + - [Server Events](#server-events) + - [Receiving Server Events](#receiving-server-events) - [Assistants](#assistants) - [List Assistants](#list-assistants) - [Create Assistant](#create-assistant) @@ -118,7 +118,7 @@ The recommended installation method is though the unity package manager and [Ope - [Streaming](#chat-streaming) - [Tools](#chat-tools) - [Vision](#chat-vision) - - [Audio](#chat-audio) :new: + - [Audio](#chat-audio) - [Structured Outputs](#chat-structured-outputs) - [Json Mode](#chat-json-mode) - [Audio](#audio) @@ -1663,9 +1663,9 @@ Generates audio from the input text. ```csharp var api = new OpenAIClient(); var request = new SpeechRequest("Hello world!"); -var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request); -audioSource.PlayOneShot(clip); -Debug.Log(path); +var speechClip = await api.AudioEndpoint.CreateSpeechAsync(request); +audioSource.PlayOneShot(speechClip); +Debug.Log(speechClip); ``` ##### [Stream Speech] @@ -1674,11 +1674,17 @@ Generate streamed audio from the input text. ```csharp var api = new OpenAIClient(); -var request = new SpeechRequest("Hello world!"); -var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip)); -Debug.Log(path); +var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM); +var speechClip = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => +{ + audioSource.PlayOneShot(partialClip); +}); +Debug.Log(speechClip); ``` +> [!NOTE] +> Checkout any of the demo scenes for best practices on how to handle playback with `OnAudioFilterRead`. + #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription) Transcribes audio into the input language.