From 93eed233ab706b56a1ea683eb49224a496f67b9b Mon Sep 17 00:00:00 2001
From: Stephen Hodgson
Date: Mon, 18 Mar 2024 21:39:43 -0400
Subject: [PATCH] com.openai.unity 7.7.6 (#210)

- Added support for streaming text to speech
- Added AudioEndpoint.CreateSpeechStreamAsync(SpeechRequest, Action<AudioClip>, CancellationToken)
- Added support for Audio Transcription and Translation verbose json output
- Added support for timestamp granularities for segments and words
- Marked CreateTranscriptionAsync obsolete
- Added CreateTranscriptionTextAsync
- Added CreateTranscriptionJsonAsync
- Marked CreateTranslationAsync obsolete
- Added CreateTranslationTextAsync
- Added CreateTranslationJsonAsync
- Updated SpeechResponseFormat to include wav and pcm
---
 .../com.openai.unity/Documentation~/README.md |  28 ++-
 .../Runtime/Audio/AudioEndpoint.cs            | 160 ++++++++++++++----
 .../Runtime/Audio/AudioResponse.cs            |  62 +++++++
 .../Runtime/Audio/AudioResponse.cs.meta       |  11 ++
 .../Audio/AudioTranscriptionRequest.cs        |  46 ++++-
 .../Runtime/Audio/AudioTranslationRequest.cs  |   2 +-
 .../Runtime/Audio/SpeechRequest.cs            |   6 +-
 .../Runtime/Audio/SpeechResponseFormat.cs     |   6 +-
 .../Runtime/Audio/TimestampGranularity.cs     |  11 ++
 .../Audio/TimestampGranularity.cs.meta        |  11 ++
 .../Runtime/Audio/TranscriptionSegment.cs     | 108 ++++++++++++
 .../Audio/TranscriptionSegment.cs.meta        |  11 ++
 .../Runtime/Audio/TranscriptionWord.cs        |  42 +++++
 .../Runtime/Audio/TranscriptionWord.cs.meta   |  11 ++
 .../Samples~/Chat/ChatBehaviour.cs            |  68 +++++++-
 .../Tests/TestFixture_07_Audio.cs             |  71 ++++++--
 OpenAI/Packages/com.openai.unity/package.json |   4 +-
 OpenAI/Packages/manifest.json                 |   2 +-
 README.md                                     |  28 ++-
 19 files changed, 623 insertions(+), 65 deletions(-)
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta

diff --git a/OpenAI/Packages/com.openai.unity/Documentation~/README.md b/OpenAI/Packages/com.openai.unity/Documentation~/README.md
index 45dc3944..6f10d81d 100644
--- a/OpenAI/Packages/com.openai.unity/Documentation~/README.md
+++ b/OpenAI/Packages/com.openai.unity/Documentation~/README.md
@@ -103,6 +103,7 @@ The recommended installation method is though the unity package manager and [Ope
 - [Json Mode](#chat-json-mode)
 - [Audio](#audio)
   - [Create Speech](#create-speech)
+  - [Stream Speech](#stream-speech) :new:
   - [Create Transcription](#create-transcription)
   - [Create Translation](#create-translation)
 - [Images](#images)
@@ -1047,6 +1048,18 @@
 var api = new OpenAIClient();
 var request = new SpeechRequest("Hello world!");
 var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request);
 audioSource.PlayOneShot(clip);
+Debug.Log(path);
+```
+
+##### [Stream Speech]
+
+Generate streamed audio from the input text.
+ +```csharp +var api = new OpenAIClient(); +var request = new SpeechRequest("Hello world!"); +var (path, clip) = await OpenAIClient.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip)); +Debug.Log(path); ``` #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription) @@ -1060,6 +1073,19 @@ var result = await api.AudioEndpoint.CreateTranscriptionAsync(request); Debug.Log(result); ``` +You can also get detailed information using `verbose_json` to get timestamp granularities: + +```csharp +var api = new OpenAIClient(); +using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en"); +var response = await api.AudioEndpoint.CreateTranscriptionTextAsync(request); + +foreach (var word in response.Words) +{ + Debug.Log($"[{word.Start}-{word.End}] \"{word.Word}\""); +} +``` + #### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation) Translates audio into into English. @@ -1187,7 +1213,7 @@ Returns information about a specific file. ```csharp var api = new OpenAIClient(); -var file = await GetFileInfoAsync(fileId); +var file = await api.FilesEndpoint.GetFileInfoAsync(fileId); Debug.Log($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes"); ``` diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs index eccde182..eb4f66f9 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs @@ -7,7 +7,6 @@ using System.Threading.Tasks; using UnityEngine; using UnityEngine.Networking; -using UnityEngine.Scripting; using Utilities.WebRequestRest; namespace OpenAI.Audio @@ -18,21 +17,6 @@ namespace OpenAI.Audio /// public sealed class AudioEndpoint : OpenAIBaseEndpoint { - [Preserve] - private class AudioResponse - { - [Preserve] - [JsonConstructor] - public AudioResponse([JsonProperty("text")] string text) - { - Text = text; - } - - [Preserve] - [JsonProperty("text")] - public string Text { get; } - } - internal AudioEndpoint(OpenAIClient client) : base(client) { } /// @@ -48,15 +32,29 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { } /// and the cached path. [Function("Generates audio from the input text.")] public async Task> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default) + => await CreateSpeechStreamAsync(request, null, cancellationToken); + + /// + /// Generates streaming audio from the input text. + /// + /// . + /// Optional, partial callback used to stream audio. + /// Optional, . + /// and the cached path. + [Function("Generates streaming audio from the input text.")] + public async Task> CreateSpeechStreamAsync(SpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) { - var audioFormat = request.ResponseFormat switch + if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM) { - SpeechResponseFormat.MP3 => AudioType.MPEG, - _ => throw new NotSupportedException(request.ResponseFormat.ToString()) - }; + Debug.LogWarning("Speech streaming only supported with PCM response format. 
Overriding to PCM..."); + request.ResponseFormat = SpeechResponseFormat.PCM; + } + var ext = request.ResponseFormat switch { SpeechResponseFormat.MP3 => "mp3", + SpeechResponseFormat.WAV => "wav", + SpeechResponseFormat.PCM => "pcm", _ => throw new NotSupportedException(request.ResponseFormat.ToString()) }; var payload = JsonConvert.SerializeObject(request, OpenAIClient.JsonSerializationOptions); @@ -64,9 +62,48 @@ public async Task> CreateSpeechAsync(SpeechRequest requ lock (mutex) { - clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssff}.{ext}"; + clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssfffff}.{ext}"; } + Rest.TryGetDownloadCacheItem(clipName, out var cachedPath); + + if (request.ResponseFormat == SpeechResponseFormat.PCM) + { + var part = 0; + var response = await Rest.PostAsync( + GetUrl("/speech"), + payload, + StreamCallback, + eventChunkSize: 8192, + new RestParameters(client.DefaultRequestHeaders), + cancellationToken); + response.Validate(EnableDebug); + var samples = Utilities.Audio.PCMEncoder.Decode(response.Data); + await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true); + return new Tuple(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false)); + + void StreamCallback(Response partialResponse) + { + var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data); + var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false); + + if (!partialClip.SetData(chunk, 0)) + { + Debug.LogError("Failed to set pcm data to partial clip."); + return; + } + + partialClipCallback?.Invoke(partialClip); + } + } + + var audioFormat = request.ResponseFormat switch + { + SpeechResponseFormat.MP3 => AudioType.MPEG, + SpeechResponseFormat.WAV => AudioType.WAV, + _ => throw new NotSupportedException(request.ResponseFormat.ToString()) + }; + var clip = await Rest.DownloadAudioClipAsync( GetUrl("/speech"), audioFormat, @@ -75,17 +112,46 @@ public async Task> CreateSpeechAsync(SpeechRequest requ payload, parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug), cancellationToken: cancellationToken); - Rest.TryGetDownloadCacheItem(clipName, out var cachedPath); return new Tuple(cachedPath, clip); } + [Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")] + public async Task CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + => await CreateTranscriptionTextAsync(request, cancellationToken); + /// /// Transcribes audio into the input language. /// /// . /// Optional, . /// The transcribed text. - public async Task CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + public async Task CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + { + var response = await Internal_CreateTranscriptionAsync(request, cancellationToken); + return request.ResponseFormat == AudioResponseFormat.Json + ? JsonConvert.DeserializeObject(response)?.Text + : response; + } + + /// + /// Transcribes audio into the input language. + /// + /// This method expects the request format to be either or . + /// . + /// Optional, . + /// . 
+ public async Task CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + { + if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json)) + { + throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat)); + } + + var response = await Internal_CreateTranscriptionAsync(request, cancellationToken); + return JsonConvert.DeserializeObject(response); + } + + private async Task Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) { var form = new WWWForm(); using var audioData = new MemoryStream(); @@ -111,22 +177,58 @@ public async Task CreateTranscriptionAsync(AudioTranscriptionRequest req form.AddField("language", request.Language); } + switch (request.TimestampGranularities) + { + case TimestampGranularity.Segment: + case TimestampGranularity.Word: + form.AddField("timestamp_granularities[]", request.TimestampGranularities.ToString().ToLower()); + break; + } + request.Dispose(); var response = await Rest.PostAsync(GetUrl("/transcriptions"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken); response.Validate(EnableDebug); - return responseFormat == AudioResponseFormat.Json - ? JsonConvert.DeserializeObject(response.Body)?.Text - : response.Body; + return response.Body; } + [Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")] + public async Task CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + => await CreateTranslationTextAsync(request, cancellationToken); + /// /// Translates audio into English. /// /// /// /// The translated text. - public async Task CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + public async Task CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + { + var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken); + return request.ResponseFormat == AudioResponseFormat.Json + ? JsonConvert.DeserializeObject(responseAsString)?.Text + : responseAsString; + } + + /// + /// Translates audio into English. + /// + /// + /// + /// + /// + public async Task CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + { + if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json)) + { + throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat)); + } + + var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken); + return JsonConvert.DeserializeObject(responseAsString); + } + + private async Task Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken) { var form = new WWWForm(); using var audioData = new MemoryStream(); @@ -151,9 +253,7 @@ public async Task CreateTranslationAsync(AudioTranslationRequest request var response = await Rest.PostAsync(GetUrl("/translations"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken); response.Validate(EnableDebug); - return responseFormat == AudioResponseFormat.Json - ? 
JsonConvert.DeserializeObject(response.Body)?.Text - : response.Body; + return response.Body; } } } diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs new file mode 100644 index 00000000..cac4b419 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs @@ -0,0 +1,62 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace OpenAI.Audio +{ + [Preserve] + public class AudioResponse + { + [Preserve] + [JsonConstructor] + public AudioResponse( + [JsonProperty("language")] string language, + [JsonProperty("duration")] double? duration, + [JsonProperty("text")] string text, + [JsonProperty("words")] TranscriptionWord[] words, + [JsonProperty("segments")] TranscriptionSegment[] segments) + { + Language = language; + Duration = duration; + Text = text; + Words = words; + Segments = segments; + } + + /// + /// The language of the input audio. + /// + [Preserve] + [JsonProperty("language")] + public string Language { get; } + + /// + /// The duration of the input audio. + /// + [Preserve] + [JsonProperty("duration")] + public double? Duration { get; } + + /// + /// The transcribed text. + /// + [Preserve] + [JsonProperty("text")] + public string Text { get; } + + /// + /// Extracted words and their corresponding timestamps. + /// + [Preserve] + [JsonProperty("words")] + public TranscriptionWord[] Words { get; } + + /// + /// Segments of the transcribed text and their corresponding details. + /// + [Preserve] + [JsonProperty("segments")] + public TranscriptionSegment[] Segments { get; } + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta new file mode 100644 index 00000000..cef8d2e0 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 31e77051dcea1474192df60c0d5a0052 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs index 1b4686a0..76cd0b4f 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs @@ -41,14 +41,21 @@ public sealed class AudioTranscriptionRequest : IDisposable /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. /// + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + /// public AudioTranscriptionRequest( string audioPath, string model = null, string prompt = null, AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null, - string language = null) - : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language) + string language = null, + TimestampGranularity timestampGranularity = TimestampGranularity.None) + : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language, timestampGranularity) { } @@ -84,14 +91,21 @@ public AudioTranscriptionRequest( /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. /// + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + /// public AudioTranscriptionRequest( AudioClip audio, string model = null, string prompt = null, AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null, - string language = null) - : this(new MemoryStream(audio.EncodeToWav()), $"{audio.name}.wav", model, prompt, responseFormat, temperature, language) + string language = null, + TimestampGranularity timestampGranularity = TimestampGranularity.None) + : this(new MemoryStream(audio.EncodeToWav()), $"{audio.name}.wav", model, prompt, responseFormat, temperature, language, timestampGranularity) { } @@ -130,6 +144,12 @@ public AudioTranscriptionRequest( /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. /// + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + /// public AudioTranscriptionRequest( Stream audio, string audioName, @@ -137,7 +157,8 @@ public AudioTranscriptionRequest( string prompt = null, AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null, - string language = null) + string language = null, + TimestampGranularity timestampGranularity = TimestampGranularity.None) { Audio = audio; @@ -152,6 +173,13 @@ public AudioTranscriptionRequest( ResponseFormat = responseFormat; Temperature = temperature; Language = language; + + if (timestampGranularity != TimestampGranularity.None && responseFormat != AudioResponseFormat.Verbose_Json) + { + throw new ArgumentException($"{nameof(responseFormat)} must be set {AudioResponseFormat.Verbose_Json} to use timestamp granularities."); + } + + TimestampGranularities = timestampGranularity; } ~AudioTranscriptionRequest() => Dispose(false); @@ -202,6 +230,14 @@ public AudioTranscriptionRequest( /// public string Language { get; } + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + ///
+ public TimestampGranularity TimestampGranularities { get; } + private void Dispose(bool disposing) { if (disposing) diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs index 51695d8c..3f7ca3ff 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs @@ -35,7 +35,7 @@ public sealed class AudioTranslationRequest : IDisposable public AudioTranslationRequest( string audioPath, string model = null, - string prompt = null, + string prompt = "response should be in english.", AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null) : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature) diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs index 9a1408ea..68574c01 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs @@ -53,12 +53,12 @@ public SpeechRequest(string input, Model model = null, SpeechVoice voice = Speec public SpeechVoice Voice { get; } /// - /// The format to audio in. Supported formats are mp3, opus, aac, and flac. + /// The format to audio in. Supported formats are mp3, opus, aac, flac, wav and pcm. /// [Preserve] [JsonProperty("response_format", DefaultValueHandling = DefaultValueHandling.Include)] - [FunctionProperty("The format to audio in. Supported formats are mp3, opus, aac, and flac.", false, SpeechResponseFormat.MP3)] - public SpeechResponseFormat ResponseFormat { get; } + [FunctionProperty("The format to audio in. Supported formats are mp3, opus, aac, flac, wav and pcm.", false, SpeechResponseFormat.MP3)] + public SpeechResponseFormat ResponseFormat { get; internal set; } /// /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default. diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs index 43fc4205..2b3afe4d 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs @@ -13,6 +13,10 @@ public enum SpeechResponseFormat [EnumMember(Value = "aac")] AAC, [EnumMember(Value = "flac")] - Flac + Flac, + [EnumMember(Value = "wav")] + WAV, + [EnumMember(Value = "pcm")] + PCM } } diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs new file mode 100644 index 00000000..c7c6028f --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs @@ -0,0 +1,11 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+ +namespace OpenAI.Audio +{ + public enum TimestampGranularity + { + None = 0, + Word, + Segment + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta new file mode 100644 index 00000000..cc3e64fd --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 5ed575abdaf01c949862e343cb3bfcc6 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs new file mode 100644 index 00000000..f517b1b2 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs @@ -0,0 +1,108 @@ +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace OpenAI.Audio +{ + [Preserve] + public sealed class TranscriptionSegment + { + [Preserve] + [JsonConstructor] + public TranscriptionSegment( + [JsonProperty("id")] int id, + [JsonProperty("seek")] int seek, + [JsonProperty("start")] double start, + [JsonProperty("end")] double end, + [JsonProperty("text")] string text, + [JsonProperty("tokens")] int[] tokens, + [JsonProperty("temperature")] double temperature, + [JsonProperty("avg_logprob")] double averageLogProbability, + [JsonProperty("compression_ratio")] double compressionRatio, + [JsonProperty("no_speech_prob")] double noSpeechProbability) + { + Id = id; + Seek = seek; + Start = start; + End = end; + Text = text; + Tokens = tokens; + Temperature = temperature; + AverageLogProbability = averageLogProbability; + CompressionRatio = compressionRatio; + NoSpeechProbability = noSpeechProbability; + } + + /// + /// Unique identifier of the segment. + /// + [Preserve] + [JsonProperty("id")] + public int Id { get; } + + /// + /// Seek offset of the segment. + /// + [Preserve] + [JsonProperty("seek")] + public int Seek { get; } + + /// + /// Start time of the segment in seconds. + /// + [Preserve] + [JsonProperty("start")] + public double Start { get; } + + /// + /// End time of the segment in seconds. + /// + [Preserve] + [JsonProperty("end")] + public double End { get; } + + /// + /// Text content of the segment. + /// + [Preserve] + [JsonProperty("text")] + public string Text { get; } + + /// + /// Array of token IDs for the text content. + /// + [Preserve] + [JsonProperty("tokens")] + public int[] Tokens { get; } + + /// + /// Temperature parameter used for generating the segment. + /// + [Preserve] + [JsonProperty("temperature")] + public double Temperature { get; } + + /// + /// Average logprob of the segment. + /// If the value is lower than -1, consider the logprobs failed. + /// + [Preserve] + [JsonProperty("avg_logprob")] + public double AverageLogProbability { get; } + + /// + /// Compression ratio of the segment. + /// If the value is greater than 2.4, consider the compression failed. + /// + [Preserve] + [JsonProperty("compression_ratio")] + public double CompressionRatio { get; } + + /// + /// Probability of no speech in the segment. + /// If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent. 
+ /// + [Preserve] + [JsonProperty("no_speech_prob")] + public double NoSpeechProbability { get; } + } +} \ No newline at end of file diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta new file mode 100644 index 00000000..13ee2434 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: ddfdacc2cf4ac0a4d94e6df03b6b70ab +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs new file mode 100644 index 00000000..f54a9b82 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs @@ -0,0 +1,42 @@ +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace OpenAI.Audio +{ + [Preserve] + public sealed class TranscriptionWord + { + [Preserve] + [JsonConstructor] + public TranscriptionWord( + [JsonProperty("word")] string word, + [JsonProperty("start")] double start, + [JsonProperty("end")] double end) + { + Word = word; + Start = start; + End = end; + } + + /// + /// The text content of the word. + /// + [Preserve] + [JsonProperty("word")] + public string Word { get; } + + /// + /// Start time of the word in seconds. + /// + [Preserve] + [JsonProperty("start")] + public double Start { get; } + + /// + /// End time of the word in seconds. + /// + [Preserve] + [JsonProperty("end")] + public double End { get; } + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta new file mode 100644 index 00000000..344e2973 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 98528e8960e95fd4eb83731aec127f77 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs b/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs index 77881802..d6b4f6c1 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs +++ b/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs @@ -6,6 +6,7 @@ using OpenAI.Models; using System; using System.Collections.Generic; +using System.Threading; using System.Threading.Tasks; using TMPro; using UnityEngine; @@ -45,6 +46,9 @@ public class ChatBehaviour : MonoBehaviour [SerializeField] private AudioSource audioSource; + [SerializeField] + private SpeechVoice voice; + [SerializeField] [TextArea(3, 10)] private string systemPrompt = "You are a helpful assistant.\n- If an image is requested then use \"![Image](output.jpg)\" to display it.\n- When performing function calls, use the defaults unless explicitly told to use a specific value.\n- Images should always be generated in base64."; @@ -115,7 +119,7 @@ private async void SubmitChat() assistantMessageContent.text += response.ToString().Replace("![Image](output.jpg)", string.Empty); } - GenerateSpeech(response); 
+ await GenerateSpeechAsync(response, destroyCancellationToken); } catch (Exception e) { @@ -212,18 +216,70 @@ async Task ProcessToolCall() } } - private async void GenerateSpeech(string text) + private async Task GenerateSpeechAsync(string text, CancellationToken cancellationToken) { text = text.Replace("![Image](output.jpg)", string.Empty); if (string.IsNullOrWhiteSpace(text)) { return; } - var request = new SpeechRequest(text, Model.TTS_1); - var (clipPath, clip) = await openAI.AudioEndpoint.CreateSpeechAsync(request, destroyCancellationToken); - audioSource.PlayOneShot(clip); + var request = new SpeechRequest(text, Model.TTS_1, voice, SpeechResponseFormat.PCM); + var streamClipQueue = new Queue(); + var streamTcs = new TaskCompletionSource(); + var audioPlaybackTask = PlayStreamQueueAsync(streamTcs.Task); + var (clipPath, fullClip) = await openAI.AudioEndpoint.CreateSpeechStreamAsync(request, clip => streamClipQueue.Enqueue(clip), destroyCancellationToken); + streamTcs.SetResult(true); if (enableDebug) { Debug.Log(clipPath); } + + await audioPlaybackTask; + audioSource.clip = fullClip; + + async Task PlayStreamQueueAsync(Task streamTask) + { + try + { + await new WaitUntil(() => streamClipQueue.Count > 0); + var endOfFrame = new WaitForEndOfFrame(); + + do + { + if (!audioSource.isPlaying && + streamClipQueue.TryDequeue(out var clip)) + { + if (enableDebug) + { + Debug.Log($"playing partial clip: {clip.name} | ({streamClipQueue.Count} remaining)"); + } + + audioSource.PlayOneShot(clip); + // ReSharper disable once MethodSupportsCancellation + await Task.Delay(TimeSpan.FromSeconds(clip.length)).ConfigureAwait(true); + } + else + { + await endOfFrame; + } + + if (streamTask.IsCompleted && !audioSource.isPlaying && streamClipQueue.Count == 0) + { + return; + } + } while (!cancellationToken.IsCancellationRequested); + } + catch (Exception e) + { + switch (e) + { + case TaskCanceledException: + case OperationCanceledException: + break; + default: + Debug.LogError(e); + break; + } + } + } } private TextMeshProUGUI AddNewTextMessageContent(Role role) @@ -282,7 +338,7 @@ private async void ProcessRecording(Tuple recording) { recordButton.interactable = false; var request = new AudioTranscriptionRequest(clip, temperature: 0.1f, language: "en"); - var userInput = await openAI.AudioEndpoint.CreateTranscriptionAsync(request, destroyCancellationToken); + var userInput = await openAI.AudioEndpoint.CreateTranscriptionTextAsync(request, destroyCancellationToken); if (enableDebug) { diff --git a/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs b/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs index bc0877ea..40843567 100644 --- a/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs +++ b/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs @@ -3,6 +3,7 @@ using NUnit.Framework; using OpenAI.Audio; using System; +using System.Collections.Concurrent; using System.IO; using System.Threading.Tasks; using UnityEditor; @@ -17,10 +18,10 @@ public async Task Test_01_01_Transcription_Path() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); - using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioPath), temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request); - Assert.IsNotNull(result); - Debug.Log(result); + using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioPath), responseFormat: 
AudioResponseFormat.Text, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); + Assert.IsNotNull(response); + Debug.Log(response); } [Test] @@ -29,10 +30,40 @@ public async Task Test_01_02_Transcription_AudioClip() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); - using var request = new AudioTranscriptionRequest(audioClip, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request); - Assert.IsNotNull(result); - Debug.Log(result); + using var request = new AudioTranscriptionRequest(audioClip, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); + Assert.IsNotNull(response); + Debug.Log(response); + } + + [Test] + public async Task Test_01_03_01_Transcription_VerboseJson() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); + var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); + using var request = new AudioTranscriptionRequest(audioClip, responseFormat: AudioResponseFormat.Verbose_Json, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); + Assert.IsNotNull(response.Duration); + Assert.IsTrue(response.Language == "english"); + Assert.IsNotNull(response.Segments); + Assert.IsNotEmpty(response.Segments); + } + + [Test] + public async Task Test_01_03_02_Transcription_VerboseJson_WordSimilarities() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); + var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); + using var request = new AudioTranscriptionRequest(audioClip, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); + Assert.IsNotNull(response.Duration); + Assert.IsTrue(response.Language == "english"); + Assert.IsNotNull(response.Words); + Assert.IsNotEmpty(response.Words); } [Test] @@ -41,7 +72,7 @@ public async Task Test_02_01_Translation_Path() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("3ab176222366dc241894506c315c6fa4"); using var request = new AudioTranslationRequest(Path.GetFullPath(audioPath)); - var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request); + var result = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request); Assert.IsNotNull(result); Debug.Log(result); } @@ -52,14 +83,14 @@ public async Task Test_02_02_Translation_AudioClip() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("3ab176222366dc241894506c315c6fa4"); var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); - using var request = new AudioTranslationRequest(audioClip); - var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request); - Assert.IsNotNull(result); - Debug.Log(result); + using var request = new AudioTranslationRequest(audioClip, prompt: "responses should be in spanish."); + var response = await 
OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request); + Assert.IsNotNull(response); + Debug.Log(response); } [Test] - public async Task Test_3_Speech() + public async Task Test_03_01_Speech() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var request = new SpeechRequest("Hello world!"); @@ -67,5 +98,17 @@ public async Task Test_3_Speech() Debug.Log(path); Assert.IsNotNull(clip); } + + [Test] + public async Task Test_03_02_Speech_Streaming() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM); + var clipQueue = new ConcurrentQueue(); + var (path, clip) = await OpenAIClient.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => clipQueue.Enqueue(partialClip)); + Debug.Log(path); + Assert.IsNotNull(clip); + Assert.IsTrue(clipQueue.Count > 0); + } } } diff --git a/OpenAI/Packages/com.openai.unity/package.json b/OpenAI/Packages/com.openai.unity/package.json index fe0d8160..c30b955e 100644 --- a/OpenAI/Packages/com.openai.unity/package.json +++ b/OpenAI/Packages/com.openai.unity/package.json @@ -3,7 +3,7 @@ "displayName": "OpenAI", "description": "A OpenAI package for the Unity Game Engine to use GPT-4, GPT-3.5, GPT-3 and Dall-E though their RESTful API (currently in beta).\n\nIndependently developed, this is not an official library and I am not affiliated with OpenAI.\n\nAn OpenAI API account is required.", "keywords": [], - "version": "7.7.5", + "version": "7.7.6", "unity": "2021.3", "documentationUrl": "https://github.com/RageAgainstThePixel/com.openai.unity#documentation", "changelogUrl": "https://github.com/RageAgainstThePixel/com.openai.unity/releases", @@ -17,7 +17,7 @@ "url": "https://github.com/StephenHodgson" }, "dependencies": { - "com.utilities.rest": "2.5.4", + "com.utilities.rest": "2.5.5", "com.utilities.encoder.wav": "1.1.5" }, "samples": [ diff --git a/OpenAI/Packages/manifest.json b/OpenAI/Packages/manifest.json index c322539c..7f6b7ccc 100644 --- a/OpenAI/Packages/manifest.json +++ b/OpenAI/Packages/manifest.json @@ -3,7 +3,7 @@ "com.unity.ide.rider": "3.0.28", "com.unity.ide.visualstudio": "2.0.22", "com.unity.textmeshpro": "3.0.8", - "com.utilities.buildpipeline": "1.2.3" + "com.utilities.buildpipeline": "1.3.0" }, "scopedRegistries": [ { diff --git a/README.md b/README.md index c273665b..1c0a5310 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ The recommended installation method is though the unity package manager and [Ope - [Json Mode](#chat-json-mode) - [Audio](#audio) - [Create Speech](#create-speech) + - [Stream Speech](#stream-speech) :new: - [Create Transcription](#create-transcription) - [Create Translation](#create-translation) - [Images](#images) @@ -1047,6 +1048,18 @@ var api = new OpenAIClient(); var request = new SpeechRequest("Hello world!"); var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request); audioSource.PlayOneShot(clip); +Debug.Log(path); +``` + +##### [Stream Speech] + +Generate streamed audio from the input text. 
+
+```csharp
+var api = new OpenAIClient();
+var request = new SpeechRequest("Hello world!");
+var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip));
+Debug.Log(path);
 ```
 
 #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
@@ -1060,6 +1073,19 @@ var result = await api.AudioEndpoint.CreateTranscriptionAsync(request);
 Debug.Log(result);
 ```
 
+You can also get detailed information using `verbose_json` to get timestamp granularities:
+
+```csharp
+var api = new OpenAIClient();
+using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+
+foreach (var word in response.Words)
+{
+    Debug.Log($"[{word.Start}-{word.End}] \"{word.Word}\"");
+}
+```
+
 #### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation)
 
 Translates audio into into English.
@@ -1187,7 +1213,7 @@ Returns information about a specific file.
 
 ```csharp
 var api = new OpenAIClient();
-var file = await GetFileInfoAsync(fileId);
+var file = await api.FilesEndpoint.GetFileInfoAsync(fileId);
 Debug.Log($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes");
 ```
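
Streamed partial clips can arrive faster than they finish playing, so calling `PlayOneShot` directly from the callback may overlap audio. Below is a minimal sketch of sequential playback, loosely modeled on the queue used in the updated `ChatBehaviour` sample; the `StreamedSpeechPlayer` class, its field names, and the `Speak` entry point are illustrative assumptions, not part of the package API.

```csharp
// Sketch only: assumes an AudioSource assigned in the inspector and a configured OpenAI API key.
using System.Collections.Generic;
using OpenAI;
using OpenAI.Audio;
using UnityEngine;

public class StreamedSpeechPlayer : MonoBehaviour
{
    [SerializeField]
    private AudioSource audioSource;

    private readonly Queue<AudioClip> clipQueue = new Queue<AudioClip>();

    public async void Speak(string text)
    {
        var api = new OpenAIClient();
        // PCM is required for streaming; other formats are downloaded as a single clip.
        var request = new SpeechRequest(text, responseFormat: SpeechResponseFormat.PCM);
        // Each partial clip is queued as it arrives; the full clip and cached path are returned at the end.
        var (path, fullClip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => clipQueue.Enqueue(partialClip));
        Debug.Log($"{path} | {fullClip.length:F2}s total");
    }

    private void Update()
    {
        // Dequeue one partial clip at a time so playback does not overlap.
        if (!audioSource.isPlaying && clipQueue.TryDequeue(out var clip))
        {
            audioSource.PlayOneShot(clip);
        }
    }
}
```

If the callback may fire off the main thread in your setup, a `ConcurrentQueue<AudioClip>` (as used in the new `Test_03_02_Speech_Streaming` test) is the safer choice.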