From 93eed233ab706b56a1ea683eb49224a496f67b9b Mon Sep 17 00:00:00 2001
From: Stephen Hodgson
Date: Mon, 18 Mar 2024 21:39:43 -0400
Subject: [PATCH] com.openai.unity 7.7.6 (#210)

- Added support for streaming text to speech
- Added AudioEndpoint.CreateSpeechStreamAsync(SpeechRequest, Action<AudioClip>, CancellationToken)
- Added support for Audio Transcription and Translation verbose json output
- Added support for timestamp granularities for segments and words
- Marked CreateTranscriptionAsync obsolete
- Added CreateTranscriptionTextAsync
- Added CreateTranscriptionJsonAsync
- Marked CreateTranslationAsync obsolete
- Added CreateTranslationTextAsync
- Added CreateTranslationJsonAsync
- Updated SpeechResponseFormat to include wav and pcm
---
 .../com.openai.unity/Documentation~/README.md |  28 ++-
 .../Runtime/Audio/AudioEndpoint.cs            | 160 ++++++++++++++----
 .../Runtime/Audio/AudioResponse.cs            |  62 +++++++
 .../Runtime/Audio/AudioResponse.cs.meta       |  11 ++
 .../Audio/AudioTranscriptionRequest.cs        |  46 ++++-
 .../Runtime/Audio/AudioTranslationRequest.cs  |   2 +-
 .../Runtime/Audio/SpeechRequest.cs            |   6 +-
 .../Runtime/Audio/SpeechResponseFormat.cs     |   6 +-
 .../Runtime/Audio/TimestampGranularity.cs     |  11 ++
 .../Audio/TimestampGranularity.cs.meta        |  11 ++
 .../Runtime/Audio/TranscriptionSegment.cs     | 108 ++++++++++++
 .../Audio/TranscriptionSegment.cs.meta        |  11 ++
 .../Runtime/Audio/TranscriptionWord.cs        |  42 +++++
 .../Runtime/Audio/TranscriptionWord.cs.meta   |  11 ++
 .../Samples~/Chat/ChatBehaviour.cs            |  68 +++++++-
 .../Tests/TestFixture_07_Audio.cs             |  71 ++++++--
 OpenAI/Packages/com.openai.unity/package.json |   4 +-
 OpenAI/Packages/manifest.json                 |   2 +-
 README.md                                     |  28 ++-
 19 files changed, 623 insertions(+), 65 deletions(-)
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs
 create mode 100644 OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta

diff --git a/OpenAI/Packages/com.openai.unity/Documentation~/README.md b/OpenAI/Packages/com.openai.unity/Documentation~/README.md
index 45dc3944..6f10d81d 100644
--- a/OpenAI/Packages/com.openai.unity/Documentation~/README.md
+++ b/OpenAI/Packages/com.openai.unity/Documentation~/README.md
@@ -103,6 +103,7 @@ The recommended installation method is though the unity package manager and [Ope
 - [Json Mode](#chat-json-mode)
 - [Audio](#audio)
   - [Create Speech](#create-speech)
+  - [Stream Speech](#stream-speech) :new:
   - [Create Transcription](#create-transcription)
   - [Create Translation](#create-translation)
 - [Images](#images)
@@ -1047,6 +1048,18 @@
 var api = new OpenAIClient();
 var request = new SpeechRequest("Hello world!");
 var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request);
 audioSource.PlayOneShot(clip);
+Debug.Log(path);
+```
+
+##### [Stream Speech]
+
+Generate streamed audio from the input text.
+ +```csharp +var api = new OpenAIClient(); +var request = new SpeechRequest("Hello world!"); +var (path, clip) = await OpenAIClient.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip)); +Debug.Log(path); ``` #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription) @@ -1060,6 +1073,19 @@ var result = await api.AudioEndpoint.CreateTranscriptionAsync(request); Debug.Log(result); ``` +You can also get detailed information using `verbose_json` to get timestamp granularities: + +```csharp +var api = new OpenAIClient(); +using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en"); +var response = await api.AudioEndpoint.CreateTranscriptionTextAsync(request); + +foreach (var word in response.Words) +{ + Debug.Log($"[{word.Start}-{word.End}] \"{word.Word}\""); +} +``` + #### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation) Translates audio into into English. @@ -1187,7 +1213,7 @@ Returns information about a specific file. ```csharp var api = new OpenAIClient(); -var file = await GetFileInfoAsync(fileId); +var file = await api.FilesEndpoint.GetFileInfoAsync(fileId); Debug.Log($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes"); ``` diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs index eccde182..eb4f66f9 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs @@ -7,7 +7,6 @@ using System.Threading.Tasks; using UnityEngine; using UnityEngine.Networking; -using UnityEngine.Scripting; using Utilities.WebRequestRest; namespace OpenAI.Audio @@ -18,21 +17,6 @@ namespace OpenAI.Audio /// public sealed class AudioEndpoint : OpenAIBaseEndpoint { - [Preserve] - private class AudioResponse - { - [Preserve] - [JsonConstructor] - public AudioResponse([JsonProperty("text")] string text) - { - Text = text; - } - - [Preserve] - [JsonProperty("text")] - public string Text { get; } - } - internal AudioEndpoint(OpenAIClient client) : base(client) { } /// @@ -48,15 +32,29 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { } /// and the cached path. [Function("Generates audio from the input text.")] public async Task> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default) + => await CreateSpeechStreamAsync(request, null, cancellationToken); + + /// + /// Generates streaming audio from the input text. + /// + /// . + /// Optional, partial callback used to stream audio. + /// Optional, . + /// and the cached path. + [Function("Generates streaming audio from the input text.")] + public async Task> CreateSpeechStreamAsync(SpeechRequest request, Action partialClipCallback, CancellationToken cancellationToken = default) { - var audioFormat = request.ResponseFormat switch + if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM) { - SpeechResponseFormat.MP3 => AudioType.MPEG, - _ => throw new NotSupportedException(request.ResponseFormat.ToString()) - }; + Debug.LogWarning("Speech streaming only supported with PCM response format. 
Overriding to PCM..."); + request.ResponseFormat = SpeechResponseFormat.PCM; + } + var ext = request.ResponseFormat switch { SpeechResponseFormat.MP3 => "mp3", + SpeechResponseFormat.WAV => "wav", + SpeechResponseFormat.PCM => "pcm", _ => throw new NotSupportedException(request.ResponseFormat.ToString()) }; var payload = JsonConvert.SerializeObject(request, OpenAIClient.JsonSerializationOptions); @@ -64,9 +62,48 @@ public async Task> CreateSpeechAsync(SpeechRequest requ lock (mutex) { - clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssff}.{ext}"; + clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssfffff}.{ext}"; } + Rest.TryGetDownloadCacheItem(clipName, out var cachedPath); + + if (request.ResponseFormat == SpeechResponseFormat.PCM) + { + var part = 0; + var response = await Rest.PostAsync( + GetUrl("/speech"), + payload, + StreamCallback, + eventChunkSize: 8192, + new RestParameters(client.DefaultRequestHeaders), + cancellationToken); + response.Validate(EnableDebug); + var samples = Utilities.Audio.PCMEncoder.Decode(response.Data); + await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true); + return new Tuple(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false)); + + void StreamCallback(Response partialResponse) + { + var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data); + var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false); + + if (!partialClip.SetData(chunk, 0)) + { + Debug.LogError("Failed to set pcm data to partial clip."); + return; + } + + partialClipCallback?.Invoke(partialClip); + } + } + + var audioFormat = request.ResponseFormat switch + { + SpeechResponseFormat.MP3 => AudioType.MPEG, + SpeechResponseFormat.WAV => AudioType.WAV, + _ => throw new NotSupportedException(request.ResponseFormat.ToString()) + }; + var clip = await Rest.DownloadAudioClipAsync( GetUrl("/speech"), audioFormat, @@ -75,17 +112,46 @@ public async Task> CreateSpeechAsync(SpeechRequest requ payload, parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug), cancellationToken: cancellationToken); - Rest.TryGetDownloadCacheItem(clipName, out var cachedPath); return new Tuple(cachedPath, clip); } + [Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")] + public async Task CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + => await CreateTranscriptionTextAsync(request, cancellationToken); + /// /// Transcribes audio into the input language. /// /// . /// Optional, . /// The transcribed text. - public async Task CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + public async Task CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + { + var response = await Internal_CreateTranscriptionAsync(request, cancellationToken); + return request.ResponseFormat == AudioResponseFormat.Json + ? JsonConvert.DeserializeObject(response)?.Text + : response; + } + + /// + /// Transcribes audio into the input language. + /// + /// This method expects the request format to be either or . + /// . + /// Optional, . + /// . 
+ public async Task CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) + { + if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json)) + { + throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat)); + } + + var response = await Internal_CreateTranscriptionAsync(request, cancellationToken); + return JsonConvert.DeserializeObject(response); + } + + private async Task Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default) { var form = new WWWForm(); using var audioData = new MemoryStream(); @@ -111,22 +177,58 @@ public async Task CreateTranscriptionAsync(AudioTranscriptionRequest req form.AddField("language", request.Language); } + switch (request.TimestampGranularities) + { + case TimestampGranularity.Segment: + case TimestampGranularity.Word: + form.AddField("timestamp_granularities[]", request.TimestampGranularities.ToString().ToLower()); + break; + } + request.Dispose(); var response = await Rest.PostAsync(GetUrl("/transcriptions"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken); response.Validate(EnableDebug); - return responseFormat == AudioResponseFormat.Json - ? JsonConvert.DeserializeObject(response.Body)?.Text - : response.Body; + return response.Body; } + [Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")] + public async Task CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + => await CreateTranslationTextAsync(request, cancellationToken); + /// /// Translates audio into English. /// /// /// /// The translated text. - public async Task CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + public async Task CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + { + var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken); + return request.ResponseFormat == AudioResponseFormat.Json + ? JsonConvert.DeserializeObject(responseAsString)?.Text + : responseAsString; + } + + /// + /// Translates audio into English. + /// + /// + /// + /// + /// + public async Task CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default) + { + if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json)) + { + throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat)); + } + + var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken); + return JsonConvert.DeserializeObject(responseAsString); + } + + private async Task Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken) { var form = new WWWForm(); using var audioData = new MemoryStream(); @@ -151,9 +253,7 @@ public async Task CreateTranslationAsync(AudioTranslationRequest request var response = await Rest.PostAsync(GetUrl("/translations"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken); response.Validate(EnableDebug); - return responseFormat == AudioResponseFormat.Json - ? 
JsonConvert.DeserializeObject(response.Body)?.Text - : response.Body; + return response.Body; } } } diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs new file mode 100644 index 00000000..cac4b419 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs @@ -0,0 +1,62 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. + +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace OpenAI.Audio +{ + [Preserve] + public class AudioResponse + { + [Preserve] + [JsonConstructor] + public AudioResponse( + [JsonProperty("language")] string language, + [JsonProperty("duration")] double? duration, + [JsonProperty("text")] string text, + [JsonProperty("words")] TranscriptionWord[] words, + [JsonProperty("segments")] TranscriptionSegment[] segments) + { + Language = language; + Duration = duration; + Text = text; + Words = words; + Segments = segments; + } + + /// + /// The language of the input audio. + /// + [Preserve] + [JsonProperty("language")] + public string Language { get; } + + /// + /// The duration of the input audio. + /// + [Preserve] + [JsonProperty("duration")] + public double? Duration { get; } + + /// + /// The transcribed text. + /// + [Preserve] + [JsonProperty("text")] + public string Text { get; } + + /// + /// Extracted words and their corresponding timestamps. + /// + [Preserve] + [JsonProperty("words")] + public TranscriptionWord[] Words { get; } + + /// + /// Segments of the transcribed text and their corresponding details. + /// + [Preserve] + [JsonProperty("segments")] + public TranscriptionSegment[] Segments { get; } + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta new file mode 100644 index 00000000..cef8d2e0 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 31e77051dcea1474192df60c0d5a0052 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs index 1b4686a0..76cd0b4f 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranscriptionRequest.cs @@ -41,14 +41,21 @@ public sealed class AudioTranscriptionRequest : IDisposable /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. /// + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + /// public AudioTranscriptionRequest( string audioPath, string model = null, string prompt = null, AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null, - string language = null) - : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language) + string language = null, + TimestampGranularity timestampGranularity = TimestampGranularity.None) + : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language, timestampGranularity) { } @@ -84,14 +91,21 @@ public AudioTranscriptionRequest( /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. /// + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + /// public AudioTranscriptionRequest( AudioClip audio, string model = null, string prompt = null, AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null, - string language = null) - : this(new MemoryStream(audio.EncodeToWav()), $"{audio.name}.wav", model, prompt, responseFormat, temperature, language) + string language = null, + TimestampGranularity timestampGranularity = TimestampGranularity.None) + : this(new MemoryStream(audio.EncodeToWav()), $"{audio.name}.wav", model, prompt, responseFormat, temperature, language, timestampGranularity) { } @@ -130,6 +144,12 @@ public AudioTranscriptionRequest( /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. /// + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + /// public AudioTranscriptionRequest( Stream audio, string audioName, @@ -137,7 +157,8 @@ public AudioTranscriptionRequest( string prompt = null, AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null, - string language = null) + string language = null, + TimestampGranularity timestampGranularity = TimestampGranularity.None) { Audio = audio; @@ -152,6 +173,13 @@ public AudioTranscriptionRequest( ResponseFormat = responseFormat; Temperature = temperature; Language = language; + + if (timestampGranularity != TimestampGranularity.None && responseFormat != AudioResponseFormat.Verbose_Json) + { + throw new ArgumentException($"{nameof(responseFormat)} must be set {AudioResponseFormat.Verbose_Json} to use timestamp granularities."); + } + + TimestampGranularities = timestampGranularity; } ~AudioTranscriptionRequest() => Dispose(false); @@ -202,6 +230,14 @@ public AudioTranscriptionRequest( /// public string Language { get; } + /// + /// The timestamp granularities to populate for this transcription. + /// response_format must be set verbose_json to use timestamp granularities. + /// Either or both of these options are supported: , or .
+ /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + ///
+ public TimestampGranularity TimestampGranularities { get; } + private void Dispose(bool disposing) { if (disposing) diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs index 51695d8c..3f7ca3ff 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioTranslationRequest.cs @@ -35,7 +35,7 @@ public sealed class AudioTranslationRequest : IDisposable public AudioTranslationRequest( string audioPath, string model = null, - string prompt = null, + string prompt = "response should be in english.", AudioResponseFormat responseFormat = AudioResponseFormat.Json, float? temperature = null) : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature) diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs index 9a1408ea..68574c01 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechRequest.cs @@ -53,12 +53,12 @@ public SpeechRequest(string input, Model model = null, SpeechVoice voice = Speec public SpeechVoice Voice { get; } /// - /// The format to audio in. Supported formats are mp3, opus, aac, and flac. + /// The format to audio in. Supported formats are mp3, opus, aac, flac, wav and pcm. /// [Preserve] [JsonProperty("response_format", DefaultValueHandling = DefaultValueHandling.Include)] - [FunctionProperty("The format to audio in. Supported formats are mp3, opus, aac, and flac.", false, SpeechResponseFormat.MP3)] - public SpeechResponseFormat ResponseFormat { get; } + [FunctionProperty("The format to audio in. Supported formats are mp3, opus, aac, flac, wav and pcm.", false, SpeechResponseFormat.MP3)] + public SpeechResponseFormat ResponseFormat { get; internal set; } /// /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default. diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs index 43fc4205..2b3afe4d 100644 --- a/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechResponseFormat.cs @@ -13,6 +13,10 @@ public enum SpeechResponseFormat [EnumMember(Value = "aac")] AAC, [EnumMember(Value = "flac")] - Flac + Flac, + [EnumMember(Value = "wav")] + WAV, + [EnumMember(Value = "pcm")] + PCM } } diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs new file mode 100644 index 00000000..c7c6028f --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs @@ -0,0 +1,11 @@ +// Licensed under the MIT License. See LICENSE in the project root for license information. 
+ +namespace OpenAI.Audio +{ + public enum TimestampGranularity + { + None = 0, + Word, + Segment + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta new file mode 100644 index 00000000..cc3e64fd --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TimestampGranularity.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 5ed575abdaf01c949862e343cb3bfcc6 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs new file mode 100644 index 00000000..f517b1b2 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs @@ -0,0 +1,108 @@ +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace OpenAI.Audio +{ + [Preserve] + public sealed class TranscriptionSegment + { + [Preserve] + [JsonConstructor] + public TranscriptionSegment( + [JsonProperty("id")] int id, + [JsonProperty("seek")] int seek, + [JsonProperty("start")] double start, + [JsonProperty("end")] double end, + [JsonProperty("text")] string text, + [JsonProperty("tokens")] int[] tokens, + [JsonProperty("temperature")] double temperature, + [JsonProperty("avg_logprob")] double averageLogProbability, + [JsonProperty("compression_ratio")] double compressionRatio, + [JsonProperty("no_speech_prob")] double noSpeechProbability) + { + Id = id; + Seek = seek; + Start = start; + End = end; + Text = text; + Tokens = tokens; + Temperature = temperature; + AverageLogProbability = averageLogProbability; + CompressionRatio = compressionRatio; + NoSpeechProbability = noSpeechProbability; + } + + /// + /// Unique identifier of the segment. + /// + [Preserve] + [JsonProperty("id")] + public int Id { get; } + + /// + /// Seek offset of the segment. + /// + [Preserve] + [JsonProperty("seek")] + public int Seek { get; } + + /// + /// Start time of the segment in seconds. + /// + [Preserve] + [JsonProperty("start")] + public double Start { get; } + + /// + /// End time of the segment in seconds. + /// + [Preserve] + [JsonProperty("end")] + public double End { get; } + + /// + /// Text content of the segment. + /// + [Preserve] + [JsonProperty("text")] + public string Text { get; } + + /// + /// Array of token IDs for the text content. + /// + [Preserve] + [JsonProperty("tokens")] + public int[] Tokens { get; } + + /// + /// Temperature parameter used for generating the segment. + /// + [Preserve] + [JsonProperty("temperature")] + public double Temperature { get; } + + /// + /// Average logprob of the segment. + /// If the value is lower than -1, consider the logprobs failed. + /// + [Preserve] + [JsonProperty("avg_logprob")] + public double AverageLogProbability { get; } + + /// + /// Compression ratio of the segment. + /// If the value is greater than 2.4, consider the compression failed. + /// + [Preserve] + [JsonProperty("compression_ratio")] + public double CompressionRatio { get; } + + /// + /// Probability of no speech in the segment. + /// If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent. 
+ /// + [Preserve] + [JsonProperty("no_speech_prob")] + public double NoSpeechProbability { get; } + } +} \ No newline at end of file diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta new file mode 100644 index 00000000..13ee2434 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionSegment.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: ddfdacc2cf4ac0a4d94e6df03b6b70ab +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs new file mode 100644 index 00000000..f54a9b82 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs @@ -0,0 +1,42 @@ +using Newtonsoft.Json; +using UnityEngine.Scripting; + +namespace OpenAI.Audio +{ + [Preserve] + public sealed class TranscriptionWord + { + [Preserve] + [JsonConstructor] + public TranscriptionWord( + [JsonProperty("word")] string word, + [JsonProperty("start")] double start, + [JsonProperty("end")] double end) + { + Word = word; + Start = start; + End = end; + } + + /// + /// The text content of the word. + /// + [Preserve] + [JsonProperty("word")] + public string Word { get; } + + /// + /// Start time of the word in seconds. + /// + [Preserve] + [JsonProperty("start")] + public double Start { get; } + + /// + /// End time of the word in seconds. + /// + [Preserve] + [JsonProperty("end")] + public double End { get; } + } +} diff --git a/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta new file mode 100644 index 00000000..344e2973 --- /dev/null +++ b/OpenAI/Packages/com.openai.unity/Runtime/Audio/TranscriptionWord.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 98528e8960e95fd4eb83731aec127f77 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {fileID: 2800000, guid: 84a7eb8fc6eba7540bf56cea8e12249c, type: 3} + userData: + assetBundleName: + assetBundleVariant: diff --git a/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs b/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs index 77881802..d6b4f6c1 100644 --- a/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs +++ b/OpenAI/Packages/com.openai.unity/Samples~/Chat/ChatBehaviour.cs @@ -6,6 +6,7 @@ using OpenAI.Models; using System; using System.Collections.Generic; +using System.Threading; using System.Threading.Tasks; using TMPro; using UnityEngine; @@ -45,6 +46,9 @@ public class ChatBehaviour : MonoBehaviour [SerializeField] private AudioSource audioSource; + [SerializeField] + private SpeechVoice voice; + [SerializeField] [TextArea(3, 10)] private string systemPrompt = "You are a helpful assistant.\n- If an image is requested then use \"![Image](output.jpg)\" to display it.\n- When performing function calls, use the defaults unless explicitly told to use a specific value.\n- Images should always be generated in base64."; @@ -115,7 +119,7 @@ private async void SubmitChat() assistantMessageContent.text += response.ToString().Replace("![Image](output.jpg)", string.Empty); } - GenerateSpeech(response); 
+ await GenerateSpeechAsync(response, destroyCancellationToken); } catch (Exception e) { @@ -212,18 +216,70 @@ async Task ProcessToolCall() } } - private async void GenerateSpeech(string text) + private async Task GenerateSpeechAsync(string text, CancellationToken cancellationToken) { text = text.Replace("![Image](output.jpg)", string.Empty); if (string.IsNullOrWhiteSpace(text)) { return; } - var request = new SpeechRequest(text, Model.TTS_1); - var (clipPath, clip) = await openAI.AudioEndpoint.CreateSpeechAsync(request, destroyCancellationToken); - audioSource.PlayOneShot(clip); + var request = new SpeechRequest(text, Model.TTS_1, voice, SpeechResponseFormat.PCM); + var streamClipQueue = new Queue(); + var streamTcs = new TaskCompletionSource(); + var audioPlaybackTask = PlayStreamQueueAsync(streamTcs.Task); + var (clipPath, fullClip) = await openAI.AudioEndpoint.CreateSpeechStreamAsync(request, clip => streamClipQueue.Enqueue(clip), destroyCancellationToken); + streamTcs.SetResult(true); if (enableDebug) { Debug.Log(clipPath); } + + await audioPlaybackTask; + audioSource.clip = fullClip; + + async Task PlayStreamQueueAsync(Task streamTask) + { + try + { + await new WaitUntil(() => streamClipQueue.Count > 0); + var endOfFrame = new WaitForEndOfFrame(); + + do + { + if (!audioSource.isPlaying && + streamClipQueue.TryDequeue(out var clip)) + { + if (enableDebug) + { + Debug.Log($"playing partial clip: {clip.name} | ({streamClipQueue.Count} remaining)"); + } + + audioSource.PlayOneShot(clip); + // ReSharper disable once MethodSupportsCancellation + await Task.Delay(TimeSpan.FromSeconds(clip.length)).ConfigureAwait(true); + } + else + { + await endOfFrame; + } + + if (streamTask.IsCompleted && !audioSource.isPlaying && streamClipQueue.Count == 0) + { + return; + } + } while (!cancellationToken.IsCancellationRequested); + } + catch (Exception e) + { + switch (e) + { + case TaskCanceledException: + case OperationCanceledException: + break; + default: + Debug.LogError(e); + break; + } + } + } } private TextMeshProUGUI AddNewTextMessageContent(Role role) @@ -282,7 +338,7 @@ private async void ProcessRecording(Tuple recording) { recordButton.interactable = false; var request = new AudioTranscriptionRequest(clip, temperature: 0.1f, language: "en"); - var userInput = await openAI.AudioEndpoint.CreateTranscriptionAsync(request, destroyCancellationToken); + var userInput = await openAI.AudioEndpoint.CreateTranscriptionTextAsync(request, destroyCancellationToken); if (enableDebug) { diff --git a/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs b/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs index bc0877ea..40843567 100644 --- a/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs +++ b/OpenAI/Packages/com.openai.unity/Tests/TestFixture_07_Audio.cs @@ -3,6 +3,7 @@ using NUnit.Framework; using OpenAI.Audio; using System; +using System.Collections.Concurrent; using System.IO; using System.Threading.Tasks; using UnityEditor; @@ -17,10 +18,10 @@ public async Task Test_01_01_Transcription_Path() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); - using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioPath), temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request); - Assert.IsNotNull(result); - Debug.Log(result); + using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioPath), responseFormat: 
AudioResponseFormat.Text, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); + Assert.IsNotNull(response); + Debug.Log(response); } [Test] @@ -29,10 +30,40 @@ public async Task Test_01_02_Transcription_AudioClip() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); - using var request = new AudioTranscriptionRequest(audioClip, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request); - Assert.IsNotNull(result); - Debug.Log(result); + using var request = new AudioTranscriptionRequest(audioClip, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); + Assert.IsNotNull(response); + Debug.Log(response); + } + + [Test] + public async Task Test_01_03_01_Transcription_VerboseJson() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); + var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); + using var request = new AudioTranscriptionRequest(audioClip, responseFormat: AudioResponseFormat.Verbose_Json, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); + Assert.IsNotNull(response.Duration); + Assert.IsTrue(response.Language == "english"); + Assert.IsNotNull(response.Segments); + Assert.IsNotEmpty(response.Segments); + } + + [Test] + public async Task Test_01_03_02_Transcription_VerboseJson_WordSimilarities() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var audioPath = AssetDatabase.GUIDToAssetPath("259eaa73cab84284eac307d3134c3ade"); + var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); + using var request = new AudioTranscriptionRequest(audioClip, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en"); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); + Assert.IsNotNull(response.Duration); + Assert.IsTrue(response.Language == "english"); + Assert.IsNotNull(response.Words); + Assert.IsNotEmpty(response.Words); } [Test] @@ -41,7 +72,7 @@ public async Task Test_02_01_Translation_Path() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("3ab176222366dc241894506c315c6fa4"); using var request = new AudioTranslationRequest(Path.GetFullPath(audioPath)); - var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request); + var result = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request); Assert.IsNotNull(result); Debug.Log(result); } @@ -52,14 +83,14 @@ public async Task Test_02_02_Translation_AudioClip() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var audioPath = AssetDatabase.GUIDToAssetPath("3ab176222366dc241894506c315c6fa4"); var audioClip = AssetDatabase.LoadAssetAtPath(audioPath); - using var request = new AudioTranslationRequest(audioClip); - var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request); - Assert.IsNotNull(result); - Debug.Log(result); + using var request = new AudioTranslationRequest(audioClip, prompt: "responses should be in spanish."); + var response = await 
OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request); + Assert.IsNotNull(response); + Debug.Log(response); } [Test] - public async Task Test_3_Speech() + public async Task Test_03_01_Speech() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var request = new SpeechRequest("Hello world!"); @@ -67,5 +98,17 @@ public async Task Test_3_Speech() Debug.Log(path); Assert.IsNotNull(clip); } + + [Test] + public async Task Test_03_02_Speech_Streaming() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM); + var clipQueue = new ConcurrentQueue(); + var (path, clip) = await OpenAIClient.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => clipQueue.Enqueue(partialClip)); + Debug.Log(path); + Assert.IsNotNull(clip); + Assert.IsTrue(clipQueue.Count > 0); + } } } diff --git a/OpenAI/Packages/com.openai.unity/package.json b/OpenAI/Packages/com.openai.unity/package.json index fe0d8160..c30b955e 100644 --- a/OpenAI/Packages/com.openai.unity/package.json +++ b/OpenAI/Packages/com.openai.unity/package.json @@ -3,7 +3,7 @@ "displayName": "OpenAI", "description": "A OpenAI package for the Unity Game Engine to use GPT-4, GPT-3.5, GPT-3 and Dall-E though their RESTful API (currently in beta).\n\nIndependently developed, this is not an official library and I am not affiliated with OpenAI.\n\nAn OpenAI API account is required.", "keywords": [], - "version": "7.7.5", + "version": "7.7.6", "unity": "2021.3", "documentationUrl": "https://github.com/RageAgainstThePixel/com.openai.unity#documentation", "changelogUrl": "https://github.com/RageAgainstThePixel/com.openai.unity/releases", @@ -17,7 +17,7 @@ "url": "https://github.com/StephenHodgson" }, "dependencies": { - "com.utilities.rest": "2.5.4", + "com.utilities.rest": "2.5.5", "com.utilities.encoder.wav": "1.1.5" }, "samples": [ diff --git a/OpenAI/Packages/manifest.json b/OpenAI/Packages/manifest.json index c322539c..7f6b7ccc 100644 --- a/OpenAI/Packages/manifest.json +++ b/OpenAI/Packages/manifest.json @@ -3,7 +3,7 @@ "com.unity.ide.rider": "3.0.28", "com.unity.ide.visualstudio": "2.0.22", "com.unity.textmeshpro": "3.0.8", - "com.utilities.buildpipeline": "1.2.3" + "com.utilities.buildpipeline": "1.3.0" }, "scopedRegistries": [ { diff --git a/README.md b/README.md index c273665b..1c0a5310 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ The recommended installation method is though the unity package manager and [Ope - [Json Mode](#chat-json-mode) - [Audio](#audio) - [Create Speech](#create-speech) + - [Stream Speech](#stream-speech) :new: - [Create Transcription](#create-transcription) - [Create Translation](#create-translation) - [Images](#images) @@ -1047,6 +1048,18 @@ var api = new OpenAIClient(); var request = new SpeechRequest("Hello world!"); var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request); audioSource.PlayOneShot(clip); +Debug.Log(path); +``` + +##### [Stream Speech] + +Generate streamed audio from the input text. 
+
+```csharp
+var api = new OpenAIClient();
+var request = new SpeechRequest("Hello world!");
+var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip));
+Debug.Log(path);
 ```
 
 #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
@@ -1060,6 +1073,19 @@ var result = await api.AudioEndpoint.CreateTranscriptionAsync(request);
 Debug.Log(result);
 ```
 
+You can also get detailed information using `verbose_json` to get timestamp granularities:
+
+```csharp
+var api = new OpenAIClient();
+using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+
+foreach (var word in response.Words)
+{
+    Debug.Log($"[{word.Start}-{word.End}] \"{word.Word}\"");
+}
+```
+
 #### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation)
 
 Translates audio into into English.
@@ -1187,7 +1213,7 @@ Returns information about a specific file.
 
 ```csharp
 var api = new OpenAIClient();
-var file = await GetFileInfoAsync(fileId);
+var file = await api.FilesEndpoint.GetFileInfoAsync(fileId);
 Debug.Log($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes");
 ```
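
Streamed partial clips can arrive faster than they finish playing, so calling `PlayOneShot` directly from the callback may overlap audio. Below is a minimal sketch of sequential playback, loosely modeled on the queue used in the updated `ChatBehaviour` sample; the `StreamedSpeechPlayer` class, its field names, and the `Speak` entry point are illustrative assumptions, not part of the package API.

```csharp
// Sketch only: assumes an AudioSource assigned in the inspector and a configured OpenAI API key.
using System.Collections.Generic;
using OpenAI;
using OpenAI.Audio;
using UnityEngine;

public class StreamedSpeechPlayer : MonoBehaviour
{
    [SerializeField]
    private AudioSource audioSource;

    private readonly Queue<AudioClip> clipQueue = new Queue<AudioClip>();

    public async void Speak(string text)
    {
        var api = new OpenAIClient();
        // PCM is required for streaming; other formats are downloaded as a single clip.
        var request = new SpeechRequest(text, responseFormat: SpeechResponseFormat.PCM);
        // Each partial clip is queued as it arrives; the full clip and cached path are returned at the end.
        var (path, fullClip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => clipQueue.Enqueue(partialClip));
        Debug.Log($"{path} | {fullClip.length:F2}s total");
    }

    private void Update()
    {
        // Dequeue one partial clip at a time so playback does not overlap.
        if (!audioSource.isPlaying && clipQueue.TryDequeue(out var clip))
        {
            audioSource.PlayOneShot(clip);
        }
    }
}
```

If the callback may fire off the main thread in your setup, a `ConcurrentQueue<AudioClip>` (as used in the new `Test_03_02_Speech_Streaming` test) is the safer choice.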