Skip to content

Commit

Permalink
com.openai.unity 8.4.4 (#313)
Browse files Browse the repository at this point in the history
- refactored AudioEndpoint speech requests
  - added AudioEndpoint.GetSpeechAsync
  - deprecated AudioEndpoint.CreateSpeechAsync
  - deprecated AudioEndpoint.CreateSpeechStreamAsync
- added SpeechClip response
- added Realtime.ResponseAudioResponse.AudioSamples
- updated all sample scenes with new OnAudioFilterRead examples for better playback
- updated unit tests
  • Loading branch information
StephenHodgson authored Nov 25, 2024
1 parent 2e54704 commit b480299
Show file tree
Hide file tree
Showing 15 changed files with 755 additions and 669 deletions.
33 changes: 20 additions & 13 deletions OpenAI/Packages/com.openai.unity/Documentation~/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ The recommended installation method is through the unity package manager and [Ope
- [List Models](#list-models)
- [Retrieve Models](#retrieve-model)
- [Delete Fine Tuned Model](#delete-fine-tuned-model)
- [Realtime](#realtime) :new:
- [Create Realtime Session](#create-realtime-session) :new:
- [Client Events](#client-events) :new:
- [Sending Client Events](#sending-client-events) :new:
- [Server Events](#server-events) :new:
- [Receiving Server Events](#receiving-server-events) :new:
- [Realtime](#realtime)
- [Create Realtime Session](#create-realtime-session)
- [Client Events](#client-events)
- [Sending Client Events](#sending-client-events)
- [Server Events](#server-events)
- [Receiving Server Events](#receiving-server-events)
- [Assistants](#assistants)
- [List Assistants](#list-assistants)
- [Create Assistant](#create-assistant)
Expand Down Expand Up @@ -118,7 +118,7 @@ The recommended installation method is through the unity package manager and [Ope
- [Streaming](#chat-streaming)
- [Tools](#chat-tools)
- [Vision](#chat-vision)
- [Audio](#chat-audio) :new:
- [Audio](#chat-audio)
- [Structured Outputs](#chat-structured-outputs)
- [Json Mode](#chat-json-mode)
- [Audio](#audio)
Expand Down Expand Up @@ -1555,6 +1555,7 @@ Debug.Log($"{result.FirstChoice.Message.Role}: {result.FirstChoice} | Finish Rea
#### [Chat Audio](https://platform.openai.com/docs/guides/audio)

```csharp
var api = new OpenAIClient();
var messages = new List<Message>
{
new Message(Role.System, "You are a helpful assistant."),
Expand Down Expand Up @@ -1662,9 +1663,9 @@ Generates audio from the input text.
```csharp
var api = new OpenAIClient();
var request = new SpeechRequest("Hello world!");
var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request);
audioSource.PlayOneShot(clip);
Debug.Log(path);
var speechClip = await api.AudioEndpoint.GetSpeechAsync(request);
audioSource.PlayOneShot(speechClip);
Debug.Log(speechClip);
```

##### [Stream Speech]
Expand All @@ -1673,11 +1674,17 @@ Generate streamed audio from the input text.

```csharp
var api = new OpenAIClient();
var request = new SpeechRequest("Hello world!");
var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip));
Debug.Log(path);
var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM);
var speechClip = await api.AudioEndpoint.GetSpeechAsync(request, partialClip =>
{
audioSource.PlayOneShot(partialClip);
});
Debug.Log(speechClip);
```

> [!NOTE]
> Check out any of the demo scenes for best practices on how to handle playback with `OnAudioFilterRead`.
#### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)

Transcribes audio into the input language.
Expand Down
81 changes: 24 additions & 57 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
using System.Threading;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Networking;
using Utilities.WebRequestRest;

namespace OpenAI.Audio
Expand All @@ -27,25 +26,29 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { }

private static readonly object mutex = new();

/// <summary>
/// Generates audio from the input text.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioClip"/> and the cached path.</returns>
[Function("Generates audio from the input text.")]
/// <summary>
/// Obsolete. Generates audio from the input text by calling <see cref="GetSpeechAsync"/> without a partial clip callback.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>The cached path and the <see cref="AudioClip"/> taken from the resulting <see cref="SpeechClip"/>.</returns>
[Obsolete("use GetSpeechAsync")]
public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default)
{
    // Call GetSpeechAsync directly with no callback. Routing a null callback through
    // CreateSpeechStreamAsync would wrap it in a lambda that dereferences it.
    var result = await GetSpeechAsync(request, null, cancellationToken);
    return Tuple.Create(result.CachePath, result.AudioClip);
}

/// <summary>
/// Obsolete. Generates audio from the input text by calling <see cref="GetSpeechAsync"/>,
/// adapting the <see cref="AudioClip"/> based callback to the <see cref="SpeechClip"/> based one.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback. May be null.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>The cached path and the <see cref="AudioClip"/> taken from the resulting <see cref="SpeechClip"/>.</returns>
[Obsolete("use GetSpeechAsync")]
public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
{
    // Only hand GetSpeechAsync a callback when the caller actually supplied one.
    // GetSpeechAsync branches on whether its callback is null, and an unconditional
    // wrapper would also dereference a null partialClipCallback when invoked.
    Action<SpeechClip> speechClipCallback = null;

    if (partialClipCallback != null)
    {
        speechClipCallback = speechClip => partialClipCallback.Invoke(speechClip.AudioClip);
    }

    var result = await GetSpeechAsync(request, speechClipCallback, cancellationToken);
    return Tuple.Create(result.CachePath, result.AudioClip);
}

/// <summary>
/// Generates streaming audio from the input text.
/// Generates audio from the input text.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback used to stream audio.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="SpeechClip"/> callback used to stream audio.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioClip"/> and the cached path.</returns>
[Function("Generates streaming audio from the input text.")]
public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
/// <returns><see cref="SpeechClip"/></returns>
[Function("Generates audio from the input text.")]
public async Task<SpeechClip> GetSpeechAsync(SpeechRequest request, Action<SpeechClip> partialClipCallback = null, CancellationToken cancellationToken = default)
{
if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM)
{
Expand All @@ -70,52 +73,16 @@ public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechReques

Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);

if (request.ResponseFormat == SpeechResponseFormat.PCM)
{
var part = 0;
var response = await Rest.PostAsync(
GetUrl("/speech"),
payload,
StreamCallback,
eventChunkSize: 8192,
new RestParameters(client.DefaultRequestHeaders),
cancellationToken);
response.Validate(EnableDebug);
var samples = Utilities.Audio.PCMEncoder.Decode(response.Data);
await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true);
return new Tuple<string, AudioClip>(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false));

void StreamCallback(Response partialResponse)
{
var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data);
var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false);

if (!partialClip.SetData(chunk, 0))
{
Debug.LogError("Failed to set pcm data to partial clip.");
return;
}

partialClipCallback?.Invoke(partialClip);
}
}
var part = 0;
var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, StreamCallback, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
pcmResponse.Validate(EnableDebug);
await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true);
return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory<byte>(pcmResponse.Data));

var audioFormat = request.ResponseFormat switch
void StreamCallback(Response partialResponse)
{
SpeechResponseFormat.MP3 => AudioType.MPEG,
SpeechResponseFormat.WAV => AudioType.WAV,
_ => throw new NotSupportedException(request.ResponseFormat.ToString())
};

var clip = await Rest.DownloadAudioClipAsync(
GetUrl("/speech"),
audioFormat,
UnityWebRequest.kHttpVerbPOST,
clipName,
payload,
parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug),
cancellationToken: cancellationToken);
return new Tuple<string, AudioClip>(cachedPath, clip);
partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data));
}
}

/// <summary>
Expand Down
56 changes: 56 additions & 0 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System;
using UnityEngine;
using UnityEngine.Scripting;
using Utilities.Audio;

namespace OpenAI.Audio
{
/// <summary>
/// Audio data returned for a speech request, together with the name and cache path it was created with.
/// </summary>
[Preserve]
public sealed class SpeechClip
{
    // Sample rate that AudioSamples are resampled to and that AudioClip is created with.
    private const int OutputSampleRate = 44100;

    // Lazily populated caches. AudioData and SampleRate are get-only, so the
    // decoded samples only need to be computed once per instance.
    private float[] audioSamples;
    private AudioClip audioClip;

    /// <summary>
    /// Creates a new speech clip.
    /// </summary>
    /// <param name="name">Name given to the <see cref="AudioClip"/> created from this data.</param>
    /// <param name="cachePath">Path of the cached audio file. Partial (streamed) clips are created with null.</param>
    /// <param name="audioData">The encoded audio bytes.</param>
    /// <param name="sampleRate">Sample rate of <paramref name="audioData"/>. Defaults to 24000.</param>
    [Preserve]
    internal SpeechClip(string name, string cachePath, ReadOnlyMemory<byte> audioData, int sampleRate = 24000)
    {
        Name = name;
        CachePath = cachePath;
        AudioData = audioData;
        SampleRate = sampleRate;
    }

    /// <summary>
    /// The clip name.
    /// </summary>
    [Preserve]
    public string Name { get; }

    /// <summary>
    /// The path the audio was cached to, if any.
    /// </summary>
    [Preserve]
    public string CachePath { get; }

    /// <summary>
    /// The audio bytes this clip was created with.
    /// </summary>
    [Preserve]
    public ReadOnlyMemory<byte> AudioData { get; }

    /// <summary>
    /// <see cref="AudioData"/> decoded with <see cref="PCMEncoder"/> and resampled from
    /// <see cref="SampleRate"/> to 44100. Computed on first access and reused afterwards.
    /// </summary>
    [Preserve]
    public float[] AudioSamples
        => audioSamples ??= PCMEncoder.Resample(PCMEncoder.Decode(AudioData.ToArray()), SampleRate, OutputSampleRate);

    /// <summary>
    /// The sample rate of <see cref="AudioData"/>.
    /// </summary>
    [Preserve]
    public int SampleRate { get; }

    /// <summary>
    /// A single channel <see cref="UnityEngine.AudioClip"/> built from <see cref="AudioSamples"/>.
    /// Created on first access and reused, so repeated reads do not allocate a new clip each time.
    /// </summary>
    [Preserve]
    public AudioClip AudioClip
    {
        get
        {
            // UnityEngine.Object overloads ==, so this is also true when a previously
            // cached clip has been destroyed; in that case a fresh clip is created.
            if (audioClip == null)
            {
                var samples = AudioSamples;
                audioClip = AudioClip.Create(Name, samples.Length, 1, OutputSampleRate, false);
                audioClip.SetData(samples, 0);
            }

            return audioClip;
        }
    }

    /// <summary>
    /// Converts to the clip's <see cref="AudioClip"/>, or null when <paramref name="clip"/> is null.
    /// </summary>
    [Preserve]
    public static implicit operator AudioClip(SpeechClip clip) => clip?.AudioClip;

    /// <summary>
    /// Converts to the clip's <see cref="CachePath"/>, or null when <paramref name="clip"/> is null.
    /// </summary>
    [Preserve]
    public static implicit operator string(SpeechClip clip) => clip?.CachePath;
}
}
11 changes: 11 additions & 0 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs.meta

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using Newtonsoft.Json;
using System;
using UnityEngine;
using UnityEngine.Scripting;
using Utilities.Audio;
Expand Down Expand Up @@ -72,6 +73,11 @@ internal ResponseAudioResponse(
[JsonProperty("delta")]
public string Delta { get; }

/// <summary>
/// The base64 <see cref="Delta"/> decoded with <see cref="PCMEncoder"/> and resampled from 24000 to 44100.
/// Returns an empty array when <see cref="Delta"/> is null or empty.
/// </summary>
[Preserve]
[JsonIgnore]
public float[] AudioSamples
    // Convert.FromBase64String throws on null, so guard against responses that carry no delta.
    => string.IsNullOrEmpty(Delta)
        ? Array.Empty<float>()
        : PCMEncoder.Resample(PCMEncoder.Decode(Convert.FromBase64String(Delta)), 24000, 44100);

[Preserve]
[JsonIgnore]
public bool IsDelta => Type.EndsWith("delta");
Expand All @@ -83,8 +89,8 @@ internal ResponseAudioResponse(
[Preserve]
public static implicit operator AudioClip(ResponseAudioResponse response)
{
var audioSamples = PCMEncoder.Decode(System.Convert.FromBase64String(response.Delta));
var audioClip = AudioClip.Create($"{response.ItemId}_{response.OutputIndex}_delta", audioSamples.Length, 1, 24000, false);
var audioSamples = response.AudioSamples;
var audioClip = AudioClip.Create($"{response.ItemId}_{response.OutputIndex}_delta", audioSamples.Length, 1, 44100, false);
audioClip.SetData(audioSamples, 0);
return audioClip;
}
Expand Down
Loading

0 comments on commit b480299

Please sign in to comment.