com.openai.unity 7.7.6 (#210)
- Added support for streaming text to speech
  - Added AudioEndpoint.CreateSpeechStreamAsync(SpeechRequest, Action<AudioClip>, CancellationToken)
- Added support for Audio Transcription and Translation verbose json output
  - Added support for timestamp granularities for segments and words
  - Marked CreateTranscriptionAsync obsolete (see the migration sketch after this list)
  - Added CreateTranscriptionTextAsync
  - Added CreateTranscriptionJsonAsync
  - Marked CreateTranslationAsync obsolete
  - Added CreateTranslationTextAsync
  - Added CreateTranslationJsonAsync
- Updated SpeechResponseFormat to include wav and pcm
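
A minimal migration sketch for the renamed transcription calls (hypothetical `transcriptionAudio` clip; translation mirrors the same pattern):

```csharp
var api = new OpenAIClient();
using var request = new AudioTranscriptionRequest(transcriptionAudio);
// Obsolete as of 7.7.6:
// var text = await api.AudioEndpoint.CreateTranscriptionAsync(request);
// Replacement for plain text output:
var text = await api.AudioEndpoint.CreateTranscriptionTextAsync(request);
// For structured output, use CreateTranscriptionJsonAsync with a Json or
// Verbose_Json response format. Note a request can only be sent once:
// the endpoint disposes it after the call.
```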
StephenHodgson authored Mar 19, 2024
1 parent 052512c commit 93eed23
Showing 19 changed files with 623 additions and 65 deletions.
28 changes: 27 additions & 1 deletion OpenAI/Packages/com.openai.unity/Documentation~/README.md
@@ -103,6 +103,7 @@ The recommended installation method is through the unity package manager and [Ope
- [Json Mode](#chat-json-mode)
- [Audio](#audio)
- [Create Speech](#create-speech)
- [Stream Speech](#stream-speech) :new:
- [Create Transcription](#create-transcription)
- [Create Translation](#create-translation)
- [Images](#images)
@@ -1047,6 +1048,18 @@ var api = new OpenAIClient();
var request = new SpeechRequest("Hello world!");
var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request);
audioSource.PlayOneShot(clip);
Debug.Log(path);
```

#### [Stream Speech]

Generate streamed audio from the input text.

```csharp
var api = new OpenAIClient();
var request = new SpeechRequest("Hello world!");
var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip));
Debug.Log(path);
```
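
Note: streaming is currently only supported with the PCM response format. As the `AudioEndpoint` diff below shows, supplying a partial clip callback with any other format logs a warning and overrides the request to `SpeechResponseFormat.PCM`.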

#### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
@@ -1060,6 +1073,19 @@ var result = await api.AudioEndpoint.CreateTranscriptionAsync(request);
Debug.Log(result);
```

You can also request `verbose_json` output for detailed information, including timestamp granularities:

```csharp
var api = new OpenAIClient();
using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);

foreach (var word in response.Words)
{
Debug.Log($"[{word.Start}-{word.End}] \"{word.Word}\"");
}
```
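
Segment-level timestamps follow the same pattern. A sketch, assuming `TranscriptionSegment` exposes `Start`, `End`, and `Text` like the underlying API fields (the class itself is not shown in this diff):

```csharp
var api = new OpenAIClient();
using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Segment);
var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);

foreach (var segment in response.Segments)
{
    Debug.Log($"[{segment.Start}-{segment.End}] \"{segment.Text}\"");
}
```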

#### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation)

Translates audio into English.
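
The example for this section is collapsed in the diff; a minimal sketch mirroring the transcription pattern (the `AudioTranslationRequest` constructor argument is assumed):

```csharp
var api = new OpenAIClient();
using var request = new AudioTranslationRequest(translationAudio); // assumed ctor, mirroring AudioTranscriptionRequest
var result = await api.AudioEndpoint.CreateTranslationTextAsync(request);
Debug.Log(result);
```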
@@ -1187,7 +1213,7 @@ Returns information about a specific file.

```csharp
var api = new OpenAIClient();
var file = await GetFileInfoAsync(fileId);
var file = await api.FilesEndpoint.GetFileInfoAsync(fileId);
Debug.Log($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes");
```

160 changes: 130 additions & 30 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs
@@ -7,7 +7,6 @@
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Networking;
using UnityEngine.Scripting;
using Utilities.WebRequestRest;

namespace OpenAI.Audio
@@ -18,21 +17,6 @@ namespace OpenAI.Audio
/// </summary>
public sealed class AudioEndpoint : OpenAIBaseEndpoint
{
[Preserve]
private class AudioResponse
{
[Preserve]
[JsonConstructor]
public AudioResponse([JsonProperty("text")] string text)
{
Text = text;
}

[Preserve]
[JsonProperty("text")]
public string Text { get; }
}

internal AudioEndpoint(OpenAIClient client) : base(client) { }

/// <inheritdoc />
@@ -48,25 +32,78 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { }
/// <returns><see cref="AudioClip"/> and the cached path.</returns>
[Function("Generates audio from the input text.")]
public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default)
=> await CreateSpeechStreamAsync(request, null, cancellationToken);

/// <summary>
/// Generates streaming audio from the input text.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback used to stream audio.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioClip"/> and the cached path.</returns>
[Function("Generates streaming audio from the input text.")]
public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
{
            var audioFormat = request.ResponseFormat switch
            {
                SpeechResponseFormat.MP3 => AudioType.MPEG,
                _ => throw new NotSupportedException(request.ResponseFormat.ToString())
            };
            if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM)
            {
                Debug.LogWarning("Speech streaming only supported with PCM response format. Overriding to PCM...");
                request.ResponseFormat = SpeechResponseFormat.PCM;
            }

var ext = request.ResponseFormat switch
{
SpeechResponseFormat.MP3 => "mp3",
SpeechResponseFormat.WAV => "wav",
SpeechResponseFormat.PCM => "pcm",
_ => throw new NotSupportedException(request.ResponseFormat.ToString())
};
var payload = JsonConvert.SerializeObject(request, OpenAIClient.JsonSerializationOptions);
string clipName;

lock (mutex)
{
clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssff}.{ext}";
clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssfffff}.{ext}";
}

Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);

if (request.ResponseFormat == SpeechResponseFormat.PCM)
{
var part = 0;
var response = await Rest.PostAsync(
GetUrl("/speech"),
payload,
StreamCallback,
eventChunkSize: 8192,
new RestParameters(client.DefaultRequestHeaders),
cancellationToken);
response.Validate(EnableDebug);
var samples = Utilities.Audio.PCMEncoder.Decode(response.Data);
await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true);
return new Tuple<string, AudioClip>(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false));

void StreamCallback(Response partialResponse)
{
var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data);
var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false);

if (!partialClip.SetData(chunk, 0))
{
Debug.LogError("Failed to set pcm data to partial clip.");
return;
}

partialClipCallback?.Invoke(partialClip);
}
}

var audioFormat = request.ResponseFormat switch
{
SpeechResponseFormat.MP3 => AudioType.MPEG,
SpeechResponseFormat.WAV => AudioType.WAV,
_ => throw new NotSupportedException(request.ResponseFormat.ToString())
};

var clip = await Rest.DownloadAudioClipAsync(
GetUrl("/speech"),
audioFormat,
@@ -75,17 +112,46 @@ public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest requ
payload,
parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug),
cancellationToken: cancellationToken);
Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);
return new Tuple<string, AudioClip>(cachedPath, clip);
}

[Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")]
public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
=> await CreateTranscriptionTextAsync(request, cancellationToken);

/// <summary>
/// Transcribes audio into the input language.
/// </summary>
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>The transcribed text.</returns>
public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
{
var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
return request.ResponseFormat == AudioResponseFormat.Json
? JsonConvert.DeserializeObject<AudioResponse>(response)?.Text
: response;
}

/// <summary>
/// Transcribes audio into the input language.
/// </summary>
/// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioResponse"/>.</returns>
public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
{
if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
{
throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
}

var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
return JsonConvert.DeserializeObject<AudioResponse>(response);
}

private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
{
var form = new WWWForm();
using var audioData = new MemoryStream();
@@ -111,22 +177,58 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
form.AddField("language", request.Language);
}

switch (request.TimestampGranularities)
{
case TimestampGranularity.Segment:
case TimestampGranularity.Word:
form.AddField("timestamp_granularities[]", request.TimestampGranularities.ToString().ToLower());
break;
}

request.Dispose();

var response = await Rest.PostAsync(GetUrl("/transcriptions"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
response.Validate(EnableDebug);
return responseFormat == AudioResponseFormat.Json
? JsonConvert.DeserializeObject<AudioResponse>(response.Body)?.Text
: response.Body;
return response.Body;
}

[Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")]
public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
=> await CreateTranslationTextAsync(request, cancellationToken);

/// <summary>
/// Translates audio into English.
/// </summary>
/// <param name="request"></param>
/// <param name="cancellationToken"></param>
/// <returns>The translated text.</returns>
public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
{
var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
return request.ResponseFormat == AudioResponseFormat.Json
? JsonConvert.DeserializeObject<AudioResponse>(responseAsString)?.Text
: responseAsString;
}

/// <summary>
/// Translates audio into English.
/// </summary>
/// <param name="request"></param>
/// <param name="cancellationToken"></param>
/// <returns></returns>
/// <exception cref="ArgumentException"></exception>
public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
{
if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
{
throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
}

var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
return JsonConvert.DeserializeObject<AudioResponse>(responseAsString);
}

private async Task<string> Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken)
{
var form = new WWWForm();
using var audioData = new MemoryStream();
@@ -151,9 +253,7 @@

var response = await Rest.PostAsync(GetUrl("/translations"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
response.Validate(EnableDebug);
return responseFormat == AudioResponseFormat.Json
? JsonConvert.DeserializeObject<AudioResponse>(response.Body)?.Text
: response.Body;
return response.Body;
}
}
}
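
For context on the constants in the PCM branch above: clips are created as mono at 24 kHz, so the duration of a payload follows directly from its byte count. A rough sketch of that math, assuming 16-bit samples (an assumption; the sample width is not stated in this diff):

```csharp
// Hypothetical figures for illustration only.
const int bytesReceived = 48_000;                  // raw PCM bytes from the endpoint
const int bytesPerSample = 2;                      // assumes 16-bit PCM samples
int sampleCount = bytesReceived / bytesPerSample;  // 24,000 samples
double seconds = sampleCount / 24_000d;            // 24 kHz mono => 1.0 second
```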
62 changes: 62 additions & 0 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioResponse.cs
@@ -0,0 +1,62 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using Newtonsoft.Json;
using UnityEngine.Scripting;

namespace OpenAI.Audio
{
[Preserve]
public class AudioResponse
{
[Preserve]
[JsonConstructor]
public AudioResponse(
[JsonProperty("language")] string language,
[JsonProperty("duration")] double? duration,
[JsonProperty("text")] string text,
[JsonProperty("words")] TranscriptionWord[] words,
[JsonProperty("segments")] TranscriptionSegment[] segments)
{
Language = language;
Duration = duration;
Text = text;
Words = words;
Segments = segments;
}

/// <summary>
/// The language of the input audio.
/// </summary>
[Preserve]
[JsonProperty("language")]
public string Language { get; }

/// <summary>
/// The duration of the input audio.
/// </summary>
[Preserve]
[JsonProperty("duration")]
public double? Duration { get; }

/// <summary>
/// The transcribed text.
/// </summary>
[Preserve]
[JsonProperty("text")]
public string Text { get; }

/// <summary>
/// Extracted words and their corresponding timestamps.
/// </summary>
[Preserve]
[JsonProperty("words")]
public TranscriptionWord[] Words { get; }

/// <summary>
/// Segments of the transcribed text and their corresponding details.
/// </summary>
[Preserve]
[JsonProperty("segments")]
public TranscriptionSegment[] Segments { get; }
}
}
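
A short sketch of consuming the new `AudioResponse` (assumes a `verbose_json` transcription request, as in the README example above):

```csharp
var api = new OpenAIClient();
using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json);
var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);
Debug.Log($"{response.Language} ({response.Duration}s): {response.Text}");
```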


