import { APIResource } from "../../../resource.js";
import * as RealtimeAPI from "./realtime.js";
import * as Shared from "../../shared.js";
import * as SessionsAPI from "./sessions.js";
import { Session as SessionsAPISession, SessionCreateParams, SessionCreateResponse, Sessions } from "./sessions.js";
export declare class Realtime extends APIResource {
  sessions: SessionsAPI.Sessions;
}
/**
 * Returned when a conversation is created. Emitted right after session creation.
 */
export interface ConversationCreatedEvent {
  /**
   * The conversation resource.
   */
  conversation: ConversationCreatedEvent.Conversation;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The event type, must be `conversation.created`.
   */
  type: 'conversation.created';
}
export declare namespace ConversationCreatedEvent {
  /**
   * The conversation resource.
   */
  interface Conversation {
    /**
     * The unique ID of the conversation.
     */
    id?: string;
    /**
     * The object type, must be `realtime.conversation`.
     */
    object?: 'realtime.conversation';
  }
}
/**
 * The item to add to the conversation.
 */
export interface ConversationItem {
  /**
   * The unique ID of the item. This can be generated by the client to help manage
   * server-side context, but is not required because the server will generate one
   * if not provided.
   */
  id?: string;
  /**
   * The arguments of the function call (for `function_call` items).
   */
  arguments?: string;
  /**
   * The ID of the function call (for `function_call` and `function_call_output`
   * items). If passed on a `function_call_output` item, the server will check that a
   * `function_call` item with the same ID exists in the conversation history.
   */
  call_id?: string;
  /**
   * The content of the message, applicable for `message` items.
   *
   * - Message items of role `system` support only `input_text` content
   * - Message items of role `user` support `input_text` and `input_audio` content
   * - Message items of role `assistant` support `text` content.
   */
  content?: Array<ConversationItemContent>;
  /**
   * The name of the function being called (for `function_call` items).
   */
  name?: string;
  /**
   * Identifier for the API object being returned - always `realtime.item`.
   */
  object?: 'realtime.item';
  /**
   * The output of the function call (for `function_call_output` items).
   */
  output?: string;
  /**
   * The role of the message sender (`user`, `assistant`, `system`), only applicable
   * for `message` items.
   */
  role?: 'user' | 'assistant' | 'system';
  /**
   * The status of the item (`completed`, `incomplete`). These have no effect on the
   * conversation, but are accepted for consistency with the
   * `conversation.item.created` event.
   */
  status?: 'completed' | 'incomplete';
  /**
   * The type of the item (`message`, `function_call`, `function_call_output`).
   */
  type?: 'message' | 'function_call' | 'function_call_output';
}
export interface ConversationItemContent {
  /**
   * ID of a previous conversation item to reference (for `item_reference` content
   * types in `response.create` events). These can reference both client and server
   * created items.
   */
  id?: string;
  /**
   * Base64-encoded audio bytes, used for `input_audio` content type.
   */
  audio?: string;
  /**
   * The text content, used for `input_text` and `text` content types.
   */
  text?: string;
  /**
   * The transcript of the audio, used for `input_audio` content type.
   */
  transcript?: string;
  /**
   * The content type (`input_text`, `input_audio`, `item_reference`, `text`).
   */
  type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
}
/**
 * Add a new Item to the Conversation's context, including messages, function
 * calls, and function call responses. This event can be used both to populate a
 * "history" of the conversation and to add new items mid-stream, but has the
 * current limitation that it cannot populate assistant audio messages.
 *
 * If successful, the server will respond with a `conversation.item.created` event;
 * otherwise an `error` event will be sent.
 */
export interface ConversationItemCreateEvent {
  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
  /**
   * The event type, must be `conversation.item.create`.
   */
  type: 'conversation.item.create';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
  /**
   * The ID of the preceding item after which the new item will be inserted. If not
   * set, the new item will be appended to the end of the conversation. If set to
   * `root`, the new item will be added to the beginning of the conversation. If set
   * to an existing ID, it allows an item to be inserted mid-conversation. If the ID
   * cannot be found, an error will be returned and the item will not be added.
   */
  previous_item_id?: string;
}
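/*
 * A minimal usage sketch (illustrative, not part of the generated declarations):
 * sending a `conversation.item.create` event over an already-connected WebSocket.
 * The `ws` connection and the JSON-stringify-and-send transport are assumptions,
 * not something this module provides.
 *
 *     const createItem: ConversationItemCreateEvent = {
 *       type: 'conversation.item.create',
 *       item: {
 *         type: 'message',
 *         role: 'user',
 *         content: [{ type: 'input_text', text: 'Hello!' }],
 *       },
 *     };
 *     ws.send(JSON.stringify(createItem)); // ws: assumed WebSocket connection
 */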
/**
 * Returned when a conversation item is created. There are several scenarios that
 * produce this event:
 *
 * - The server is generating a Response, which if successful will produce either
 *   one or two Items, which will be of type `message` (role `assistant`) or type
 *   `function_call`.
 * - The input audio buffer has been committed, either by the client or the server
 *   (in `server_vad` mode). The server will take the content of the input audio
 *   buffer and add it to a new user message Item.
 * - The client has sent a `conversation.item.create` event to add a new Item to
 *   the Conversation.
 */
export interface ConversationItemCreatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
  /**
   * The ID of the preceding item in the Conversation context, which allows the
   * client to understand the order of the conversation.
   */
  previous_item_id: string;
  /**
   * The event type, must be `conversation.item.created`.
   */
  type: 'conversation.item.created';
}
/**
 * Send this event when you want to remove any item from the conversation history.
 * The server will respond with a `conversation.item.deleted` event, unless the
 * item does not exist in the conversation history, in which case the server will
 * respond with an error.
 */
export interface ConversationItemDeleteEvent {
  /**
   * The ID of the item to delete.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.delete`.
   */
  type: 'conversation.item.delete';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Returned when an item in the conversation is deleted by the client with a
 * `conversation.item.delete` event. This event is used to synchronize the server's
 * understanding of the conversation history with the client's view.
 */
export interface ConversationItemDeletedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item that was deleted.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.deleted`.
   */
  type: 'conversation.item.deleted';
}
/**
 * This event is the output of audio transcription for user audio written to the
 * user audio buffer. Transcription begins when the input audio buffer is committed
 * by the client or server (in `server_vad` mode). Transcription runs
 * asynchronously with Response creation, so this event may come before or after
 * the Response events.
 *
 * Realtime API models accept audio natively, and thus input transcription is a
 * separate process run on a separate ASR (Automatic Speech Recognition) model,
 * currently always `whisper-1`. Thus the transcript may diverge somewhat from the
 * model's interpretation, and should be treated as a rough guide.
 */
export interface ConversationItemInputAudioTranscriptionCompletedEvent {
  /**
   * The index of the content part containing the audio.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item containing the audio.
   */
  item_id: string;
  /**
   * The transcribed text.
   */
  transcript: string;
  /**
   * The event type, must be `conversation.item.input_audio_transcription.completed`.
   */
  type: 'conversation.item.input_audio_transcription.completed';
}
/**
 * Returned when input audio transcription is configured, and a transcription
 * request for a user message failed. These events are separate from other `error`
 * events so that the client can identify the related Item.
 */
export interface ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * The index of the content part containing the audio.
   */
  content_index: number;
  /**
   * Details of the transcription error.
   */
  error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.input_audio_transcription.failed`.
   */
  type: 'conversation.item.input_audio_transcription.failed';
}
export declare namespace ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * Details of the transcription error.
   */
  interface Error {
    /**
     * Error code, if any.
     */
    code?: string;
    /**
     * A human-readable error message.
     */
    message?: string;
    /**
     * Parameter related to the error, if any.
     */
    param?: string;
    /**
     * The type of error.
     */
    type?: string;
  }
}
/**
 * Send this event to truncate a previous assistant message's audio. The server
 * will produce audio faster than realtime, so this event is useful when the user
 * interrupts to truncate audio that has already been sent to the client but not
 * yet played. This will synchronize the server's understanding of the audio with
 * the client's playback.
 *
 * Truncating audio will delete the server-side text transcript to ensure there is
 * no text in the context that hasn't been heard by the user.
 *
 * If successful, the server will respond with a `conversation.item.truncated`
 * event.
 */
export interface ConversationItemTruncateEvent {
  /**
   * Inclusive duration up to which audio is truncated, in milliseconds. If the
   * audio_end_ms is greater than the actual audio duration, the server will respond
   * with an error.
   */
  audio_end_ms: number;
  /**
   * The index of the content part to truncate. Set this to 0.
   */
  content_index: number;
  /**
   * The ID of the assistant message item to truncate. Only assistant message items
   * can be truncated.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.truncate`.
   */
  type: 'conversation.item.truncate';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
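/*
 * A minimal interruption-handling sketch (illustrative only): when the user
 * interrupts, truncate the assistant audio at the point the client has actually
 * played. `currentItemId` and `playedMs` are hypothetical client-side
 * bookkeeping; `ws` is an assumed WebSocket connection.
 *
 *     const truncate: ConversationItemTruncateEvent = {
 *       type: 'conversation.item.truncate',
 *       item_id: currentItemId, // assistant message item currently playing
 *       content_index: 0,
 *       audio_end_ms: playedMs, // audio actually heard by the user
 *     };
 *     ws.send(JSON.stringify(truncate));
 */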
/**
 * Returned when an earlier assistant audio message item is truncated by the client
 * with a `conversation.item.truncate` event. This event is used to synchronize the
 * server's understanding of the audio with the client's playback.
 *
 * This action will truncate the audio and remove the server-side text transcript
 * to ensure there is no text in the context that hasn't been heard by the user.
 */
export interface ConversationItemTruncatedEvent {
  /**
   * The duration up to which the audio was truncated, in milliseconds.
   */
  audio_end_ms: number;
  /**
   * The index of the content part that was truncated.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the assistant message item that was truncated.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.truncated`.
   */
  type: 'conversation.item.truncated';
}
/**
 * The item to add to the conversation.
 */
export interface ConversationItemWithReference {
  /**
   * For an item of type (`message` | `function_call` | `function_call_output`) this
   * field allows the client to assign the unique ID of the item. It is not required
   * because the server will generate one if not provided.
   *
   * For an item of type `item_reference`, this field is required and is a reference
   * to any item that has previously existed in the conversation.
   */
  id?: string;
  /**
   * The arguments of the function call (for `function_call` items).
   */
  arguments?: string;
  /**
   * The ID of the function call (for `function_call` and `function_call_output`
   * items). If passed on a `function_call_output` item, the server will check that a
   * `function_call` item with the same ID exists in the conversation history.
   */
  call_id?: string;
  /**
   * The content of the message, applicable for `message` items.
   *
   * - Message items of role `system` support only `input_text` content
   * - Message items of role `user` support `input_text` and `input_audio` content
   * - Message items of role `assistant` support `text` content.
   */
  content?: Array<ConversationItemContent>;
  /**
   * The name of the function being called (for `function_call` items).
   */
  name?: string;
  /**
   * Identifier for the API object being returned - always `realtime.item`.
   */
  object?: 'realtime.item';
  /**
   * The output of the function call (for `function_call_output` items).
   */
  output?: string;
  /**
   * The role of the message sender (`user`, `assistant`, `system`), only applicable
   * for `message` items.
   */
  role?: 'user' | 'assistant' | 'system';
  /**
   * The status of the item (`completed`, `incomplete`). These have no effect on the
   * conversation, but are accepted for consistency with the
   * `conversation.item.created` event.
   */
  status?: 'completed' | 'incomplete';
  /**
   * The type of the item (`message`, `function_call`, `function_call_output`,
   * `item_reference`).
   */
  type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference';
}
/**
 * Returned when an error occurs, which could be a client problem or a server
 * problem. Most errors are recoverable and the session will stay open; we
 * recommend that implementers monitor and log error messages by default.
 */
export interface ErrorEvent {
  /**
   * Details of the error.
   */
  error: ErrorEvent.Error;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The event type, must be `error`.
   */
  type: 'error';
}
export declare namespace ErrorEvent {
  /**
   * Details of the error.
   */
  interface Error {
    /**
     * A human-readable error message.
     */
    message: string;
    /**
     * The type of error (e.g., "invalid_request_error", "server_error").
     */
    type: string;
    /**
     * Error code, if any.
     */
    code?: string | null;
    /**
     * The event_id of the client event that caused the error, if applicable.
     */
    event_id?: string | null;
    /**
     * Parameter related to the error, if any.
     */
    param?: string | null;
  }
}
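/*
 * A minimal error-monitoring sketch (illustrative only): log every `error`
 * event and, when present, correlate it back to the client event that caused
 * it via `error.event_id`. The `onError` name is hypothetical.
 *
 *     function onError(event: ErrorEvent): void {
 *       const { type, code, message, event_id } = event.error;
 *       console.error(`Realtime ${type}${code ? ` (${code})` : ''}: ${message}`);
 *       if (event_id) console.error(`caused by client event ${event_id}`);
 *     }
 */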
/**
 * Send this event to append audio bytes to the input audio buffer. The audio
 * buffer is temporary storage you can write to and later commit. In Server VAD
 * mode, the audio buffer is used to detect speech and the server will decide when
 * to commit. When Server VAD is disabled, you must commit the audio buffer
 * manually.
 *
 * The client may choose how much audio to place in each event, up to a maximum of
 * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD
 * to be more responsive. Unlike most other client events, the server will not send
 * a confirmation response to this event.
 */
export interface InputAudioBufferAppendEvent {
  /**
   * Base64-encoded audio bytes. This must be in the format specified by the
   * `input_audio_format` field in the session configuration.
   */
  audio: string;
  /**
   * The event type, must be `input_audio_buffer.append`.
   */
  type: 'input_audio_buffer.append';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Send this event to clear the audio bytes in the buffer. The server will respond
 * with an `input_audio_buffer.cleared` event.
 */
export interface InputAudioBufferClearEvent {
  /**
   * The event type, must be `input_audio_buffer.clear`.
   */
  type: 'input_audio_buffer.clear';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Returned when the input audio buffer is cleared by the client with an
 * `input_audio_buffer.clear` event.
 */
export interface InputAudioBufferClearedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The event type, must be `input_audio_buffer.cleared`.
   */
  type: 'input_audio_buffer.cleared';
}
/**
 * Send this event to commit the user input audio buffer, which will create a new
 * user message item in the conversation. This event will produce an error if the
 * input audio buffer is empty. When in Server VAD mode, the client does not need
 * to send this event; the server will commit the audio buffer automatically.
 *
 * Committing the input audio buffer will trigger input audio transcription (if
 * enabled in session configuration), but it will not create a response from the
 * model. The server will respond with an `input_audio_buffer.committed` event.
 */
export interface InputAudioBufferCommitEvent {
  /**
   * The event type, must be `input_audio_buffer.commit`.
   */
  type: 'input_audio_buffer.commit';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
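/*
 * A minimal manual-commit sketch (illustrative only, for sessions with Server
 * VAD disabled, assuming a Node.js environment for `Buffer`): append
 * base64-encoded audio, then commit the buffer to create the user message item.
 * `ws` and `pcm16Chunk` are assumed to exist.
 *
 *     const append: InputAudioBufferAppendEvent = {
 *       type: 'input_audio_buffer.append',
 *       audio: Buffer.from(pcm16Chunk).toString('base64'), // pcm16Chunk: raw audio bytes
 *     };
 *     ws.send(JSON.stringify(append));
 *     const commit: InputAudioBufferCommitEvent = { type: 'input_audio_buffer.commit' };
 *     ws.send(JSON.stringify(commit));
 */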
/**
 * Returned when an input audio buffer is committed, either by the client or
 * automatically in server VAD mode. The `item_id` property is the ID of the user
 * message item that will be created; thus a `conversation.item.created` event will
 * also be sent to the client.
 */
export interface InputAudioBufferCommittedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item that will be created.
   */
  item_id: string;
  /**
   * The ID of the preceding item after which the new item will be inserted.
   */
  previous_item_id: string;
  /**
   * The event type, must be `input_audio_buffer.committed`.
   */
  type: 'input_audio_buffer.committed';
}
/**
 * Sent by the server when in `server_vad` mode to indicate that speech has been
 * detected in the audio buffer. This can happen any time audio is added to the
 * buffer (unless speech is already detected). The client may want to use this
 * event to interrupt audio playback or provide visual feedback to the user.
 *
 * The client should expect to receive an `input_audio_buffer.speech_stopped` event
 * when speech stops. The `item_id` property is the ID of the user message item
 * that will be created when speech stops and will also be included in the
 * `input_audio_buffer.speech_stopped` event (unless the client manually commits
 * the audio buffer during VAD activation).
 */
export interface InputAudioBufferSpeechStartedEvent {
  /**
   * Milliseconds from the start of all audio written to the buffer during the
   * session when speech was first detected. This will correspond to the beginning of
   * audio sent to the model, and thus includes the `prefix_padding_ms` configured in
   * the Session.
   */
  audio_start_ms: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item that will be created when speech stops.
   */
  item_id: string;
  /**
   * The event type, must be `input_audio_buffer.speech_started`.
   */
  type: 'input_audio_buffer.speech_started';
}
/**
 * Returned in `server_vad` mode when the server detects the end of speech in the
 * audio buffer. The server will also send a `conversation.item.created` event
 * with the user message item that is created from the audio buffer.
 */
export interface InputAudioBufferSpeechStoppedEvent {
  /**
   * Milliseconds since the session started when speech stopped. This will correspond
   * to the end of audio sent to the model, and thus includes the
   * `min_silence_duration_ms` configured in the Session.
   */
  audio_end_ms: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item that will be created.
   */
  item_id: string;
  /**
   * The event type, must be `input_audio_buffer.speech_stopped`.
   */
  type: 'input_audio_buffer.speech_stopped';
}
/**
 * Emitted at the beginning of a Response to indicate the updated rate limits. When
 * a Response is created some tokens will be "reserved" for the output tokens; the
 * rate limits shown here reflect that reservation, which is then adjusted
 * accordingly once the Response is completed.
 */
export interface RateLimitsUpdatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * List of rate limit information.
   */
  rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;
  /**
   * The event type, must be `rate_limits.updated`.
   */
  type: 'rate_limits.updated';
}
export declare namespace RateLimitsUpdatedEvent {
  interface RateLimit {
    /**
     * The maximum allowed value for the rate limit.
     */
    limit?: number;
    /**
     * The name of the rate limit (`requests`, `tokens`).
     */
    name?: 'requests' | 'tokens';
    /**
     * The remaining value before the limit is reached.
     */
    remaining?: number;
    /**
     * Seconds until the rate limit resets.
     */
    reset_seconds?: number;
  }
}
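/*
 * A minimal rate-limit bookkeeping sketch (illustrative only): cache the
 * remaining token budget whenever the server reports it. `remainingTokens` is
 * a hypothetical client-side variable.
 *
 *     let remainingTokens: number | undefined;
 *     function onRateLimits(event: RateLimitsUpdatedEvent): void {
 *       for (const limit of event.rate_limits) {
 *         if (limit.name === 'tokens') remainingTokens = limit.remaining;
 *       }
 *     }
 */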
/**
 * All events that the client can send to the Realtime API
 */
export type RealtimeClientEvent =
  | SessionUpdateEvent
  | InputAudioBufferAppendEvent
  | InputAudioBufferCommitEvent
  | InputAudioBufferClearEvent
  | ConversationItemCreateEvent
  | ConversationItemTruncateEvent
  | ConversationItemDeleteEvent
  | ResponseCreateEvent
  | ResponseCancelEvent;
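/*
 * A minimal typed-send helper sketch (illustrative only): constraining the
 * payload to `RealtimeClientEvent` lets the compiler reject malformed events
 * before they reach the wire. `ws` is an assumed WebSocket connection.
 *
 *     function sendClientEvent(ws: WebSocket, event: RealtimeClientEvent): void {
 *       ws.send(JSON.stringify(event));
 *     }
 *     sendClientEvent(ws, { type: 'input_audio_buffer.clear' });
 */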
/**
 * The response resource.
 */
export interface RealtimeResponse {
  /**
   * The unique ID of the response.
   */
  id?: string;
  /**
   * Which conversation the response is added to, determined by the `conversation`
   * field in the `response.create` event. If `auto`, the response will be added to
   * the default conversation and the value of `conversation_id` will be an id like
   * `conv_1234`. If `none`, the response will not be added to any conversation and
   * the value of `conversation_id` will be `null`. If responses are being triggered
   * by server VAD, the response will be added to the default conversation, thus the
   * `conversation_id` will be an id like `conv_1234`.
   */
  conversation_id?: string;
  /**
   * Maximum number of output tokens for a single assistant response, inclusive of
   * tool calls, that was used in this response.
   */
  max_output_tokens?: number | 'inf';
  /**
   * Set of 16 key-value pairs that can be attached to an object. This can be useful
   * for storing additional information about the object in a structured format, and
   * querying for objects via API or the dashboard.
   *
   * Keys are strings with a maximum length of 64 characters. Values are strings with
   * a maximum length of 512 characters.
   */
  metadata?: Shared.Metadata | null;
  /**
   * The set of modalities the model used to respond. If there are multiple
   * modalities, the model will pick one, for example if `modalities` is
   * `["text", "audio"]`, the model could be responding in either text or audio.
   */
  modalities?: Array<'text' | 'audio'>;
  /**
   * The object type, must be `realtime.response`.
   */
  object?: 'realtime.response';
  /**
   * The list of output items generated by the response.
   */
  output?: Array<ConversationItem>;
  /**
   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
   */
  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
  /**
   * The final status of the response (`completed`, `cancelled`, `failed`, or
   * `incomplete`).
   */
  status?: 'completed' | 'cancelled' | 'failed' | 'incomplete';
  /**
   * Additional details about the status.
   */
  status_details?: RealtimeResponseStatus;
  /**
   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
   */
  temperature?: number;
  /**
   * Usage statistics for the Response; this will correspond to billing. A Realtime
   * API session will maintain a conversation context and append new Items to the
   * Conversation, thus output from previous turns (text and audio tokens) will
   * become the input for later turns.
   */
  usage?: RealtimeResponseUsage;
  /**
   * The voice the model used to respond. Current voice options are `alloy`, `ash`,
   * `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
   */
  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
/**
 * Additional details about the status.
 */
export interface RealtimeResponseStatus {
  /**
   * A description of the error that caused the response to fail, populated when the
   * `status` is `failed`.
   */
  error?: RealtimeResponseStatus.Error;
  /**
   * The reason the Response did not complete. For a `cancelled` Response, one of
   * `turn_detected` (the server VAD detected a new start of speech) or
   * `client_cancelled` (the client sent a cancel event). For an `incomplete`
   * Response, one of `max_output_tokens` or `content_filter` (the server-side safety
   * filter activated and cut off the response).
   */
  reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';
  /**
   * The type of error that caused the response to fail, corresponding with the
   * `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
   */
  type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';
}
export declare namespace RealtimeResponseStatus {
  /**
   * A description of the error that caused the response to fail, populated when the
   * `status` is `failed`.
   */
  interface Error {
    /**
     * Error code, if any.
     */
    code?: string;
    /**
     * The type of error.
     */
    type?: string;
  }
}
/**
 * Usage statistics for the Response; this will correspond to billing. A Realtime
 * API session will maintain a conversation context and append new Items to the
 * Conversation, thus output from previous turns (text and audio tokens) will
 * become the input for later turns.
 */
export interface RealtimeResponseUsage {
  /**
   * Details about the input tokens used in the Response.
   */
  input_token_details?: RealtimeResponseUsage.InputTokenDetails;
  /**
   * The number of input tokens used in the Response, including text and audio
   * tokens.
   */
  input_tokens?: number;
  /**
   * Details about the output tokens used in the Response.
   */
  output_token_details?: RealtimeResponseUsage.OutputTokenDetails;
  /**
   * The number of output tokens sent in the Response, including text and audio
   * tokens.
   */
  output_tokens?: number;
  /**
   * The total number of tokens in the Response including input and output text and
   * audio tokens.
   */
  total_tokens?: number;
}
export declare namespace RealtimeResponseUsage {
  /**
   * Details about the input tokens used in the Response.
   */
  interface InputTokenDetails {
    /**
     * The number of audio tokens used in the Response.
     */
    audio_tokens?: number;
    /**
     * The number of cached tokens used in the Response.
     */
    cached_tokens?: number;
    /**
     * The number of text tokens used in the Response.
     */
    text_tokens?: number;
  }
  /**
   * Details about the output tokens used in the Response.
   */
  interface OutputTokenDetails {
    /**
     * The number of audio tokens used in the Response.
     */
    audio_tokens?: number;
    /**
     * The number of text tokens used in the Response.
     */
    text_tokens?: number;
  }
}
/**
 * All events that the Realtime API can send back
 */
export type RealtimeServerEvent =
  | ErrorEvent
  | SessionCreatedEvent
  | SessionUpdatedEvent
  | ConversationCreatedEvent
  | InputAudioBufferCommittedEvent
  | InputAudioBufferClearedEvent
  | InputAudioBufferSpeechStartedEvent
  | InputAudioBufferSpeechStoppedEvent
  | ConversationItemCreatedEvent
  | ConversationItemInputAudioTranscriptionCompletedEvent
  | ConversationItemInputAudioTranscriptionFailedEvent
  | ConversationItemTruncatedEvent
  | ConversationItemDeletedEvent
  | ResponseCreatedEvent
  | ResponseDoneEvent
  | ResponseOutputItemAddedEvent
  | ResponseOutputItemDoneEvent
  | ResponseContentPartAddedEvent
  | ResponseContentPartDoneEvent
  | ResponseTextDeltaEvent
  | ResponseTextDoneEvent
  | ResponseAudioTranscriptDeltaEvent
  | ResponseAudioTranscriptDoneEvent
  | ResponseAudioDeltaEvent
  | ResponseAudioDoneEvent
  | ResponseFunctionCallArgumentsDeltaEvent
  | ResponseFunctionCallArgumentsDoneEvent
  | RateLimitsUpdatedEvent;
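/*
 * A minimal dispatch sketch (illustrative only): every server event carries a
 * literal `type`, so a `switch` narrows the union automatically.
 *
 *     let text = '';
 *     function onServerEvent(raw: string): void {
 *       const event = JSON.parse(raw) as RealtimeServerEvent;
 *       switch (event.type) {
 *         case 'response.text.delta':
 *           text += event.delta; // narrowed to ResponseTextDeltaEvent
 *           break;
 *         case 'response.done':
 *           console.log(text, event.response.status); // narrowed to ResponseDoneEvent
 *           break;
 *         // ...handle the remaining event types as needed
 *       }
 *     }
 */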
/**
 * Returned when the model-generated audio is updated.
 */
export interface ResponseAudioDeltaEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * Base64-encoded audio data delta.
   */
  delta: string;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.audio.delta`.
   */
  type: 'response.audio.delta';
}
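/*
 * A minimal audio-delta sketch (illustrative only, assuming a Node.js
 * environment for `Buffer`): each delta is a base64 chunk in the session's
 * `output_audio_format`; decode it and queue it for playback. `playbackQueue`
 * is a hypothetical client-side buffer.
 *
 *     const playbackQueue: Uint8Array[] = [];
 *     function onAudioDelta(event: ResponseAudioDeltaEvent): void {
 *       playbackQueue.push(new Uint8Array(Buffer.from(event.delta, 'base64')));
 *     }
 */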
/**
 * Returned when the model-generated audio is done. Also emitted when a Response is
 * interrupted, incomplete, or cancelled.
 */
export interface ResponseAudioDoneEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.audio.done`.
   */
  type: 'response.audio.done';
}
/**
 * Returned when the model-generated transcription of audio output is updated.
 */
export interface ResponseAudioTranscriptDeltaEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The transcript delta.
   */
  delta: string;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.audio_transcript.delta`.
   */
  type: 'response.audio_transcript.delta';
}
/**
 * Returned when the model-generated transcription of audio output is done
 * streaming. Also emitted when a Response is interrupted, incomplete, or
 * cancelled.
 */
export interface ResponseAudioTranscriptDoneEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The final transcript of the audio.
   */
  transcript: string;
  /**
   * The event type, must be `response.audio_transcript.done`.
   */
  type: 'response.audio_transcript.done';
}
/**
 * Send this event to cancel an in-progress response. The server will respond with
 * a `response.cancelled` event or an error if there is no response to cancel.
 */
export interface ResponseCancelEvent {
  /**
   * The event type, must be `response.cancel`.
   */
  type: 'response.cancel';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
  /**
   * A specific response ID to cancel - if not provided, will cancel an in-progress
   * response in the default conversation.
   */
  response_id?: string;
}
/**
 * Returned when a new content part is added to an assistant message item during
 * response generation.
 */
export interface ResponseContentPartAddedEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item to which the content part was added.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The content part that was added.
   */
  part: ResponseContentPartAddedEvent.Part;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.content_part.added`.
   */
  type: 'response.content_part.added';
}
export declare namespace ResponseContentPartAddedEvent {
  /**
   * The content part that was added.
   */
  interface Part {
    /**
     * Base64-encoded audio data (if type is "audio").
     */
    audio?: string;
    /**
     * The text content (if type is "text").
     */
    text?: string;
    /**
     * The transcript of the audio (if type is "audio").
     */
    transcript?: string;
    /**
     * The content type ("text", "audio").
     */
    type?: 'text' | 'audio';
  }
}
/**
 * Returned when a content part is done streaming in an assistant message item.
 * Also emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseContentPartDoneEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The content part that is done.
   */
  part: ResponseContentPartDoneEvent.Part;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.content_part.done`.
   */
  type: 'response.content_part.done';
}
export declare namespace ResponseContentPartDoneEvent {
  /**
   * The content part that is done.
   */
  interface Part {
    /**
     * Base64-encoded audio data (if type is "audio").
     */
    audio?: string;
    /**
     * The text content (if type is "text").
     */
    text?: string;
    /**
     * The transcript of the audio (if type is "audio").
     */
    transcript?: string;
    /**
     * The content type ("text", "audio").
     */
    type?: 'text' | 'audio';
  }
}
/**
 * This event instructs the server to create a Response, which means triggering
 * model inference. When in Server VAD mode, the server will create Responses
 * automatically.
 *
 * A Response will include at least one Item, and may have two, in which case the
 * second will be a function call. These Items will be appended to the conversation
 * history.
 *
 * The server will respond with a `response.created` event, events for Items and
 * content created, and finally a `response.done` event to indicate the Response is
 * complete.
 *
 * The `response.create` event includes inference configuration like
 * `instructions` and `temperature`. These fields will override the Session's
 * configuration for this Response only.
 */
export interface ResponseCreateEvent {
  /**
   * The event type, must be `response.create`.
   */
  type: 'response.create';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
  /**
   * Create a new Realtime response with these parameters
   */
  response?: ResponseCreateEvent.Response;
}
export declare namespace ResponseCreateEvent {
  /**
   * Create a new Realtime response with these parameters
   */
  interface Response {
    /**
     * Controls which conversation the response is added to. Currently supports `auto`
     * and `none`, with `auto` as the default value. The `auto` value means that the
     * contents of the response will be added to the default conversation. Set this to
     * `none` to create an out-of-band response which will not add items to the
     * default conversation.
     */
    conversation?: (string & {}) | 'auto' | 'none';
    /**
     * Input items to include in the prompt for the model. Using this field creates a
     * new context for this Response instead of using the default conversation. An
     * empty array `[]` will clear the context for this Response. Note that this can
     * include references to items from the default conversation.
     */
    input?: Array<RealtimeAPI.ConversationItemWithReference>;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be useful
     * for storing additional information about the object in a structured format, and
     * querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are strings with
     * a maximum length of 512 characters.
     */
    metadata?: Shared.Metadata | null;
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function, like `{"type": "function", "function": {"name": "my_function"}}`.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<Response.Tool>;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
     */
    voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }
  namespace Response {
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
  }
}
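/*
 * A minimal out-of-band response sketch (illustrative only): request a
 * text-only response that is not added to the default conversation, with one
 * function tool available. The `get_weather` tool is hypothetical; `ws` is an
 * assumed WebSocket connection.
 *
 *     const create: ResponseCreateEvent = {
 *       type: 'response.create',
 *       response: {
 *         conversation: 'none',
 *         modalities: ['text'],
 *         instructions: 'Answer as briefly as possible.',
 *         tools: [{
 *           type: 'function',
 *           name: 'get_weather',
 *           description: 'Look up the current weather for a city.',
 *           parameters: { type: 'object', properties: { city: { type: 'string' } } },
 *         }],
 *       },
 *     };
 *     ws.send(JSON.stringify(create));
 */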
/**
 * Returned when a new Response is created. The first event of response creation,
 * where the response is in an initial state of `in_progress`.
 */
export interface ResponseCreatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The response resource.
   */
  response: RealtimeResponse;
  /**
   * The event type, must be `response.created`.
   */
  type: 'response.created';
}
/**
 * Returned when a Response is done streaming. Always emitted, no matter the final
 * state. The Response object included in the `response.done` event will include
 * all output Items in the Response but will omit the raw audio data.
 */
export interface ResponseDoneEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The response resource.
   */
  response: RealtimeResponse;
  /**
   * The event type, must be `response.done`.
   */
  type: 'response.done';
}
/**
 * Returned when the model-generated function call arguments are updated.
 */
export interface ResponseFunctionCallArgumentsDeltaEvent {
  /**
   * The ID of the function call.
   */
  call_id: string;
  /**
   * The arguments delta as a JSON string.
   */
  delta: string;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the function call item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.function_call_arguments.delta`.
   */
  type: 'response.function_call_arguments.delta';
}
/**
 * Returned when the model-generated function call arguments are done streaming.
 * Also emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseFunctionCallArgumentsDoneEvent {
  /**
   * The final arguments as a JSON string.
   */
  arguments: string;
  /**
   * The ID of the function call.
   */
  call_id: string;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the function call item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.function_call_arguments.done`.
   */
  type: 'response.function_call_arguments.done';
}
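/*
 * A minimal function-call round-trip sketch (illustrative only): when the
 * arguments finish streaming, run the tool, return the result as a
 * `function_call_output` item, then ask for a new response. `ws` and `runTool`
 * are assumptions, not part of this module.
 *
 *     function onFunctionCallDone(event: ResponseFunctionCallArgumentsDoneEvent): void {
 *       const args = JSON.parse(event.arguments);
 *       const result = runTool(args); // hypothetical tool dispatcher
 *       const output: ConversationItemCreateEvent = {
 *         type: 'conversation.item.create',
 *         item: {
 *           type: 'function_call_output',
 *           call_id: event.call_id,
 *           output: JSON.stringify(result),
 *         },
 *       };
 *       ws.send(JSON.stringify(output));
 *       const continueResponse: ResponseCreateEvent = { type: 'response.create' };
 *       ws.send(JSON.stringify(continueResponse));
 *     }
 */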
/**
 * Returned when a new Item is created during Response generation.
 */
export interface ResponseOutputItemAddedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
  /**
   * The index of the output item in the Response.
   */
  output_index: number;
  /**
   * The ID of the Response to which the item belongs.
   */
  response_id: string;
  /**
   * The event type, must be `response.output_item.added`.
   */
  type: 'response.output_item.added';
}
/**
 * Returned when an Item is done streaming. Also emitted when a Response is
 * interrupted, incomplete, or cancelled.
 */
export interface ResponseOutputItemDoneEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
  /**
   * The index of the output item in the Response.
   */
  output_index: number;
  /**
   * The ID of the Response to which the item belongs.
   */
  response_id: string;
  /**
   * The event type, must be `response.output_item.done`.
   */
  type: 'response.output_item.done';
}
/**
 * Returned when the text value of a "text" content part is updated.
 */
export interface ResponseTextDeltaEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The text delta.
   */
  delta: string;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The event type, must be `response.text.delta`.
   */
  type: 'response.text.delta';
}
/**
 * Returned when the text value of a "text" content part is done streaming. Also
 * emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseTextDoneEvent {
  /**
   * The index of the content part in the item's content array.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item.
   */
  item_id: string;
  /**
   * The index of the output item in the response.
   */
  output_index: number;
  /**
   * The ID of the response.
   */
  response_id: string;
  /**
   * The final text content.
   */
  text: string;
  /**
   * The event type, must be `response.text.done`.
   */
  type: 'response.text.done';
}
/**
 * Returned when a Session is created. Emitted automatically when a new connection
 * is established as the first server event. This event will contain the default
 * Session configuration.
 */
export interface SessionCreatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * Realtime session object configuration.
   */
  session: SessionsAPI.Session;
  /**
   * The event type, must be `session.created`.
   */
  type: 'session.created';
}
/**
 * Send this event to update the session's default configuration. The client may
 * send this event at any time to update the session configuration, and any field
 * may be updated at any time, except for "voice". The server will respond with a
 * `session.updated` event that shows the full effective configuration. Only fields
 * that are present are updated; thus the correct way to clear a field like
 * "instructions" is to pass an empty string.
 */
export interface SessionUpdateEvent {
  /**
   * Realtime session object configuration.
   */
  session: SessionUpdateEvent.Session;
  /**
   * The event type, must be `session.update`.
   */
  type: 'session.update';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
export declare namespace SessionUpdateEvent {
  /**
   * Realtime session object configuration.
   */
  interface Session {
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
     * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
     * (mono), and little-endian byte order.
     */
    input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as rough guidance rather than the representation
     * understood by the model. The client can optionally set the language and prompt
     * for transcription; these fields will be passed to the Whisper API.
     */
    input_audio_transcription?: Session.InputAudioTranscription;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The Realtime model used for this session.
     */
    model?: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     * For `pcm16`, output audio is sampled at a rate of 24kHz.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<Session.Tool>;
    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    turn_detection?: Session.TurnDetection;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
     */
    voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }
  namespace Session {
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as rough guidance rather than the representation
     * understood by the model. The client can optionally set the language and prompt
     * for transcription; these fields will be passed to the Whisper API.
     */
    interface InputAudioTranscription {
      /**
       * The language of the input audio. Supplying the input language in
       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
       * format will improve accuracy and latency.
       */
      language?: string;
      /**
       * The model to use for transcription; `whisper-1` is the only currently supported
       * model.
       */
      model?: string;
      /**
       * An optional text to guide the model's style or continue a previous audio
       * segment. The
       * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
       * should match the audio language.
       */
      prompt?: string;
    }
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    interface TurnDetection {
      /**
       * Whether or not to automatically generate a response when VAD is enabled. `true`
       * by default.
       */
      create_response?: boolean;
      /**
       * Amount of audio to include before the VAD detected speech (in milliseconds).
       * Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
       * With shorter values the model will respond more quickly, but may jump in on
       * short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
       * threshold will require louder audio to activate the model, and thus might
       * perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection; only `server_vad` is currently supported.
       */
      type?: string;
    }
  }
}
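/*
 * A minimal configuration sketch (illustrative only): enable server VAD and
 * Whisper transcription in one `session.update`. Omitted fields keep their
 * current values. `ws` is an assumed WebSocket connection.
 *
 *     const update: SessionUpdateEvent = {
 *       type: 'session.update',
 *       session: {
 *         modalities: ['text', 'audio'],
 *         input_audio_format: 'pcm16',
 *         input_audio_transcription: { model: 'whisper-1', language: 'en' },
 *         turn_detection: { type: 'server_vad', threshold: 0.5, silence_duration_ms: 500 },
 *       },
 *     };
 *     ws.send(JSON.stringify(update));
 */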
/**
 * Returned when a session is updated with a `session.update` event, unless there
 * is an error.
 */
export interface SessionUpdatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * Realtime session object configuration.
   */
  session: SessionsAPI.Session;
  /**
   * The event type, must be `session.updated`.
   */
  type: 'session.updated';
}
export declare namespace Realtime {
  export {
    Sessions as Sessions,
    type SessionsAPISession as Session,
    type SessionCreateResponse as SessionCreateResponse,
    type SessionCreateParams as SessionCreateParams,
  };
}
//# sourceMappingURL=realtime.d.ts.map