agsamantha/node_modules/openai/resources/audio/transcriptions.d.ts

import { APIResource } from "../../resource.js";
import * as Core from "../../core.js";
import * as TranscriptionsAPI from "./transcriptions.js";
import * as AudioAPI from "./audio.js";
/**
 * Audio transcription resource. Its single `create` method is overloaded so
 * that the type of the resolved value follows the `response_format` type
 * argument carried by the request parameters.
 */
export declare class Transcriptions extends APIResource {
    /**
     * Transcribes audio into the input language.
     *
     * Overload for a `response_format` of `'json'` or `undefined`: the promise is
     * typed as `Transcription`.
     */
    create(body: TranscriptionCreateParams<'json' | undefined>, options?: Core.RequestOptions): Core.APIPromise<Transcription>;
    /**
     * Overload for a `response_format` of `'verbose_json'`: the promise is typed
     * as `TranscriptionVerbose`.
     */
    create(body: TranscriptionCreateParams<'verbose_json'>, options?: Core.RequestOptions): Core.APIPromise<TranscriptionVerbose>;
    /**
     * Overload for the `'srt'`, `'vtt'` and `'text'` formats: the promise is typed
     * as a plain `string`.
     */
    create(body: TranscriptionCreateParams<'srt' | 'vtt' | 'text'>, options?: Core.RequestOptions): Core.APIPromise<string>;
    /**
     * Catch-all overload for parameters whose `response_format` was not narrowed
     * to one of the cases above; the promise is typed as `Transcription`.
     *
     * NOTE(review): the static type here is `Transcription` regardless of the
     * format actually requested — callers passing a non-literal format should
     * confirm the runtime shape of the result.
     */
    create(body: TranscriptionCreateParams, options?: Core.RequestOptions): Core.APIPromise<Transcription>;
}
/**
 * Basic transcription response returned by the model for the provided input.
 */
export interface Transcription {
    /** Text produced by transcribing the audio. */
    text: string;
}
/**
 * One segment of a transcription, together with its timing, decoded tokens and
 * the quality metrics reported for it.
 */
export interface TranscriptionSegment {
    /** Identifier that is unique to this segment. */
    id: number;

    /** Seek offset of the segment. */
    seek: number;

    /** Time, in seconds, at which the segment starts. */
    start: number;

    /** Time, in seconds, at which the segment ends. */
    end: number;

    /** The segment's text content. */
    text: string;

    /** Token IDs that make up the text content. */
    tokens: number[];

    /** Temperature parameter that was used while generating the segment. */
    temperature: number;

    /**
     * Average log probability of the segment. A value lower than -1 means the
     * logprobs should be considered failed.
     */
    avg_logprob: number;

    /**
     * Compression ratio of the segment. A value greater than 2.4 means the
     * compression should be considered failed.
     */
    compression_ratio: number;

    /**
     * Probability that the segment contains no speech. When this value is higher
     * than 1.0 and `avg_logprob` is below -1, consider the segment silent.
     *
     * NOTE(review): a probability cannot exceed 1.0, so the threshold as worded
     * can never be met — confirm the intended value against the API reference.
     */
    no_speech_prob: number;
}
/**
 * Verbose JSON transcription response returned by the model for the provided
 * input.
 */
export interface TranscriptionVerbose {
    /** Text produced by transcribing the audio. */
    text: string;

    /** Language of the input audio. */
    language: string;

    /**
     * Duration of the input audio.
     *
     * NOTE(review): declared as `string` here; verify against the API reference
     * whether the service actually returns a numeric value at runtime.
     */
    duration: string;

    /** Segments of the transcribed text, each with its corresponding details. */
    segments?: TranscriptionSegment[];

    /** Words extracted from the audio, each with its corresponding timestamps. */
    words?: TranscriptionWord[];
}
/**
 * A single word of a transcription together with its start and end times.
 */
export interface TranscriptionWord {
    /** Time, in seconds, at which the word starts. */
    start: number;

    /** Time, in seconds, at which the word ends. */
    end: number;

    /** Text content of the word. */
    word: string;
}
/**
 * Union of the object-shaped transcription responses: either the basic
 * `Transcription` or the more detailed `TranscriptionVerbose`.
 */
export type TranscriptionCreateResponse =
    | Transcription
    | TranscriptionVerbose;
/**
 * Request parameters for creating a transcription. The `ResponseFormat` type
 * argument records the requested `response_format`, which lets callers' types
 * depend on the chosen format.
 */
export interface TranscriptionCreateParams<ResponseFormat extends AudioAPI.AudioResponseFormat | undefined = AudioAPI.AudioResponseFormat | undefined> {
    /**
     * Audio file object to transcribe (the object itself, not a file name). It
     * must be in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav,
     * or webm.
     */
    file: Core.Uploadable;

    /**
     * ID of the model to use. Currently only `whisper-1` is available, which is
     * powered by our open source Whisper V2 model.
     */
    model: AudioAPI.AudioModel | (string & {});

    /**
     * Language of the input audio. Providing it in
     * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
     * will improve accuracy and latency.
     */
    language?: string;

    /**
     * Optional text used to guide the model's style or to continue a previous
     * audio segment. The
     * [prompt](https://platform.openai.com/docs/guides/speech-to-text/prompting)
     * should match the audio language.
     */
    prompt?: string;

    /**
     * Output format; one of `json`, `text`, `srt`, `verbose_json`, or `vtt`.
     */
    response_format?: ResponseFormat;

    /**
     * Sampling temperature, between 0 and 1. Higher values such as 0.8 make the
     * output more random; lower values such as 0.2 make it more focused and
     * deterministic. When set to 0, the model will use
     * [log probability](https://en.wikipedia.org/wiki/Log_probability) to
     * automatically increase the temperature until certain thresholds are hit.
     */
    temperature?: number;

    /**
     * Timestamp granularities to populate for this transcription. Using them
     * requires `response_format` to be set to `verbose_json`. Either or both of
     * `word` and `segment` are supported. Note: segment timestamps add no extra
     * latency, but generating word timestamps incurs additional latency.
     */
    timestamp_granularities?: ('word' | 'segment')[];
}
/**
 * Namespace sharing the name of the `Transcriptions` class, so the types below
 * are also reachable as `Transcriptions.<TypeName>`. Each entry aliases the
 * same-named export of the module imported as `TranscriptionsAPI`
 * (`./transcriptions.js`).
 */
export declare namespace Transcriptions {
    export import Transcription = TranscriptionsAPI.Transcription;
    export import TranscriptionSegment = TranscriptionsAPI.TranscriptionSegment;
    export import TranscriptionVerbose = TranscriptionsAPI.TranscriptionVerbose;
    export import TranscriptionWord = TranscriptionsAPI.TranscriptionWord;
    export import TranscriptionCreateResponse = TranscriptionsAPI.TranscriptionCreateResponse;
    // Declared as a generic type alias (rather than `export import`) and forwards
    // the `ResponseFormat` type argument, with the same constraint and default,
    // to the aliased type.
    type TranscriptionCreateParams<ResponseFormat extends AudioAPI.AudioResponseFormat | undefined = AudioAPI.AudioResponseFormat | undefined> = TranscriptionsAPI.TranscriptionCreateParams<ResponseFormat>;
}
//# sourceMappingURL=transcriptions.d.ts.map