180 lines
6.7 KiB
JavaScript
180 lines
6.7 KiB
JavaScript
import { AssemblyAI, } from "assemblyai";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
|
|
/**
 * Baseline options for every AssemblyAI client built in this module.
 * They are spread before the caller-supplied options, so a caller-provided
 * `userAgent` replaces this one entirely.
 */
const defaultOptions = {
  userAgent: {
    integration: {
      name: "LangChainJS",
      version: "1.0.1",
    },
  },
};
|
|
/**
 * Base class for AssemblyAI loaders.
 *
 * Resolves the API key and creates the AssemblyAI client that is stored on
 * `this.client`.
 */
class AssemblyAILoader extends BaseDocumentLoader {
  /**
   * Create a new AssemblyAI loader.
   * @param assemblyAIOptions The options to configure the AssemblyAI loader.
   * Configure the `assemblyAIOptions.apiKey` with your AssemblyAI API key, or configure it as the `ASSEMBLYAI_API_KEY` environment variable.
   * The object passed in is never modified.
   * @throws {Error} If no API key is present in the options or in the environment.
   */
  constructor(assemblyAIOptions) {
    super();
    Object.defineProperty(this, "client", {
      enumerable: true,
      configurable: true,
      writable: true,
      value: void 0,
    });
    // Resolve the key on a shallow copy so the environment fallback is not
    // written back into the caller's options object.
    const options = { ...assemblyAIOptions };
    if (!options.apiKey) {
      options.apiKey = getEnvironmentVariable("ASSEMBLYAI_API_KEY");
    }
    if (!options.apiKey) {
      throw new Error("No AssemblyAI API key provided");
    }
    // Caller options are spread last so they override the module defaults.
    this.client = new AssemblyAI({
      ...defaultOptions,
      ...options,
    });
  }
}
|
|
/**
 * Shared base for loaders that need a transcript: it either fetches an
 * existing transcript by ID or submits audio for transcription.
 */
class CreateTranscriptLoader extends AssemblyAILoader {
  /**
   * Transcribe audio or retrieve an existing transcript by its ID.
   * @param params The parameters to transcribe audio, or the ID of the transcript to retrieve.
   * A string is stored as the transcript ID; any other value is stored as the transcribe parameters.
   * @param assemblyAIOptions The options to configure the AssemblyAI loader.
   * Configure the `assemblyAIOptions.apiKey` with your AssemblyAI API key, or configure it as the `ASSEMBLYAI_API_KEY` environment variable.
   */
  constructor(params, assemblyAIOptions) {
    super(assemblyAIOptions);
    const isTranscriptId = typeof params === "string";
    Object.defineProperties(this, {
      transcribeParams: {
        value: isTranscriptId ? undefined : params,
        writable: true,
        enumerable: true,
        configurable: true,
      },
      transcriptId: {
        value: isTranscriptId ? params : undefined,
        writable: true,
        enumerable: true,
        configurable: true,
      },
    });
  }

  /**
   * Fetch the transcript for `transcriptId` when one was given; otherwise
   * submit `transcribeParams` for transcription.
   * @returns A promise that resolves to whatever the client call returns.
   * @throws {Error} If neither a transcript ID nor transcribe parameters were provided.
   */
  async transcribeOrGetTranscript() {
    const { transcriptId, transcribeParams: storedParams } = this;
    if (transcriptId) {
      return await this.client.transcripts.get(transcriptId);
    }
    if (!storedParams) {
      throw new Error("No transcript ID or transcribe parameters provided");
    }
    // Parameters that carry `audio_url` get the same value copied to `audio`.
    const request = "audio_url" in storedParams
      ? { ...storedParams, audio: storedParams.audio_url }
      : storedParams;
    return await this.client.transcripts.transcribe(request);
  }
}
|
|
/**
 * Loader that turns a whole AssemblyAI transcript into one document.
 */
export class AudioTranscriptLoader extends CreateTranscriptLoader {
  /**
   * Obtain the transcript (by transcribing or by ID lookup) and wrap it in a document.
   * @returns A promise that resolves to a one-element array whose document uses
   * the transcript's `text` as page content and the transcript object itself as metadata.
   */
  async load() {
    const transcript = await this.transcribeOrGetTranscript();
    const document = new Document({
      pageContent: transcript.text,
      metadata: transcript,
    });
    return [document];
  }
}
|
|
/**
 * Loader that produces one document per paragraph of an AssemblyAI transcript.
 */
export class AudioTranscriptParagraphsLoader extends CreateTranscriptLoader {
  /**
   * Obtain the transcript, request its paragraphs by transcript ID, and map each one to a document.
   * @returns A promise that resolves to an array of documents; each uses a paragraph's
   * `text` as page content and the paragraph object as metadata.
   */
  async load() {
    const { id } = await this.transcribeOrGetTranscript();
    const { paragraphs } = await this.client.transcripts.paragraphs(id);
    return paragraphs.map((paragraph) => {
      return new Document({ pageContent: paragraph.text, metadata: paragraph });
    });
  }
}
|
|
/**
 * Loader that produces one document per sentence of an AssemblyAI transcript.
 */
export class AudioTranscriptSentencesLoader extends CreateTranscriptLoader {
  /**
   * Obtain the transcript, request its sentences by transcript ID, and map each one to a document.
   * @returns A promise that resolves to an array of documents; each uses a sentence's
   * `text` as page content and the sentence object as metadata.
   */
  async load() {
    const { id } = await this.transcribeOrGetTranscript();
    const { sentences } = await this.client.transcripts.sentences(id);
    return sentences.map((sentence) => {
      return new Document({ pageContent: sentence.text, metadata: sentence });
    });
  }
}
|
|
/**
 * Loader that returns the subtitles of an AssemblyAI transcript in `srt` or `vtt` format.
 */
export class AudioSubtitleLoader extends CreateTranscriptLoader {
  /**
   * Create a new AudioSubtitleLoader.
   * @param params The parameters to transcribe audio, or the ID of the transcript to retrieve.
   * @param subtitleFormat The format of the subtitles, either `srt` or `vtt`. Defaults to `srt`.
   * @param assemblyAIOptions The options to configure the AssemblyAI loader.
   * Configure the `assemblyAIOptions.apiKey` with your AssemblyAI API key, or configure it as the `ASSEMBLYAI_API_KEY` environment variable.
   */
  constructor(params, subtitleFormat = "srt", assemblyAIOptions) {
    super(params, assemblyAIOptions);
    // Defined directly with its final value; no follow-up assignment is needed.
    Object.defineProperty(this, "subtitleFormat", {
      value: subtitleFormat,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }

  /**
   * Obtain the transcript and request its subtitles in the configured format.
   * @returns A promise that resolves to a one-element array whose document has
   * the subtitles as page content.
   */
  async load() {
    const { id } = await this.transcribeOrGetTranscript();
    const pageContent = await this.client.transcripts.subtitles(id, this.subtitleFormat);
    return [new Document({ pageContent })];
  }
}
|