35 lines
1.4 KiB
JavaScript
35 lines
1.4 KiB
JavaScript
import { parseOfficeAsync } from "officeparser";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { BufferLoader } from "langchain/document_loaders/fs/buffer";
|
|
/**
|
|
* A class that extends the `BufferLoader` class. It represents a document
|
|
* loader that loads documents from PDF files.
|
|
*/
|
|
export class PPTXLoader extends BufferLoader {
|
|
constructor(filePathOrBlob) {
|
|
super(filePathOrBlob);
|
|
}
|
|
/**
|
|
* A method that takes a `raw` buffer and `metadata` as parameters and
|
|
* returns a promise that resolves to an array of `Document` instances. It
|
|
* uses the `parseOfficeAsync` function from the `officeparser` module to extract
|
|
* the raw text content from the buffer. If the extracted powerpoint content is
|
|
* empty, it returns an empty array. Otherwise, it creates a new
|
|
* `Document` instance with the extracted powerpoint content and the provided
|
|
* metadata, and returns it as an array.
|
|
* @param raw The buffer to be parsed.
|
|
* @param metadata The metadata of the document.
|
|
* @returns A promise that resolves to an array of `Document` instances.
|
|
*/
|
|
async parse(raw, metadata) {
|
|
const pptx = await parseOfficeAsync(raw, { outputErrorToConsole: true });
|
|
if (!pptx)
|
|
return [];
|
|
return [
|
|
new Document({
|
|
pageContent: pptx,
|
|
metadata,
|
|
}),
|
|
];
|
|
}
|
|
}
|