394 lines
15 KiB
JavaScript
394 lines
15 KiB
JavaScript
"use strict";
|
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.NotionAPILoader = exports.isDatabase = exports.isPage = exports.isErrorResponse = exports.isDatabaseResponse = exports.isPageResponse = void 0;
|
|
const client_1 = require("@notionhq/client");
|
|
const notion_to_md_1 = require("notion-to-md");
|
|
const notion_js_1 = require("notion-to-md/build/utils/notion.js");
|
|
const js_yaml_1 = __importDefault(require("js-yaml"));
|
|
const documents_1 = require("@langchain/core/documents");
|
|
const async_caller_1 = require("@langchain/core/utils/async_caller");
|
|
const base_1 = require("@langchain/core/document_loaders/base");
|
|
const isPageResponse = (res) => !(0, client_1.isNotionClientError)(res) && res.object === "page";
|
|
exports.isPageResponse = isPageResponse;
|
|
const isDatabaseResponse = (res) => !(0, client_1.isNotionClientError)(res) && res.object === "database";
|
|
exports.isDatabaseResponse = isDatabaseResponse;
|
|
const isErrorResponse = (res) => (0, client_1.isNotionClientError)(res);
|
|
exports.isErrorResponse = isErrorResponse;
|
|
const isPage = (res) => (0, exports.isPageResponse)(res) && (0, client_1.isFullPage)(res);
|
|
exports.isPage = isPage;
|
|
const isDatabase = (res) => (0, exports.isDatabaseResponse)(res) && (0, client_1.isFullDatabase)(res);
|
|
exports.isDatabase = isDatabase;
|
|
/**
|
|
* A class that extends the BaseDocumentLoader class. It represents a
|
|
* document loader for loading documents from Notion using the Notion API.
|
|
* @example
|
|
* ```typescript
|
|
* const pageLoader = new NotionAPILoader({
|
|
* clientOptions: { auth: "<NOTION_INTEGRATION_TOKEN>" },
|
|
* id: "<PAGE_ID>",
|
|
* type: "page",
|
|
* });
|
|
* const pageDocs = await pageLoader.loadAndSplit();
|
|
* const dbLoader = new NotionAPILoader({
|
|
* clientOptions: { auth: "<NOTION_INTEGRATION_TOKEN>" },
|
|
* id: "<DATABASE_ID>",
|
|
* type: "database",
|
|
* propertiesAsHeader: true,
|
|
* });
|
|
* const dbDocs = await dbLoader.load();
|
|
* ```
|
|
*/
|
|
class NotionAPILoader extends base_1.BaseDocumentLoader {
|
|
constructor(options) {
|
|
super();
|
|
Object.defineProperty(this, "caller", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "notionClient", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "n2mClient", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "id", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "pageQueue", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "pageCompleted", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "pageQueueTotal", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "documents", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "rootTitle", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "onDocumentLoaded", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "propertiesAsHeader", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
this.caller = new async_caller_1.AsyncCaller({
|
|
maxConcurrency: 64,
|
|
...options.callerOptions,
|
|
});
|
|
this.notionClient = new client_1.Client({
|
|
logger: () => { },
|
|
...options.clientOptions,
|
|
});
|
|
this.n2mClient = new notion_to_md_1.NotionToMarkdown({
|
|
notionClient: this.notionClient,
|
|
config: { parseChildPages: false, convertImagesToBase64: false },
|
|
});
|
|
this.id = options.id;
|
|
this.pageQueue = [];
|
|
this.pageCompleted = [];
|
|
this.pageQueueTotal = 0;
|
|
this.documents = [];
|
|
this.rootTitle = "";
|
|
this.onDocumentLoaded = options.onDocumentLoaded ?? ((_ti, _cu) => { });
|
|
this.propertiesAsHeader = options.propertiesAsHeader || false;
|
|
}
|
|
/**
|
|
* Adds a selection of page ids to the pageQueue and removes duplicates.
|
|
* @param items An array of string ids
|
|
*/
|
|
addToQueue(...items) {
|
|
const deDuped = items.filter((item) => !this.pageCompleted.concat(this.pageQueue).includes(item));
|
|
this.pageQueue.push(...deDuped);
|
|
this.pageQueueTotal += deDuped.length;
|
|
}
|
|
/**
|
|
* Parses a Notion GetResponse object (page or database) and returns a string of the title.
|
|
* @param obj The Notion GetResponse object to parse.
|
|
* @returns The string of the title.
|
|
*/
|
|
getTitle(obj) {
|
|
if ((0, exports.isPage)(obj)) {
|
|
const titleProp = Object.values(obj.properties).find((prop) => prop.type === "title");
|
|
if (titleProp)
|
|
return this.getPropValue(titleProp);
|
|
}
|
|
if ((0, exports.isDatabase)(obj))
|
|
return obj.title
|
|
.map((v) => this.n2mClient.annotatePlainText(v.plain_text, v.annotations))
|
|
.join("");
|
|
return null;
|
|
}
|
|
/**
|
|
* Parses the property type and returns a string
|
|
* @param page The Notion page property to parse.
|
|
* @returns A string of parsed property.
|
|
*/
|
|
getPropValue(prop) {
|
|
switch (prop.type) {
|
|
case "number": {
|
|
const propNumber = prop[prop.type];
|
|
return propNumber !== null ? propNumber.toString() : "";
|
|
}
|
|
case "url":
|
|
return prop[prop.type] || "";
|
|
case "select":
|
|
return prop[prop.type]?.name ?? "";
|
|
case "multi_select":
|
|
return `[${prop[prop.type].map((v) => `"${v.name}"`).join(", ")}]`;
|
|
case "status":
|
|
return prop[prop.type]?.name ?? "";
|
|
case "date":
|
|
return `${prop[prop.type]?.start ?? ""}${prop[prop.type]?.end ? ` - ${prop[prop.type]?.end}` : ""}`;
|
|
case "email":
|
|
return prop[prop.type] || "";
|
|
case "phone_number":
|
|
return prop[prop.type] || "";
|
|
case "checkbox":
|
|
return prop[prop.type].toString();
|
|
case "files":
|
|
return `[${prop[prop.type].map((v) => `"${v.name}"`).join(", ")}]`;
|
|
case "created_by":
|
|
return `["${prop[prop.type].object}", "${prop[prop.type].id}"]`;
|
|
case "created_time":
|
|
return prop[prop.type];
|
|
case "last_edited_by":
|
|
return `["${prop[prop.type].object}", "${prop[prop.type].id}"]`;
|
|
case "last_edited_time":
|
|
return prop[prop.type];
|
|
case "title":
|
|
return prop[prop.type]
|
|
.map((v) => this.n2mClient.annotatePlainText(v.plain_text, v.annotations))
|
|
.join("");
|
|
case "rich_text":
|
|
return prop[prop.type]
|
|
.map((v) => this.n2mClient.annotatePlainText(v.plain_text, v.annotations))
|
|
.join("");
|
|
case "people":
|
|
return `[${prop[prop.type]
|
|
.map((v) => `["${v.object}", "${v.id}"]`)
|
|
.join(", ")}]`;
|
|
case "unique_id":
|
|
return `${prop[prop.type].prefix || ""}${prop[prop.type].number}`;
|
|
case "relation":
|
|
return `[${prop[prop.type].map((v) => `"${v.id}"`).join(", ")}]`;
|
|
default:
|
|
return `Unsupported type: ${prop.type}`;
|
|
}
|
|
}
|
|
/**
|
|
* Parses the properties of a Notion page and returns them as key-value
|
|
* pairs.
|
|
* @param page The Notion page to parse.
|
|
* @returns An object containing the parsed properties as key-value pairs.
|
|
*/
|
|
parsePageProperties(page) {
|
|
return Object.entries(page.properties).reduce((accum, [propName, prop]) => {
|
|
const value = this.getPropValue(prop);
|
|
const props = { ...accum, [propName]: value };
|
|
return prop.type === "title" ? { ...props, _title: value } : props;
|
|
}, {});
|
|
}
|
|
/**
|
|
* Parses the details of a Notion page and returns them as an object.
|
|
* @param page The Notion page to parse.
|
|
* @returns An object containing the parsed details of the page.
|
|
*/
|
|
parsePageDetails(page) {
|
|
const { id, ...rest } = page;
|
|
return {
|
|
...rest,
|
|
notionId: id,
|
|
properties: this.parsePageProperties(page),
|
|
};
|
|
}
|
|
/**
|
|
* Loads a Notion block and returns it as an MdBlock object.
|
|
* @param block The Notion block to load.
|
|
* @returns A Promise that resolves to an MdBlock object.
|
|
*/
|
|
async loadBlock(block) {
|
|
const mdBlock = {
|
|
type: block.type,
|
|
blockId: block.id,
|
|
parent: await this.caller.call(() => this.n2mClient.blockToMarkdown(block)),
|
|
children: [],
|
|
};
|
|
if (block.has_children) {
|
|
const block_id = block.type === "synced_block" &&
|
|
block.synced_block?.synced_from?.block_id
|
|
? block.synced_block.synced_from.block_id
|
|
: block.id;
|
|
const childBlocks = await this.loadBlocks(await this.caller.call(() => (0, notion_js_1.getBlockChildren)(this.notionClient, block_id, null)));
|
|
mdBlock.children = childBlocks;
|
|
}
|
|
return mdBlock;
|
|
}
|
|
/**
|
|
* Loads Notion blocks and their children recursively.
|
|
* @param blocksResponse The response from the Notion API containing the blocks to load.
|
|
* @returns A Promise that resolves to an array containing the loaded MdBlocks.
|
|
*/
|
|
async loadBlocks(blocksResponse) {
|
|
const blocks = blocksResponse.filter(client_1.isFullBlock);
|
|
// Add child pages to queue
|
|
const childPages = blocks
|
|
.filter((block) => block.type.includes("child_page"))
|
|
.map((block) => block.id);
|
|
if (childPages.length > 0)
|
|
this.addToQueue(...childPages);
|
|
// Add child database pages to queue
|
|
const childDatabases = blocks
|
|
.filter((block) => block.type.includes("child_database"))
|
|
.map((block) => this.caller.call(() => this.loadDatabase(block.id)));
|
|
// Load this block and child blocks
|
|
const loadingMdBlocks = blocks
|
|
.filter((block) => !["child_page", "child_database"].includes(block.type))
|
|
.map((block) => this.loadBlock(block));
|
|
const [mdBlocks] = await Promise.all([
|
|
Promise.all(loadingMdBlocks),
|
|
Promise.all(childDatabases),
|
|
]);
|
|
return mdBlocks;
|
|
}
|
|
/**
|
|
* Loads a Notion page and its child documents, then adds it to the completed documents array.
|
|
* @param page The Notion page or page ID to load.
|
|
*/
|
|
async loadPage(page) {
|
|
// Check page is a page ID or a PageObjectResponse
|
|
const [pageData, pageId] = typeof page === "string"
|
|
? [
|
|
this.caller.call(() => this.notionClient.pages.retrieve({ page_id: page })),
|
|
page,
|
|
]
|
|
: [page, page.id];
|
|
const [pageDetails, pageBlocks] = await Promise.all([
|
|
pageData,
|
|
this.caller.call(() => (0, notion_js_1.getBlockChildren)(this.notionClient, pageId, null)),
|
|
]);
|
|
if (!(0, client_1.isFullPage)(pageDetails)) {
|
|
this.pageCompleted.push(pageId);
|
|
return;
|
|
}
|
|
const mdBlocks = await this.loadBlocks(pageBlocks);
|
|
const mdStringObject = this.n2mClient.toMarkdownString(mdBlocks);
|
|
let pageContent = mdStringObject.parent;
|
|
const metadata = this.parsePageDetails(pageDetails);
|
|
if (this.propertiesAsHeader) {
|
|
pageContent =
|
|
`---\n` +
|
|
`${js_yaml_1.default.dump(metadata.properties)}` +
|
|
`---\n\n` +
|
|
`${pageContent ?? ""}`;
|
|
}
|
|
if (!pageContent) {
|
|
this.pageCompleted.push(pageId);
|
|
return;
|
|
}
|
|
const pageDocument = new documents_1.Document({ pageContent, metadata });
|
|
this.documents.push(pageDocument);
|
|
this.pageCompleted.push(pageId);
|
|
this.onDocumentLoaded(this.documents.length, this.pageQueueTotal, this.getTitle(pageDetails) || undefined, this.rootTitle);
|
|
}
|
|
/**
|
|
* Loads a Notion database and adds it's pages to the queue.
|
|
* @param id The ID of the Notion database to load.
|
|
*/
|
|
async loadDatabase(id) {
|
|
try {
|
|
for await (const page of (0, client_1.iteratePaginatedAPI)(this.notionClient.databases.query, {
|
|
database_id: id,
|
|
page_size: 50,
|
|
})) {
|
|
this.addToQueue(page.id);
|
|
}
|
|
}
|
|
catch (e) {
|
|
console.log(e);
|
|
// TODO: Catch and report api request errors
|
|
}
|
|
}
|
|
/**
|
|
* Loads the documents from Notion based on the specified options.
|
|
* @returns A Promise that resolves to an array of Documents.
|
|
*/
|
|
async load() {
|
|
const resPagePromise = this.notionClient.pages
|
|
.retrieve({ page_id: this.id })
|
|
.then((res) => {
|
|
this.addToQueue(this.id);
|
|
return res;
|
|
})
|
|
.catch((error) => error);
|
|
const resDatabasePromise = this.notionClient.databases
|
|
.retrieve({ database_id: this.id })
|
|
.then(async (res) => {
|
|
await this.loadDatabase(this.id);
|
|
return res;
|
|
})
|
|
.catch((error) => error);
|
|
const [resPage, resDatabase] = await Promise.all([
|
|
resPagePromise,
|
|
resDatabasePromise,
|
|
]);
|
|
// Check if both resPage and resDatabase resulted in error responses
|
|
const errors = [resPage, resDatabase].filter(exports.isErrorResponse);
|
|
if (errors.length === 2) {
|
|
if (errors.every((e) => e.code === client_1.APIErrorCode.ObjectNotFound)) {
|
|
throw new AggregateError([
|
|
Error(`Could not find object with ID: ${this.id}. Make sure the relevant pages and databases are shared with your integration.`),
|
|
...errors,
|
|
]);
|
|
}
|
|
throw new AggregateError(errors);
|
|
}
|
|
this.rootTitle =
|
|
this.getTitle(resPage) || this.getTitle(resDatabase) || this.id;
|
|
let pageId = this.pageQueue.shift();
|
|
while (pageId) {
|
|
await this.loadPage(pageId);
|
|
pageId = this.pageQueue.shift();
|
|
}
|
|
return this.documents;
|
|
}
|
|
}
|
|
exports.NotionAPILoader = NotionAPILoader;
|