383 lines
14 KiB
JavaScript
383 lines
14 KiB
JavaScript
|
import { Client, isFullBlock, isFullPage, iteratePaginatedAPI, APIErrorCode, isNotionClientError, isFullDatabase, } from "@notionhq/client";
|
||
|
import { NotionToMarkdown } from "notion-to-md";
|
||
|
import { getBlockChildren } from "notion-to-md/build/utils/notion.js";
|
||
|
import yaml from "js-yaml";
|
||
|
import { Document } from "@langchain/core/documents";
|
||
|
import { AsyncCaller } from "@langchain/core/utils/async_caller";
|
||
|
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
|
||
|
/**
 * Tells whether a value obtained from a Notion retrieve call describes a page.
 * @param res The resolved response, or the error captured in its place.
 * @returns `true` when `res` is not a Notion client error and its `object` field is "page".
 */
export const isPageResponse = (res) => {
    // Client errors never count as a page, whatever fields they carry.
    if (isNotionClientError(res)) {
        return false;
    }
    return res.object === "page";
};
|
||
|
/**
 * Tells whether a value obtained from a Notion retrieve call describes a database.
 * @param res The resolved response, or the error captured in its place.
 * @returns `true` when `res` is not a Notion client error and its `object` field is "database".
 */
export const isDatabaseResponse = (res) => {
    // Client errors never count as a database, whatever fields they carry.
    if (isNotionClientError(res)) {
        return false;
    }
    return res.object === "database";
};
|
||
|
/**
 * Tells whether a value obtained from a Notion call is a Notion client error.
 * @param res The resolved response, or the error captured in its place.
 * @returns Whatever `isNotionClientError` reports for `res`.
 */
export const isErrorResponse = (res) => {
    const failed = isNotionClientError(res);
    return failed;
};
|
||
|
/**
 * Tells whether a value is a page response that also passes the `isFullPage` check.
 * @param res The resolved response, or the error captured in its place.
 * @returns The `isPageResponse` result when it is falsy, otherwise the `isFullPage` result.
 */
export const isPage = (res) => {
    const pageLike = isPageResponse(res);
    // isFullPage is only consulted once the value is known to be a page response.
    return pageLike ? isFullPage(res) : pageLike;
};
|
||
|
/**
 * Tells whether a value is a database response that also passes the `isFullDatabase` check.
 * @param res The resolved response, or the error captured in its place.
 * @returns The `isDatabaseResponse` result when it is falsy, otherwise the `isFullDatabase` result.
 */
export const isDatabase = (res) => {
    const databaseLike = isDatabaseResponse(res);
    // isFullDatabase is only consulted once the value is known to be a database response.
    return databaseLike ? isFullDatabase(res) : databaseLike;
};
|
||
|
/**
 * A class that extends the BaseDocumentLoader class. It represents a
 * document loader for loading documents from Notion using the Notion API.
 * @example
 * ```typescript
 * const pageLoader = new NotionAPILoader({
 *   clientOptions: { auth: "<NOTION_INTEGRATION_TOKEN>" },
 *   id: "<PAGE_ID>",
 *   type: "page",
 * });
 * const pageDocs = await pageLoader.loadAndSplit();
 * const dbLoader = new NotionAPILoader({
 *   clientOptions: { auth: "<NOTION_INTEGRATION_TOKEN>" },
 *   id: "<DATABASE_ID>",
 *   type: "database",
 *   propertiesAsHeader: true,
 * });
 * const dbDocs = await dbLoader.load();
 * ```
 */
export class NotionAPILoader extends BaseDocumentLoader {
    /**
     * @param options Loader options. Fields read here: `id` (page or database
     * id to start from), `clientOptions` (spread into the Notion `Client`
     * options), `callerOptions` (spread into the `AsyncCaller` options),
     * `onDocumentLoaded` (progress callback, defaults to a no-op) and
     * `propertiesAsHeader` (prepend page properties as a YAML header,
     * defaults to `false`).
     */
    constructor(options) {
        super();
        // Declare every instance field up front as an own, enumerable,
        // configurable, writable property, in a fixed order.
        const fieldNames = [
            "caller",
            "notionClient",
            "n2mClient",
            "id",
            "pageQueue",
            "pageCompleted",
            "pageQueueTotal",
            "documents",
            "rootTitle",
            "onDocumentLoaded",
            "propertiesAsHeader",
        ];
        for (const fieldName of fieldNames) {
            Object.defineProperty(this, fieldName, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0
            });
        }
        // Caller-supplied options are spread last so they override the defaults.
        this.caller = new AsyncCaller({
            maxConcurrency: 64,
            ...options.callerOptions,
        });
        this.notionClient = new Client({
            logger: () => { },
            ...options.clientOptions,
        });
        this.n2mClient = new NotionToMarkdown({
            notionClient: this.notionClient,
            config: { parseChildPages: false, convertImagesToBase64: false },
        });
        this.id = options.id;
        this.pageQueue = [];
        this.pageCompleted = [];
        this.pageQueueTotal = 0;
        this.documents = [];
        this.rootTitle = "";
        this.onDocumentLoaded = options.onDocumentLoaded ?? ((_ti, _cu) => { });
        this.propertiesAsHeader = options.propertiesAsHeader || false;
    }
    /**
     * Adds a selection of page ids to the pageQueue and removes duplicates.
     * An id is skipped when it is already completed, already queued, or
     * repeated earlier in the same call, so `pageQueueTotal` counts each
     * page once.
     * @param items An array of string ids
     */
    addToQueue(...items) {
        // Build the lookup once instead of re-concatenating both lists per item.
        const known = new Set([...this.pageCompleted, ...this.pageQueue]);
        const fresh = [];
        for (const item of items) {
            if (!known.has(item)) {
                known.add(item);
                fresh.push(item);
            }
        }
        this.pageQueue.push(...fresh);
        this.pageQueueTotal += fresh.length;
    }
    /**
     * Parses a Notion GetResponse object (page or database) and returns a string of the title.
     * @param obj The Notion GetResponse object to parse.
     * @returns The string of the title, or `null` when `obj` is neither a
     * full page with a title property nor a full database.
     */
    getTitle(obj) {
        if (isPage(obj)) {
            const titleProp = Object.values(obj.properties).find((prop) => prop.type === "title");
            if (titleProp) {
                return this.getPropValue(titleProp);
            }
        }
        if (isDatabase(obj)) {
            return obj.title
                .map((v) => this.n2mClient.annotatePlainText(v.plain_text, v.annotations))
                .join("");
        }
        return null;
    }
    /**
     * Parses the property type and returns a string
     * @param prop The Notion page property to parse.
     * @returns A string of parsed property. Property types without a
     * dedicated case yield `Unsupported type: <type>`.
     */
    getPropValue(prop) {
        // The payload of a property lives under a key equal to its type.
        const value = prop[prop.type];
        switch (prop.type) {
            case "number":
                return value !== null ? value.toString() : "";
            case "url":
            case "email":
            case "phone_number":
                return value || "";
            case "select":
            case "status":
                return value?.name ?? "";
            case "multi_select":
            case "files":
                return `[${value.map((v) => `"${v.name}"`).join(", ")}]`;
            case "date":
                return `${value?.start ?? ""}${value?.end ? ` - ${value?.end}` : ""}`;
            case "checkbox":
                return value.toString();
            case "created_by":
            case "last_edited_by":
                return `["${value.object}", "${value.id}"]`;
            case "created_time":
            case "last_edited_time":
                return value;
            case "title":
            case "rich_text":
                return value
                    .map((v) => this.n2mClient.annotatePlainText(v.plain_text, v.annotations))
                    .join("");
            case "people":
                return `[${value
                    .map((v) => `["${v.object}", "${v.id}"]`)
                    .join(", ")}]`;
            case "unique_id":
                return `${value.prefix || ""}${value.number}`;
            case "relation":
                return `[${value.map((v) => `"${v.id}"`).join(", ")}]`;
            default:
                return `Unsupported type: ${prop.type}`;
        }
    }
    /**
     * Parses the properties of a Notion page and returns them as key-value
     * pairs.
     * @param page The Notion page to parse.
     * @returns An object containing the parsed properties as key-value pairs.
     * The title property is additionally exposed under the `_title` key.
     */
    parsePageProperties(page) {
        const parsed = {};
        for (const [propName, prop] of Object.entries(page.properties)) {
            const value = this.getPropValue(prop);
            parsed[propName] = value;
            if (prop.type === "title") {
                parsed._title = value;
            }
        }
        return parsed;
    }
    /**
     * Parses the details of a Notion page and returns them as an object.
     * @param page The Notion page to parse.
     * @returns An object containing the parsed details of the page: every
     * field of `page` except `id`, which is exposed as `notionId`, with
     * `properties` replaced by their parsed string form.
     */
    parsePageDetails(page) {
        const { id, ...rest } = page;
        return {
            ...rest,
            notionId: id,
            properties: this.parsePageProperties(page),
        };
    }
    /**
     * Loads a Notion block and returns it as an MdBlock object.
     * @param block The Notion block to load.
     * @returns A Promise that resolves to an MdBlock object.
     */
    async loadBlock(block) {
        const mdBlock = {
            type: block.type,
            blockId: block.id,
            parent: await this.caller.call(() => this.n2mClient.blockToMarkdown(block)),
            children: [],
        };
        if (block.has_children) {
            // For a synced block that points at another block, the children
            // are fetched from that source block id instead of its own id.
            const block_id = block.type === "synced_block" &&
                block.synced_block?.synced_from?.block_id
                ? block.synced_block.synced_from.block_id
                : block.id;
            const childBlocks = await this.loadBlocks(await this.caller.call(() => getBlockChildren(this.notionClient, block_id, null)));
            mdBlock.children = childBlocks;
        }
        return mdBlock;
    }
    /**
     * Loads Notion blocks and their children recursively.
     * Child pages are queued for later loading, child databases have their
     * pages queued, and every other block is converted to markdown.
     * @param blocksResponse The response from the Notion API containing the blocks to load.
     * @returns A Promise that resolves to an array containing the loaded MdBlocks.
     */
    async loadBlocks(blocksResponse) {
        const blocks = blocksResponse.filter(isFullBlock);
        // Add child pages to queue
        const childPages = blocks
            .filter((block) => block.type === "child_page")
            .map((block) => block.id);
        if (childPages.length > 0) {
            this.addToQueue(...childPages);
        }
        // Add child database pages to queue
        const childDatabases = blocks
            .filter((block) => block.type === "child_database")
            .map((block) => this.caller.call(() => this.loadDatabase(block.id)));
        // Load this block and child blocks
        const loadingMdBlocks = blocks
            .filter((block) => block.type !== "child_page" && block.type !== "child_database")
            .map((block) => this.loadBlock(block));
        // Wait for both groups; only the markdown blocks are returned.
        const [mdBlocks] = await Promise.all([
            Promise.all(loadingMdBlocks),
            Promise.all(childDatabases),
        ]);
        return mdBlocks;
    }
    /**
     * Loads a Notion page and its child documents, then adds it to the completed documents array.
     * A page that is not a full page object, or that yields no content, is
     * marked completed without producing a document.
     * @param page The Notion page or page ID to load.
     */
    async loadPage(page) {
        // Check page is a page ID or a PageObjectResponse
        const [pageData, pageId] = typeof page === "string"
            ? [
                this.caller.call(() => this.notionClient.pages.retrieve({ page_id: page })),
                page,
            ]
            : [page, page.id];
        const [pageDetails, pageBlocks] = await Promise.all([
            pageData,
            this.caller.call(() => getBlockChildren(this.notionClient, pageId, null)),
        ]);
        if (!isFullPage(pageDetails)) {
            this.pageCompleted.push(pageId);
            return;
        }
        const mdBlocks = await this.loadBlocks(pageBlocks);
        const mdStringObject = this.n2mClient.toMarkdownString(mdBlocks);
        let pageContent = mdStringObject.parent;
        const metadata = this.parsePageDetails(pageDetails);
        if (this.propertiesAsHeader) {
            // Prepend the parsed properties as a YAML front-matter header.
            pageContent =
                `---\n` +
                    `${yaml.dump(metadata.properties)}` +
                    `---\n\n` +
                    `${pageContent ?? ""}`;
        }
        if (!pageContent) {
            this.pageCompleted.push(pageId);
            return;
        }
        const pageDocument = new Document({ pageContent, metadata });
        this.documents.push(pageDocument);
        this.pageCompleted.push(pageId);
        this.onDocumentLoaded(this.documents.length, this.pageQueueTotal, this.getTitle(pageDetails) || undefined, this.rootTitle);
    }
    /**
     * Loads a Notion database and adds its pages to the queue.
     * Errors raised while querying are logged and not rethrown.
     * @param id The ID of the Notion database to load.
     */
    async loadDatabase(id) {
        try {
            for await (const page of iteratePaginatedAPI(this.notionClient.databases.query, {
                database_id: id,
                page_size: 50,
            })) {
                this.addToQueue(page.id);
            }
        }
        catch (e) {
            console.log(e);
            // TODO: Catch and report api request errors
        }
    }
    /**
     * Loads the documents from Notion based on the specified options.
     * The configured id is looked up both as a page and as a database; the
     * lookup that succeeds seeds the queue, which is then drained one page
     * at a time.
     * @returns A Promise that resolves to an array of Documents.
     * @throws AggregateError when both the page lookup and the database
     * lookup fail.
     */
    async load() {
        // Each lookup resolves to either its response or the error it
        // rejected with, so one failing lookup does not abort the other.
        const resPagePromise = this.notionClient.pages
            .retrieve({ page_id: this.id })
            .then((res) => {
                this.addToQueue(this.id);
                return res;
            })
            .catch((error) => error);
        const resDatabasePromise = this.notionClient.databases
            .retrieve({ database_id: this.id })
            .then(async (res) => {
                await this.loadDatabase(this.id);
                return res;
            })
            .catch((error) => error);
        const [resPage, resDatabase] = await Promise.all([
            resPagePromise,
            resDatabasePromise,
        ]);
        // Check if both resPage and resDatabase resulted in error responses.
        // Any captured Error counts, not only Notion client errors; otherwise
        // two failures of another kind would pass this check unnoticed and
        // load() would resolve to an empty list.
        const errors = [resPage, resDatabase].filter((res) => isErrorResponse(res) || res instanceof Error);
        if (errors.length === 2) {
            if (errors.every((e) => e.code === APIErrorCode.ObjectNotFound)) {
                throw new AggregateError([
                    Error(`Could not find object with ID: ${this.id}. Make sure the relevant pages and databases are shared with your integration.`),
                    ...errors,
                ]);
            }
            throw new AggregateError(errors);
        }
        this.rootTitle =
            this.getTitle(resPage) || this.getTitle(resDatabase) || this.id;
        // Drain the queue; loading a page may append further pages to it.
        let pageId = this.pageQueue.shift();
        while (pageId) {
            await this.loadPage(pageId);
            pageId = this.pageQueue.shift();
        }
        return this.documents;
    }
}
|