192 lines
7 KiB
JavaScript
192 lines
7 KiB
JavaScript
import { htmlToText } from "html-to-text";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
|
|
/**
 * Class representing a document loader for loading pages from Confluence.
 *
 * Pages are requested in batches from `${baseUrl}/rest/api/content` and each
 * page's `body.storage` HTML is converted to plain text.
 *
 * @example
 * ```typescript
 * const loader = new ConfluencePagesLoader({
 *   baseUrl: "https://example.atlassian.net/wiki",
 *   spaceKey: "~EXAMPLE362906de5d343d49dcdbae5dEXAMPLE",
 *   username: "your-username",
 *   accessToken: "your-access-token",
 * });
 * const documents = await loader.load();
 * console.log(documents);
 * ```
 */
export class ConfluencePagesLoader extends BaseDocumentLoader {
  /**
   * @param params Loader configuration.
   * @param params.baseUrl Prefix used to build both the REST endpoint and page URLs.
   * @param params.spaceKey Key of the space whose pages are fetched.
   * @param params.username User name for Basic authentication (used together with `accessToken`).
   * @param params.accessToken Token for Basic authentication (used together with `username`).
   * @param params.limit Batch size sent as the `limit` query parameter. Defaults to 25.
   * @param params.expand Value of the `expand` query parameter. Defaults to "body.storage,version".
   * @param params.personalAccessToken Token for Bearer authentication; takes precedence over Basic.
   * @param params.maxRetries Maximum number of attempts per request. Defaults to 5.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
    expand = "body.storage,version",
    personalAccessToken,
    maxRetries = 5,
  }) {
    super();
    // Plain assignment creates enumerable, configurable, writable own
    // properties. The order below fixes the instance's key order.
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
    this.maxRetries = maxRetries;
    /**
     * expand parameter for confluence rest api
     * description can be found at https://developer.atlassian.com/server/confluence/expansions-in-the-rest-api/
     */
    this.expand = expand;
    this.personalAccessToken = personalAccessToken;
  }

  /**
   * Returns the authorization header for the request.
   *
   * A personal access token yields a `Bearer` header; otherwise a
   * username/access-token pair yields a `Basic` header.
   *
   * @returns The authorization header as a string, or undefined if no credentials were provided.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) {
      return `Bearer ${this.personalAccessToken}`;
    }
    if (this.username && this.accessToken) {
      const credentials = `${this.username}:${this.accessToken}`;
      return `Basic ${Buffer.from(credentials).toString("base64")}`;
    }
    return undefined;
  }

  /**
   * Fetches all the pages in the specified space and converts each page to
   * a Document instance.
   *
   * Any failure is logged with `console.error` and an empty array is
   * returned instead of rejecting.
   *
   * @param options the extra options of the load function
   * @param options.limit The limit parameter to overwrite the size to fetch pages.
   * @param options.start The start parameter to set initial offset to fetch pages.
   * @returns Promise resolving to an array of Document instances.
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(options?.start, options?.limit);
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Fetches data from the Confluence API using the provided URL.
   *
   * Any failure (network error, non-OK status, invalid JSON) is retried
   * immediately until `maxRetries` attempts have been made.
   *
   * @param url The URL to fetch data from.
   * @returns Promise resolving to the JSON response from the API.
   * @throws Error once the attempts are exhausted; the last underlying error is attached as `cause`.
   */
  async fetchConfluenceData(url) {
    const headers = {
      "Content-Type": "application/json",
      Accept: "application/json",
    };
    const authHeader = this.authorizationHeader;
    if (authHeader) {
      headers.Authorization = authHeader;
    }

    for (let attempt = 1; ; attempt += 1) {
      try {
        const response = await fetch(url, { headers });
        if (!response.ok) {
          throw new Error(`Failed to fetch ${url} from Confluence: ${response.status}. Retrying...`);
        }
        return await response.json();
      } catch (error) {
        // Written as a negated "<" so that a non-numeric maxRetries (NaN)
        // gives up instead of looping forever.
        if (!(attempt < this.maxRetries)) {
          throw new Error(
            `Failed to fetch ${url} from Confluence (retry: ${attempt}): ${error}`,
            { cause: error },
          );
        }
      }
    }
  }

  /**
   * Fetches all the pages in the specified space, batch by batch.
   *
   * Each response's `size` advances the offset; fetching stops at the first
   * response whose `size` is not a positive number.
   *
   * @param start The start parameter to paginate through the results.
   * @param limit The batch size; defaults to the loader's `limit`.
   * @returns Promise resolving to an array of ConfluencePage objects.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    const pages = [];
    let offset = start;

    for (;;) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${offset}&expand=${this.expand}`;
      const data = await this.fetchConfluenceData(url);
      // Also covers a missing `size`, which would otherwise turn the next
      // offset into NaN.
      if (!(data?.size > 0)) {
        break;
      }
      for (const page of data.results ?? []) {
        pages.push(page);
      }
      offset += data.size;
    }

    return pages;
  }

  /**
   * Creates a Document instance from a ConfluencePage object.
   *
   * @param page The ConfluencePage object to convert.
   * @returns A Document instance whose content is the page body as plain text
   * and whose metadata carries id, status, title, type, url and version info.
   */
  createDocumentFromPage(page) {
    // `expand` is configurable, so the storage body may be absent from the
    // response; treat that as empty content instead of throwing.
    const storageHtml = page.body?.storage?.value ?? "";

    // Convert the HTML content to plain text
    const plainTextContent = htmlToText(storageHtml, {
      wordwrap: false,
      preserveNewlines: false,
    });

    // Remove empty lines
    const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, "");

    // Generate the URL
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    // Return a langchain document
    return new Document({
      pageContent: textWithoutEmptyLines,
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: pageUrl,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when,
      },
    });
  }
}
|