import { htmlToText } from "html-to-text";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
/**
* Class representing a document loader for loading pages from Confluence.
* @example
* ```typescript
* const loader = new ConfluencePagesLoader({
* baseUrl: "https://example.atlassian.net/wiki",
* spaceKey: "~EXAMPLE362906de5d343d49dcdbae5dEXAMPLE",
* username: "your-username",
* accessToken: "your-access-token",
* });
* const documents = await loader.load();
* console.log(documents);
* ```
*/
export class ConfluencePagesLoader extends BaseDocumentLoader {
  /**
   * Creates a loader bound to one Confluence space.
   * @param fields Loader configuration.
   * @param fields.baseUrl Base URL that the REST path and the page URLs are appended to.
   * @param fields.spaceKey Key of the space whose content is fetched.
   * @param fields.username Username for Basic authentication (used together with accessToken).
   * @param fields.accessToken Access token for Basic authentication (used together with username).
   * @param fields.limit Batch size requested per API call. Defaults to 25.
   * @param fields.expand Value sent as the REST `expand` parameter. Defaults to "body.storage,version".
   * @param fields.personalAccessToken Token for Bearer authentication; wins over username/accessToken when set.
   * @param fields.maxRetries Maximum number of attempts per request. Defaults to 5.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
    expand = "body.storage,version",
    personalAccessToken,
    maxRetries = 5,
  }) {
    super();
    // Key order below is the order in which the own properties are created,
    // so it is also the order seen by Object.keys() and serializers.
    const fields = {
      baseUrl,
      spaceKey,
      username,
      accessToken,
      limit,
      maxRetries,
      // expand parameter for the Confluence REST API; description can be found at
      // https://developer.atlassian.com/server/confluence/expansions-in-the-rest-api/
      expand,
      personalAccessToken,
    };
    for (const [name, value] of Object.entries(fields)) {
      Object.defineProperty(this, name, {
        enumerable: true,
        configurable: true,
        writable: true,
        value,
      });
    }
  }

  /**
   * Returns the authorization header for the request.
   * A personal access token takes precedence over username + access token.
   * @returns The authorization header as a string, or undefined if no credentials were provided.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) {
      return `Bearer ${this.personalAccessToken}`;
    }
    if (this.username && this.accessToken) {
      const authToken = Buffer.from(
        `${this.username}:${this.accessToken}`
      ).toString("base64");
      return `Basic ${authToken}`;
    }
    return undefined;
  }

  /**
   * Fetches all the pages in the specified space and converts each page to
   * a Document instance.
   *
   * This is best-effort: any failure is logged with console.error and an
   * empty array is returned instead of rejecting.
   * @param options The extra options of the load function.
   * @param options.limit The limit parameter to overwrite the size to fetch pages.
   * @param options.start The start parameter to set the initial offset to fetch pages.
   * @returns Promise resolving to an array of Document instances.
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(
        options?.start,
        options?.limit
      );
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Fetches data from the Confluence API using the provided URL.
   *
   * The request is re-issued immediately (no delay) after a network error or
   * a non-OK status until `maxRetries` attempts have been made. At least one
   * attempt is always made.
   * @param url The URL to fetch data from.
   * @returns Promise resolving to the JSON response from the API.
   * @throws Error when the last allowed attempt fails; the underlying failure
   * is attached as `cause` where the runtime supports it.
   */
  async fetchConfluenceData(url) {
    let attempt = 0;
    for (;;) {
      attempt += 1;
      try {
        const headers = {
          "Content-Type": "application/json",
          Accept: "application/json",
        };
        const authHeader = this.authorizationHeader;
        if (authHeader) {
          headers.Authorization = authHeader;
        }
        const response = await fetch(url, { headers });
        if (!response.ok) {
          throw new Error(
            `Failed to fetch ${url} from Confluence: ${response.status}. Retrying...`
          );
        }
        return await response.json();
      } catch (error) {
        if (attempt >= this.maxRetries) {
          // Keep the original error reachable for callers instead of only
          // flattening it into the message.
          throw new Error(
            `Failed to fetch ${url} from Confluence (retry: ${attempt}): ${error}`,
            { cause: error }
          );
        }
      }
    }
  }

  /**
   * Fetches all the pages in the specified space, one batch at a time, until
   * the API reports an empty batch.
   * @param start The start parameter (offset) to begin paginating from.
   * @param limit The batch size to request; defaults to the loader's limit.
   * @returns Promise resolving to an array of ConfluencePage objects.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    const pages = [];
    let offset = start;
    for (;;) {
      // URLSearchParams percent-encodes every value, so keys such as
      // "~user" or values containing "&" cannot break the query string.
      const query = new URLSearchParams({
        spaceKey: String(this.spaceKey),
        limit: String(limit),
        start: String(offset),
        expand: String(this.expand),
      });
      const url = `${this.baseUrl}/rest/api/content?${query.toString()}`;
      const data = await this.fetchConfluenceData(url);
      const batch = data.results ?? [];
      // Advance by what the server reports it returned; fall back to the
      // actual result count if `size` is missing.
      const batchSize = data.size ?? batch.length;
      if (!(batchSize > 0)) {
        break;
      }
      for (const page of batch) {
        pages.push(page);
      }
      offset += batchSize;
    }
    return pages;
  }

  /**
   * Creates a Document instance from a ConfluencePage object.
   *
   * A page without `body.storage.value` (e.g. when `expand` does not request
   * it) yields a Document with empty content rather than throwing.
   * @param page The ConfluencePage object to convert.
   * @returns A Document instance.
   */
  createDocumentFromPage(page) {
    const html = page.body?.storage?.value ?? "";
    // Convert the HTML content to plain text
    const plainTextContent = htmlToText(html, {
      wordwrap: false,
      preserveNewlines: false,
    });
    // Remove empty (whitespace-only) lines
    const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, "");
    // Generate the URL
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;
    // Return a langchain document
    return new Document({
      pageContent: textWithoutEmptyLines,
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: pageUrl,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when,
      },
    });
  }
}