601 lines
23 KiB
JavaScript
601 lines
23 KiB
JavaScript
import ignore from "ignore";
|
|
import binaryExtensions from "binary-extensions";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { getEnvironmentVariable } from "@langchain/core/utils/env";
|
|
import { AsyncCaller, } from "@langchain/core/utils/async_caller";
|
|
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
|
|
import { UnknownHandling } from "langchain/document_loaders/fs/directory";
|
|
import { extname } from "../../utils/extname.js";
|
|
// Lookup set built once at module load from the list exported by the
// "binary-extensions" package; isBinaryPath() tests membership against it.
const extensions = /* #__PURE__ */ new Set(binaryExtensions);
|
|
/**
 * Decides, from the file extension alone, whether a path refers to a
 * binary file.
 * @param name The file path to inspect.
 * @returns `true` when the lower-cased extension (with the first character
 * returned by `extname` removed, presumably the leading dot) is a member of
 * the module-level `extensions` set; `false` otherwise.
 */
function isBinaryPath(name) {
    const rawExtension = extname(name);
    const normalizedExtension = rawExtension.slice(1).toLowerCase();
    return extensions.has(normalizedExtension);
}
|
|
/**
|
|
* A class that extends the BaseDocumentLoader and implements the
|
|
* GithubRepoLoaderParams interface. It represents a document loader for
|
|
* loading files from a GitHub repository.
|
|
*/
|
|
export class GithubRepoLoader extends BaseDocumentLoader {
|
|
constructor(githubUrl, { accessToken = getEnvironmentVariable("GITHUB_ACCESS_TOKEN"), baseUrl = "https://github.com", apiUrl = "https://api.github.com", branch = "main", recursive = true, processSubmodules = false, unknown = UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
|
|
super();
|
|
Object.defineProperty(this, "baseUrl", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "apiUrl", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "owner", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "repo", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "initialPath", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "headers", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: {}
|
|
});
|
|
Object.defineProperty(this, "branch", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "recursive", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "processSubmodules", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "unknown", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "accessToken", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "ignoreFiles", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "ignore", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "verbose", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "maxConcurrency", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "maxRetries", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "caller", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "ignorePaths", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "submoduleInfos", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
this.baseUrl = baseUrl;
|
|
this.apiUrl = apiUrl;
|
|
const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
|
|
this.owner = owner;
|
|
this.repo = repo;
|
|
this.initialPath = path;
|
|
this.branch = branch;
|
|
this.recursive = recursive;
|
|
// processing submodules without processing contents of other directories makes no sense
|
|
if (processSubmodules && !recursive) {
|
|
throw new Error(`Input property "recursive" must be true if "processSubmodules" is true.`);
|
|
}
|
|
this.processSubmodules = processSubmodules;
|
|
this.unknown = unknown;
|
|
this.accessToken = accessToken;
|
|
this.ignoreFiles = ignoreFiles;
|
|
this.verbose = verbose;
|
|
this.maxConcurrency = maxConcurrency;
|
|
this.maxRetries = maxRetries;
|
|
this.headers = {
|
|
"User-Agent": "langchain",
|
|
};
|
|
this.caller = new AsyncCaller({
|
|
maxConcurrency,
|
|
maxRetries,
|
|
...rest,
|
|
});
|
|
this.ignorePaths = ignorePaths;
|
|
if (ignorePaths) {
|
|
this.ignore = ignore.default().add(ignorePaths);
|
|
}
|
|
if (this.accessToken) {
|
|
this.headers = {
|
|
...this.headers,
|
|
Authorization: `Bearer ${this.accessToken}`,
|
|
};
|
|
}
|
|
}
|
|
/**
|
|
* Extracts the owner, repository, and path from a GitHub URL.
|
|
* @param url The GitHub URL to extract information from.
|
|
* @returns An object containing the owner, repository, and path extracted from the GitHub URL.
|
|
*/
|
|
extractOwnerAndRepoAndPath(url) {
|
|
const match = url.match(new RegExp(`${this.baseUrl}/([^/]+)/([^/]+)(/tree/[^/]+/(.+))?`, "i"));
|
|
if (!match) {
|
|
throw new Error("Invalid GitHub URL format.");
|
|
}
|
|
return { owner: match[1], repo: match[2], path: match[4] || "" };
|
|
}
|
|
/**
|
|
* Fetches the files from the GitHub repository and creates Document
|
|
* instances for each file. It also handles error handling based on the
|
|
* unknown handling option.
|
|
* @returns A promise that resolves to an array of Document instances.
|
|
*/
|
|
async load() {
|
|
this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
|
|
// process repository without submodules
|
|
const documents = (await this.processRepo()).map((fileResponse) => new Document({
|
|
pageContent: fileResponse.contents,
|
|
metadata: fileResponse.metadata,
|
|
}));
|
|
if (this.processSubmodules) {
|
|
// process submodules
|
|
await this.getSubmoduleInfo();
|
|
for (const submoduleInfo of this.submoduleInfos) {
|
|
documents.push(...(await this.loadSubmodule(submoduleInfo)));
|
|
}
|
|
}
|
|
return documents;
|
|
}
|
|
/**
|
|
* Asynchronously streams documents from the entire GitHub repository.
|
|
* It is suitable for situations where processing large repositories in a memory-efficient manner is required.
|
|
* @yields Yields a Promise that resolves to a Document object for each file or submodule content found in the repository.
|
|
*/
|
|
async *loadAsStream() {
|
|
this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
|
|
yield* await this.processRepoAsStream(this.initialPath);
|
|
if (!this.processSubmodules) {
|
|
return;
|
|
}
|
|
await this.getSubmoduleInfo();
|
|
for (const submoduleInfo of this.submoduleInfos) {
|
|
yield* await this.loadSubmoduleAsStream(submoduleInfo);
|
|
}
|
|
}
|
|
/**
|
|
* Loads the information about Git submodules from the repository, if available.
|
|
*/
|
|
async getSubmoduleInfo() {
|
|
this.log("Loading info about submodules...");
|
|
// we have to fetch the files of the root directory to get the download url of the .gitmodules file
|
|
// however, we cannot reuse the files retrieved in processRepo() as initialPath may be != ""
|
|
// so it may be that we end up fetching this file list twice
|
|
const repoFiles = await this.fetchRepoFiles("");
|
|
const gitmodulesFile = repoFiles.filter(({ name }) => name === ".gitmodules")?.[0];
|
|
if (gitmodulesFile) {
|
|
const gitmodulesContent = await this.fetchFileContent({
|
|
download_url: gitmodulesFile.download_url,
|
|
});
|
|
this.submoduleInfos = await this.parseGitmodules(gitmodulesContent);
|
|
}
|
|
else {
|
|
this.submoduleInfos = [];
|
|
}
|
|
this.log(`Found ${this.submoduleInfos.length} submodules:`);
|
|
for (const submoduleInfo of this.submoduleInfos) {
|
|
this.log(JSON.stringify(submoduleInfo));
|
|
}
|
|
}
|
|
/**
|
|
* Parses the given content of a .gitmodules file. Furthermore, queries the current SHA ref of all submodules.
|
|
* Returns the submodule information as array.
|
|
* @param gitmodulesContent the content of a .gitmodules file
|
|
*/
|
|
async parseGitmodules(gitmodulesContent) {
|
|
let validGitmodulesContent = gitmodulesContent;
|
|
// in case the .gitmodules file does not end with a newline, we add one to make the regex work
|
|
if (!validGitmodulesContent.endsWith("\n")) {
|
|
validGitmodulesContent += "\n";
|
|
}
|
|
// catches the initial line of submodule entries
|
|
const submodulePattern = /\[submodule "(.*?)"]\n((\s+.*?\s*=\s*.*?\n)*)/g;
|
|
// catches the properties of a submodule
|
|
const keyValuePattern = /\s+(.*?)\s*=\s*(.*?)\s/g;
|
|
const submoduleInfos = [];
|
|
for (const [, name, propertyLines] of validGitmodulesContent.matchAll(submodulePattern)) {
|
|
if (!name || !propertyLines) {
|
|
throw new Error("Could not parse submodule entry");
|
|
}
|
|
const submodulePropertyLines = propertyLines.matchAll(keyValuePattern);
|
|
let path;
|
|
let url;
|
|
for (const [, key, value] of submodulePropertyLines) {
|
|
if (!key || !value) {
|
|
throw new Error(`Could not parse key/value pairs for submodule ${name}`);
|
|
}
|
|
switch (key) {
|
|
case "path":
|
|
path = value;
|
|
break;
|
|
case "url":
|
|
url = value;
|
|
if (url.endsWith(".git")) {
|
|
url = url.substring(0, url.length - 4);
|
|
}
|
|
break;
|
|
default:
|
|
// ignoring unused keys
|
|
}
|
|
}
|
|
if (!path || !url) {
|
|
throw new Error(`Missing properties for submodule ${name}`);
|
|
}
|
|
// fetch the current ref of the submodule
|
|
const files = await this.fetchRepoFiles(path);
|
|
const submoduleInfo = {
|
|
name,
|
|
path,
|
|
url,
|
|
ref: files[0].sha,
|
|
};
|
|
submoduleInfos.push(submoduleInfo);
|
|
}
|
|
return submoduleInfos;
|
|
}
|
|
/**
|
|
* Loads the documents of the given submodule. Uses the same parameters as for the current repository.
|
|
* External submodules, i.e. submodules pointing to another GitHub instance, are ignored.
|
|
* @param submoduleInfo the info about the submodule to be loaded
|
|
*/
|
|
async loadSubmodule(submoduleInfo) {
|
|
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
|
|
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
|
|
return [];
|
|
}
|
|
else if (!submoduleInfo.path.startsWith(this.initialPath)) {
|
|
this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
|
|
return [];
|
|
}
|
|
else {
|
|
this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
|
|
return new GithubRepoLoader(submoduleInfo.url, {
|
|
accessToken: this.accessToken,
|
|
apiUrl: this.apiUrl,
|
|
baseUrl: this.baseUrl,
|
|
branch: submoduleInfo.ref,
|
|
recursive: this.recursive,
|
|
processSubmodules: this.processSubmodules,
|
|
unknown: this.unknown,
|
|
ignoreFiles: this.ignoreFiles,
|
|
ignorePaths: this.ignorePaths,
|
|
verbose: this.verbose,
|
|
maxConcurrency: this.maxConcurrency,
|
|
maxRetries: this.maxRetries,
|
|
}).load();
|
|
}
|
|
}
|
|
/**
|
|
* Asynchronously processes and streams the contents of a specified submodule in the GitHub repository.
|
|
* @param submoduleInfo the info about the submodule to be loaded
|
|
* @yields Yields a Promise that resolves to a Document object for each file found in the submodule.
|
|
*/
|
|
async *loadSubmoduleAsStream(submoduleInfo) {
|
|
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
|
|
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
|
|
yield* [];
|
|
}
|
|
if (!submoduleInfo.path.startsWith(this.initialPath)) {
|
|
this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
|
|
yield* [];
|
|
}
|
|
this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
|
|
const submoduleLoader = new GithubRepoLoader(submoduleInfo.url, {
|
|
accessToken: this.accessToken,
|
|
baseUrl: this.baseUrl,
|
|
apiUrl: this.apiUrl,
|
|
branch: submoduleInfo.ref,
|
|
recursive: this.recursive,
|
|
processSubmodules: this.processSubmodules,
|
|
unknown: this.unknown,
|
|
ignoreFiles: this.ignoreFiles,
|
|
ignorePaths: this.ignorePaths,
|
|
verbose: this.verbose,
|
|
maxConcurrency: this.maxConcurrency,
|
|
maxRetries: this.maxRetries,
|
|
});
|
|
yield* await submoduleLoader.processRepoAsStream(submoduleInfo.path);
|
|
}
|
|
/**
|
|
* Determines whether a file or directory should be ignored based on its
|
|
* path and type.
|
|
* @param path The path of the file or directory.
|
|
* @param fileType The type of the file or directory.
|
|
* @returns A boolean indicating whether the file or directory should be ignored.
|
|
*/
|
|
shouldIgnore(path, fileType) {
|
|
if (fileType !== "dir" && isBinaryPath(path)) {
|
|
return true;
|
|
}
|
|
if (this.ignore !== undefined) {
|
|
return this.ignore.ignores(path);
|
|
}
|
|
return (fileType !== "dir" &&
|
|
this.ignoreFiles.some((pattern) => {
|
|
if (typeof pattern === "string") {
|
|
return path === pattern;
|
|
}
|
|
try {
|
|
return pattern.test(path);
|
|
}
|
|
catch {
|
|
throw new Error(`Unknown ignore file pattern: ${pattern}`);
|
|
}
|
|
}));
|
|
}
|
|
/**
|
|
* Takes the file info and wrap it in a promise that will resolve to the file content and metadata
|
|
* @param file
|
|
* @returns
|
|
*/
|
|
async fetchFileContentWrapper(file) {
|
|
const fileContent = await this.fetchFileContent(file).catch((error) => {
|
|
this.handleError(`Failed wrap file content: ${file}, ${error}`);
|
|
});
|
|
return {
|
|
contents: fileContent || "",
|
|
metadata: {
|
|
source: file.path,
|
|
repository: `${this.baseUrl}/${this.owner}/${this.repo}`,
|
|
branch: this.branch,
|
|
},
|
|
};
|
|
}
|
|
/**
|
|
* Maps a list of files / directories to a list of promises that will fetch the file / directory contents
|
|
*/
|
|
async getCurrentDirectoryFilePromises(files) {
|
|
const currentDirectoryFilePromises = [];
|
|
// Directories have nested files / directories, which is why this is a list of promises of promises
|
|
const currentDirectoryDirectoryPromises = [];
|
|
for (const file of files) {
|
|
if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) {
|
|
continue;
|
|
}
|
|
if (file.type === "file" && file.size === 0) {
|
|
// this is a submodule. ignoring for the moment. submodule processing is done separately
|
|
continue;
|
|
}
|
|
if (file.type !== "dir") {
|
|
try {
|
|
currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
|
|
}
|
|
catch (e) {
|
|
this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
|
|
}
|
|
}
|
|
else if (this.recursive) {
|
|
currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
|
|
}
|
|
}
|
|
const curDirDirectories = await Promise.all(currentDirectoryDirectoryPromises);
|
|
return [...currentDirectoryFilePromises, ...curDirDirectories.flat()];
|
|
}
|
|
/**
|
|
* Begins the process of fetching the contents of the repository
|
|
*/
|
|
async processRepo() {
|
|
try {
|
|
// Get the list of file / directory names in the root directory
|
|
const files = await this.fetchRepoFiles(this.initialPath);
|
|
// Map the file / directory paths to promises that will fetch the file / directory contents
|
|
const currentDirectoryFilePromises = await this.getCurrentDirectoryFilePromises(files);
|
|
return Promise.all(currentDirectoryFilePromises);
|
|
}
|
|
catch (error) {
|
|
this.handleError(`Failed to process directory: ${this.initialPath}, ${error}`);
|
|
return Promise.reject(error);
|
|
}
|
|
}
|
|
/**
|
|
* Asynchronously processes the contents of the entire GitHub repository,
|
|
* streaming each file as a Document object.
|
|
* @param path The path of the directory to process.
|
|
* @yields Yields a Promise that resolves to a Document object for each file found in the repository.
|
|
*/
|
|
async *processRepoAsStream(path) {
|
|
const files = await this.fetchRepoFiles(path);
|
|
for (const file of files) {
|
|
if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) {
|
|
continue;
|
|
}
|
|
if (file.type === "file") {
|
|
try {
|
|
const fileResponse = await this.fetchFileContentWrapper(file);
|
|
yield new Document({
|
|
pageContent: fileResponse.contents,
|
|
metadata: fileResponse.metadata,
|
|
});
|
|
}
|
|
catch (error) {
|
|
this.handleError(`Failed to fetch file content: ${file.path}, ${error}`);
|
|
}
|
|
}
|
|
else if (this.recursive) {
|
|
yield* await this.processDirectoryAsStream(file.path);
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* Fetches the contents of a directory and maps the file / directory paths
|
|
* to promises that will fetch the file / directory contents.
|
|
* @param path The path of the directory to process.
|
|
* @returns A promise that resolves to an array of promises that will fetch the file / directory contents.
|
|
*/
|
|
async processDirectory(path) {
|
|
try {
|
|
const files = await this.fetchRepoFiles(path);
|
|
return this.getCurrentDirectoryFilePromises(files);
|
|
}
|
|
catch (error) {
|
|
this.handleError(`Failed to process directory: ${path}, ${error}`);
|
|
return Promise.reject(error);
|
|
}
|
|
}
|
|
/**
|
|
* Asynchronously processes the contents of a given directory in the GitHub repository,
|
|
* streaming each file as a Document object.
|
|
* @param path The path of the directory to process.
|
|
* @yields Yields a Promise that resolves to a Document object for each file in the directory.
|
|
*/
|
|
async *processDirectoryAsStream(path) {
|
|
const files = await this.fetchRepoFiles(path);
|
|
for (const file of files) {
|
|
if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) {
|
|
continue;
|
|
}
|
|
if (file.type === "file") {
|
|
try {
|
|
const fileResponse = await this.fetchFileContentWrapper(file);
|
|
yield new Document({
|
|
pageContent: fileResponse.contents,
|
|
metadata: fileResponse.metadata,
|
|
});
|
|
}
|
|
catch {
|
|
this.handleError(`Failed to fetch file content: ${file.path}`);
|
|
}
|
|
}
|
|
else if (this.recursive) {
|
|
yield* await this.processDirectoryAsStream(file.path);
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* Fetches the files from a GitHub repository.
|
|
* If the path denotes a single file, the resulting array contains only one element.
|
|
* @param path The path of the repository to fetch the files from.
|
|
* @returns A promise that resolves to an array of GithubFile instances.
|
|
*/
|
|
async fetchRepoFiles(path) {
|
|
const url = `${this.apiUrl}/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
|
|
return this.caller.call(async () => {
|
|
this.log(`Fetching ${url}`);
|
|
const response = await fetch(url, { headers: this.headers });
|
|
const data = await response.json();
|
|
if (!response.ok) {
|
|
throw new Error(`Unable to fetch repository files: ${response.status} ${JSON.stringify(data)}`);
|
|
}
|
|
if (Array.isArray(data)) {
|
|
return data;
|
|
}
|
|
else {
|
|
return [data];
|
|
}
|
|
});
|
|
}
|
|
/**
|
|
* Fetches the content of a file from a GitHub repository.
|
|
* @param file The file to fetch the content from.
|
|
* @returns A promise that resolves to the content of the file.
|
|
*/
|
|
async fetchFileContent(file) {
|
|
return this.caller.call(async () => {
|
|
this.log(`Fetching ${file.download_url}`);
|
|
const response = await fetch(file.download_url, {
|
|
headers: this.headers,
|
|
});
|
|
return response.text();
|
|
});
|
|
}
|
|
/**
|
|
* Handles errors based on the unknown handling option.
|
|
* @param message The error message.
|
|
* @returns void
|
|
*/
|
|
handleError(message) {
|
|
switch (this.unknown) {
|
|
case UnknownHandling.Ignore:
|
|
break;
|
|
case UnknownHandling.Warn:
|
|
console.warn(message);
|
|
break;
|
|
case UnknownHandling.Error:
|
|
throw new Error(message);
|
|
default:
|
|
throw new Error(`Unknown unknown handling: ${this.unknown}`);
|
|
}
|
|
}
|
|
/**
|
|
* Logs the given message to the console, if parameter 'verbose' is set to true.
|
|
* @param message the message to be logged.
|
|
*/
|
|
log(message) {
|
|
if (this.verbose) {
|
|
console.log(message);
|
|
}
|
|
}
|
|
}
|