agsamantha/node_modules/@langchain/community/dist/document_loaders/web/github.cjs

609 lines
24 KiB
JavaScript
Raw Normal View History

2024-10-02 15:15:21 -05:00
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.GithubRepoLoader = void 0;
const ignore_1 = __importDefault(require("ignore"));
const binary_extensions_1 = __importDefault(require("binary-extensions"));
const documents_1 = require("@langchain/core/documents");
const env_1 = require("@langchain/core/utils/env");
const async_caller_1 = require("@langchain/core/utils/async_caller");
const base_1 = require("@langchain/core/document_loaders/base");
const directory_1 = require("langchain/document_loaders/fs/directory");
const extname_js_1 = require("../../utils/extname.cjs");
const extensions = /* #__PURE__ */ new Set(binary_extensions_1.default);
/**
* A function that checks if a file path is a binary file based on its
* extension.
* @param name The file path to check.
* @returns A boolean indicating whether the file path is a binary file.
*/
function isBinaryPath(name) {
return extensions.has((0, extname_js_1.extname)(name).slice(1).toLowerCase());
}
/**
* A class that extends the BaseDocumentLoader and implements the
* GithubRepoLoaderParams interface. It represents a document loader for
* loading files from a GitHub repository.
*/
class GithubRepoLoader extends base_1.BaseDocumentLoader {
constructor(githubUrl, { accessToken = (0, env_1.getEnvironmentVariable)("GITHUB_ACCESS_TOKEN"), baseUrl = "https://github.com", apiUrl = "https://api.github.com", branch = "main", recursive = true, processSubmodules = false, unknown = directory_1.UnknownHandling.Warn, ignoreFiles = [], ignorePaths, verbose = false, maxConcurrency = 2, maxRetries = 2, ...rest } = {}) {
super();
Object.defineProperty(this, "baseUrl", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "apiUrl", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "owner", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "repo", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "initialPath", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "headers", {
enumerable: true,
configurable: true,
writable: true,
value: {}
});
Object.defineProperty(this, "branch", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "recursive", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "processSubmodules", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "unknown", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "accessToken", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "ignoreFiles", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "ignore", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "verbose", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "maxConcurrency", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "maxRetries", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "caller", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "ignorePaths", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "submoduleInfos", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.baseUrl = baseUrl;
this.apiUrl = apiUrl;
const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
this.owner = owner;
this.repo = repo;
this.initialPath = path;
this.branch = branch;
this.recursive = recursive;
// processing submodules without processing contents of other directories makes no sense
if (processSubmodules && !recursive) {
throw new Error(`Input property "recursive" must be true if "processSubmodules" is true.`);
}
this.processSubmodules = processSubmodules;
this.unknown = unknown;
this.accessToken = accessToken;
this.ignoreFiles = ignoreFiles;
this.verbose = verbose;
this.maxConcurrency = maxConcurrency;
this.maxRetries = maxRetries;
this.headers = {
"User-Agent": "langchain",
};
this.caller = new async_caller_1.AsyncCaller({
maxConcurrency,
maxRetries,
...rest,
});
this.ignorePaths = ignorePaths;
if (ignorePaths) {
this.ignore = ignore_1.default.default().add(ignorePaths);
}
if (this.accessToken) {
this.headers = {
...this.headers,
Authorization: `Bearer ${this.accessToken}`,
};
}
}
/**
* Extracts the owner, repository, and path from a GitHub URL.
* @param url The GitHub URL to extract information from.
* @returns An object containing the owner, repository, and path extracted from the GitHub URL.
*/
extractOwnerAndRepoAndPath(url) {
const match = url.match(new RegExp(`${this.baseUrl}/([^/]+)/([^/]+)(/tree/[^/]+/(.+))?`, "i"));
if (!match) {
throw new Error("Invalid GitHub URL format.");
}
return { owner: match[1], repo: match[2], path: match[4] || "" };
}
/**
* Fetches the files from the GitHub repository and creates Document
* instances for each file. It also handles error handling based on the
* unknown handling option.
* @returns A promise that resolves to an array of Document instances.
*/
async load() {
this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
// process repository without submodules
const documents = (await this.processRepo()).map((fileResponse) => new documents_1.Document({
pageContent: fileResponse.contents,
metadata: fileResponse.metadata,
}));
if (this.processSubmodules) {
// process submodules
await this.getSubmoduleInfo();
for (const submoduleInfo of this.submoduleInfos) {
documents.push(...(await this.loadSubmodule(submoduleInfo)));
}
}
return documents;
}
/**
* Asynchronously streams documents from the entire GitHub repository.
* It is suitable for situations where processing large repositories in a memory-efficient manner is required.
* @yields Yields a Promise that resolves to a Document object for each file or submodule content found in the repository.
*/
async *loadAsStream() {
this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
yield* await this.processRepoAsStream(this.initialPath);
if (!this.processSubmodules) {
return;
}
await this.getSubmoduleInfo();
for (const submoduleInfo of this.submoduleInfos) {
yield* await this.loadSubmoduleAsStream(submoduleInfo);
}
}
/**
* Loads the information about Git submodules from the repository, if available.
*/
async getSubmoduleInfo() {
this.log("Loading info about submodules...");
// we have to fetch the files of the root directory to get the download url of the .gitmodules file
// however, we cannot reuse the files retrieved in processRepo() as initialPath may be != ""
// so it may be that we end up fetching this file list twice
const repoFiles = await this.fetchRepoFiles("");
const gitmodulesFile = repoFiles.filter(({ name }) => name === ".gitmodules")?.[0];
if (gitmodulesFile) {
const gitmodulesContent = await this.fetchFileContent({
download_url: gitmodulesFile.download_url,
});
this.submoduleInfos = await this.parseGitmodules(gitmodulesContent);
}
else {
this.submoduleInfos = [];
}
this.log(`Found ${this.submoduleInfos.length} submodules:`);
for (const submoduleInfo of this.submoduleInfos) {
this.log(JSON.stringify(submoduleInfo));
}
}
/**
* Parses the given content of a .gitmodules file. Furthermore, queries the current SHA ref of all submodules.
* Returns the submodule information as array.
* @param gitmodulesContent the content of a .gitmodules file
*/
async parseGitmodules(gitmodulesContent) {
let validGitmodulesContent = gitmodulesContent;
// in case the .gitmodules file does not end with a newline, we add one to make the regex work
if (!validGitmodulesContent.endsWith("\n")) {
validGitmodulesContent += "\n";
}
// catches the initial line of submodule entries
const submodulePattern = /\[submodule "(.*?)"]\n((\s+.*?\s*=\s*.*?\n)*)/g;
// catches the properties of a submodule
const keyValuePattern = /\s+(.*?)\s*=\s*(.*?)\s/g;
const submoduleInfos = [];
for (const [, name, propertyLines] of validGitmodulesContent.matchAll(submodulePattern)) {
if (!name || !propertyLines) {
throw new Error("Could not parse submodule entry");
}
const submodulePropertyLines = propertyLines.matchAll(keyValuePattern);
let path;
let url;
for (const [, key, value] of submodulePropertyLines) {
if (!key || !value) {
throw new Error(`Could not parse key/value pairs for submodule ${name}`);
}
switch (key) {
case "path":
path = value;
break;
case "url":
url = value;
if (url.endsWith(".git")) {
url = url.substring(0, url.length - 4);
}
break;
default:
// ignoring unused keys
}
}
if (!path || !url) {
throw new Error(`Missing properties for submodule ${name}`);
}
// fetch the current ref of the submodule
const files = await this.fetchRepoFiles(path);
const submoduleInfo = {
name,
path,
url,
ref: files[0].sha,
};
submoduleInfos.push(submoduleInfo);
}
return submoduleInfos;
}
/**
* Loads the documents of the given submodule. Uses the same parameters as for the current repository.
* External submodules, i.e. submodules pointing to another GitHub instance, are ignored.
* @param submoduleInfo the info about the submodule to be loaded
*/
async loadSubmodule(submoduleInfo) {
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
return [];
}
else if (!submoduleInfo.path.startsWith(this.initialPath)) {
this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
return [];
}
else {
this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
return new GithubRepoLoader(submoduleInfo.url, {
accessToken: this.accessToken,
apiUrl: this.apiUrl,
baseUrl: this.baseUrl,
branch: submoduleInfo.ref,
recursive: this.recursive,
processSubmodules: this.processSubmodules,
unknown: this.unknown,
ignoreFiles: this.ignoreFiles,
ignorePaths: this.ignorePaths,
verbose: this.verbose,
maxConcurrency: this.maxConcurrency,
maxRetries: this.maxRetries,
}).load();
}
}
/**
* Asynchronously processes and streams the contents of a specified submodule in the GitHub repository.
* @param submoduleInfo the info about the submodule to be loaded
* @yields Yields a Promise that resolves to a Document object for each file found in the submodule.
*/
async *loadSubmoduleAsStream(submoduleInfo) {
if (!submoduleInfo.url.startsWith(this.baseUrl)) {
this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
yield* [];
}
if (!submoduleInfo.path.startsWith(this.initialPath)) {
this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
yield* [];
}
this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
const submoduleLoader = new GithubRepoLoader(submoduleInfo.url, {
accessToken: this.accessToken,
baseUrl: this.baseUrl,
apiUrl: this.apiUrl,
branch: submoduleInfo.ref,
recursive: this.recursive,
processSubmodules: this.processSubmodules,
unknown: this.unknown,
ignoreFiles: this.ignoreFiles,
ignorePaths: this.ignorePaths,
verbose: this.verbose,
maxConcurrency: this.maxConcurrency,
maxRetries: this.maxRetries,
});
yield* await submoduleLoader.processRepoAsStream(submoduleInfo.path);
}
/**
* Determines whether a file or directory should be ignored based on its
* path and type.
* @param path The path of the file or directory.
* @param fileType The type of the file or directory.
* @returns A boolean indicating whether the file or directory should be ignored.
*/
shouldIgnore(path, fileType) {
if (fileType !== "dir" && isBinaryPath(path)) {
return true;
}
if (this.ignore !== undefined) {
return this.ignore.ignores(path);
}
return (fileType !== "dir" &&
this.ignoreFiles.some((pattern) => {
if (typeof pattern === "string") {
return path === pattern;
}
try {
return pattern.test(path);
}
catch {
throw new Error(`Unknown ignore file pattern: ${pattern}`);
}
}));
}
/**
* Takes the file info and wrap it in a promise that will resolve to the file content and metadata
* @param file
* @returns
*/
async fetchFileContentWrapper(file) {
const fileContent = await this.fetchFileContent(file).catch((error) => {
this.handleError(`Failed wrap file content: ${file}, ${error}`);
});
return {
contents: fileContent || "",
metadata: {
source: file.path,
repository: `${this.baseUrl}/${this.owner}/${this.repo}`,
branch: this.branch,
},
};
}
/**
* Maps a list of files / directories to a list of promises that will fetch the file / directory contents
*/
async getCurrentDirectoryFilePromises(files) {
const currentDirectoryFilePromises = [];
// Directories have nested files / directories, which is why this is a list of promises of promises
const currentDirectoryDirectoryPromises = [];
for (const file of files) {
if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) {
continue;
}
if (file.type === "file" && file.size === 0) {
// this is a submodule. ignoring for the moment. submodule processing is done separately
continue;
}
if (file.type !== "dir") {
try {
currentDirectoryFilePromises.push(this.fetchFileContentWrapper(file));
}
catch (e) {
this.handleError(`Failed to fetch file content: ${file.path}, ${e}`);
}
}
else if (this.recursive) {
currentDirectoryDirectoryPromises.push(this.processDirectory(file.path));
}
}
const curDirDirectories = await Promise.all(currentDirectoryDirectoryPromises);
return [...currentDirectoryFilePromises, ...curDirDirectories.flat()];
}
/**
* Begins the process of fetching the contents of the repository
*/
async processRepo() {
try {
// Get the list of file / directory names in the root directory
const files = await this.fetchRepoFiles(this.initialPath);
// Map the file / directory paths to promises that will fetch the file / directory contents
const currentDirectoryFilePromises = await this.getCurrentDirectoryFilePromises(files);
return Promise.all(currentDirectoryFilePromises);
}
catch (error) {
this.handleError(`Failed to process directory: ${this.initialPath}, ${error}`);
return Promise.reject(error);
}
}
/**
* Asynchronously processes the contents of the entire GitHub repository,
* streaming each file as a Document object.
* @param path The path of the directory to process.
* @yields Yields a Promise that resolves to a Document object for each file found in the repository.
*/
async *processRepoAsStream(path) {
const files = await this.fetchRepoFiles(path);
for (const file of files) {
if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) {
continue;
}
if (file.type === "file") {
try {
const fileResponse = await this.fetchFileContentWrapper(file);
yield new documents_1.Document({
pageContent: fileResponse.contents,
metadata: fileResponse.metadata,
});
}
catch (error) {
this.handleError(`Failed to fetch file content: ${file.path}, ${error}`);
}
}
else if (this.recursive) {
yield* await this.processDirectoryAsStream(file.path);
}
}
}
/**
* Fetches the contents of a directory and maps the file / directory paths
* to promises that will fetch the file / directory contents.
* @param path The path of the directory to process.
* @returns A promise that resolves to an array of promises that will fetch the file / directory contents.
*/
async processDirectory(path) {
try {
const files = await this.fetchRepoFiles(path);
return this.getCurrentDirectoryFilePromises(files);
}
catch (error) {
this.handleError(`Failed to process directory: ${path}, ${error}`);
return Promise.reject(error);
}
}
/**
* Asynchronously processes the contents of a given directory in the GitHub repository,
* streaming each file as a Document object.
* @param path The path of the directory to process.
* @yields Yields a Promise that resolves to a Document object for each file in the directory.
*/
async *processDirectoryAsStream(path) {
const files = await this.fetchRepoFiles(path);
for (const file of files) {
if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) {
continue;
}
if (file.type === "file") {
try {
const fileResponse = await this.fetchFileContentWrapper(file);
yield new documents_1.Document({
pageContent: fileResponse.contents,
metadata: fileResponse.metadata,
});
}
catch {
this.handleError(`Failed to fetch file content: ${file.path}`);
}
}
else if (this.recursive) {
yield* await this.processDirectoryAsStream(file.path);
}
}
}
/**
* Fetches the files from a GitHub repository.
* If the path denotes a single file, the resulting array contains only one element.
* @param path The path of the repository to fetch the files from.
* @returns A promise that resolves to an array of GithubFile instances.
*/
async fetchRepoFiles(path) {
const url = `${this.apiUrl}/repos/${this.owner}/${this.repo}/contents/${path}?ref=${this.branch}`;
return this.caller.call(async () => {
this.log(`Fetching ${url}`);
const response = await fetch(url, { headers: this.headers });
const data = await response.json();
if (!response.ok) {
throw new Error(`Unable to fetch repository files: ${response.status} ${JSON.stringify(data)}`);
}
if (Array.isArray(data)) {
return data;
}
else {
return [data];
}
});
}
/**
* Fetches the content of a file from a GitHub repository.
* @param file The file to fetch the content from.
* @returns A promise that resolves to the content of the file.
*/
async fetchFileContent(file) {
return this.caller.call(async () => {
this.log(`Fetching ${file.download_url}`);
const response = await fetch(file.download_url, {
headers: this.headers,
});
return response.text();
});
}
/**
* Handles errors based on the unknown handling option.
* @param message The error message.
* @returns void
*/
handleError(message) {
switch (this.unknown) {
case directory_1.UnknownHandling.Ignore:
break;
case directory_1.UnknownHandling.Warn:
console.warn(message);
break;
case directory_1.UnknownHandling.Error:
throw new Error(message);
default:
throw new Error(`Unknown unknown handling: ${this.unknown}`);
}
}
/**
* Logs the given message to the console, if parameter 'verbose' is set to true.
* @param message the message to be logged.
*/
log(message) {
if (this.verbose) {
console.log(message);
}
}
}
exports.GithubRepoLoader = GithubRepoLoader;