233 lines
8.8 KiB
JavaScript
233 lines
8.8 KiB
JavaScript
import yaml from "js-yaml";
|
|
import { Document } from "@langchain/core/documents";
|
|
import { getEnv } from "@langchain/core/utils/env";
|
|
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
|
|
import { DirectoryLoader, UnknownHandling, } from "langchain/document_loaders/fs/directory";
|
|
/**
|
|
* Represents a loader for Obsidian markdown files. This loader extends the BaseDocumentLoader
|
|
* and provides functionality to parse and extract metadata, tags, and dataview fields from
|
|
* Obsidian markdown files.
|
|
*/
|
|
class ObsidianFileLoader extends BaseDocumentLoader {
|
|
/**
|
|
* Initializes a new instance of the ObsidianFileLoader class.
|
|
* @param filePath The path to the Obsidian markdown file.
|
|
* @param encoding The character encoding to use when reading the file. Defaults to 'utf-8'.
|
|
* @param collectMetadata Determines whether metadata should be collected from the file. Defaults to true.
|
|
*/
|
|
constructor(filePath, { encoding = "utf-8", collectMetadata = true, } = {}) {
|
|
super();
|
|
Object.defineProperty(this, "filePath", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "encoding", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
Object.defineProperty(this, "collectMetadata", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: void 0
|
|
});
|
|
this.filePath = filePath;
|
|
this.encoding = encoding;
|
|
this.collectMetadata = collectMetadata;
|
|
}
|
|
/**
|
|
* Parses the YAML front matter from the given content string.
|
|
* @param content The string content of the markdown file.
|
|
* @returns An object representing the parsed front matter.
|
|
*/
|
|
parseFrontMatter(content) {
|
|
if (!this.collectMetadata) {
|
|
return {};
|
|
}
|
|
const match = content.match(ObsidianFileLoader.FRONT_MATTER_REGEX);
|
|
if (!match) {
|
|
return {};
|
|
}
|
|
try {
|
|
const frontMatter = yaml.load(match[1]);
|
|
if (frontMatter.tags && typeof frontMatter.tags === "string") {
|
|
frontMatter.tags = frontMatter.tags.split(", ");
|
|
}
|
|
return frontMatter;
|
|
}
|
|
catch (e) {
|
|
console.warn("Encountered non-yaml frontmatter");
|
|
return {};
|
|
}
|
|
}
|
|
/**
|
|
* Removes YAML front matter from the given content string.
|
|
* @param content The string content of the markdown file.
|
|
* @returns The content string with the front matter removed.
|
|
*/
|
|
removeFrontMatter(content) {
|
|
if (!this.collectMetadata) {
|
|
return content;
|
|
}
|
|
return content.replace(ObsidianFileLoader.FRONT_MATTER_REGEX, "");
|
|
}
|
|
/**
|
|
* Parses Obsidian-style tags from the given content string.
|
|
* @param content The string content of the markdown file.
|
|
* @returns A set of parsed tags.
|
|
*/
|
|
parseObsidianTags(content) {
|
|
if (!this.collectMetadata) {
|
|
return new Set();
|
|
}
|
|
const matches = content.matchAll(ObsidianFileLoader.TAG_REGEX);
|
|
const tags = new Set();
|
|
for (const match of matches) {
|
|
tags.add(match[1]);
|
|
}
|
|
return tags;
|
|
}
|
|
/**
|
|
* Parses dataview fields from the given content string.
|
|
* @param content The string content of the markdown file.
|
|
* @returns A record object containing key-value pairs of dataview fields.
|
|
*/
|
|
parseObsidianDataviewFields(content) {
|
|
if (!this.collectMetadata) {
|
|
return {};
|
|
}
|
|
const fields = {};
|
|
const lineMatches = content.matchAll(ObsidianFileLoader.DATAVIEW_LINE_REGEX);
|
|
for (const [, key, value] of lineMatches) {
|
|
fields[key] = value;
|
|
}
|
|
const bracketMatches = content.matchAll(ObsidianFileLoader.DATAVIEW_INLINE_BRACKET_REGEX);
|
|
for (const [, key, value] of bracketMatches) {
|
|
fields[key] = value;
|
|
}
|
|
const parenMatches = content.matchAll(ObsidianFileLoader.DATAVIEW_INLINE_PAREN_REGEX);
|
|
for (const [, key, value] of parenMatches) {
|
|
fields[key] = value;
|
|
}
|
|
return fields;
|
|
}
|
|
/**
|
|
* Converts metadata to a format compatible with Langchain.
|
|
* @param metadata The metadata object to convert.
|
|
* @returns A record object containing key-value pairs of Langchain-compatible metadata.
|
|
*/
|
|
toLangchainCompatibleMetadata(metadata) {
|
|
const result = {};
|
|
for (const [key, value] of Object.entries(metadata)) {
|
|
if (typeof value === "string" || typeof value === "number") {
|
|
result[key] = value;
|
|
}
|
|
else {
|
|
result[key] = JSON.stringify(value);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
/**
|
|
* It loads the Obsidian file, parses it, and returns a `Document` instance.
|
|
* @returns An array of `Document` instances to comply with the BaseDocumentLoader interface.
|
|
*/
|
|
async load() {
|
|
const documents = [];
|
|
const { basename, readFile, stat } = await ObsidianFileLoader.imports();
|
|
const fileName = basename(this.filePath);
|
|
const stats = await stat(this.filePath);
|
|
let content = await readFile(this.filePath, this.encoding);
|
|
const frontMatter = this.parseFrontMatter(content);
|
|
const tags = this.parseObsidianTags(content);
|
|
const dataviewFields = this.parseObsidianDataviewFields(content);
|
|
content = this.removeFrontMatter(content);
|
|
const metadata = {
|
|
source: fileName,
|
|
path: this.filePath,
|
|
created: stats.birthtimeMs,
|
|
lastModified: stats.mtimeMs,
|
|
lastAccessed: stats.atimeMs,
|
|
...this.toLangchainCompatibleMetadata(frontMatter),
|
|
...dataviewFields,
|
|
};
|
|
if (tags.size || frontMatter.tags) {
|
|
metadata.tags = Array.from(new Set([...tags, ...(frontMatter.tags ?? [])])).join(",");
|
|
}
|
|
documents.push(new Document({
|
|
pageContent: content,
|
|
metadata,
|
|
}));
|
|
return documents;
|
|
}
|
|
/**
|
|
* Imports the necessary functions from the `node:path` and
|
|
* `node:fs/promises` modules. It is used to dynamically import the
|
|
* functions when needed. If the import fails, it throws an error
|
|
* indicating that the modules failed to load.
|
|
* @returns A promise that resolves to an object containing the imported functions.
|
|
*/
|
|
static async imports() {
|
|
try {
|
|
const { basename } = await import("node:path");
|
|
const { readFile, stat } = await import("node:fs/promises");
|
|
return { basename, readFile, stat };
|
|
}
|
|
catch (e) {
|
|
console.error(e);
|
|
throw new Error(`Failed to load fs/promises. ObsidianFileLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`);
|
|
}
|
|
}
|
|
}
|
|
Object.defineProperty(ObsidianFileLoader, "FRONT_MATTER_REGEX", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: /^---\n(.*?)\n---\n/s
|
|
});
|
|
Object.defineProperty(ObsidianFileLoader, "TAG_REGEX", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: /(?:\s|^)#([a-zA-Z_][\w/-]*)/g
|
|
});
|
|
Object.defineProperty(ObsidianFileLoader, "DATAVIEW_LINE_REGEX", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: /^\s*(\w+)::\s*(.*)$/gm
|
|
});
|
|
Object.defineProperty(ObsidianFileLoader, "DATAVIEW_INLINE_BRACKET_REGEX", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: /\[(\w+)::\s*(.*)\]/gm
|
|
});
|
|
Object.defineProperty(ObsidianFileLoader, "DATAVIEW_INLINE_PAREN_REGEX", {
|
|
enumerable: true,
|
|
configurable: true,
|
|
writable: true,
|
|
value: /\((\w+)::\s*(.*)\)/gm
|
|
});
|
|
/**
|
|
* Represents a loader for directories containing Obsidian markdown files. This loader extends
|
|
* the DirectoryLoader and provides functionality to load and parse '.md' files with YAML frontmatter,
|
|
* Obsidian tags, and Dataview fields.
|
|
*/
|
|
export class ObsidianLoader extends DirectoryLoader {
|
|
/**
|
|
* Initializes a new instance of the ObsidianLoader class.
|
|
* @param directoryPath The path to the directory containing Obsidian markdown files.
|
|
* @param encoding The character encoding to use when reading files. Defaults to 'utf-8'.
|
|
* @param collectMetadata Determines whether metadata should be collected from the files. Defaults to true.
|
|
*/
|
|
constructor(directoryPath, options) {
|
|
super(directoryPath, {
|
|
".md": (filePath) => new ObsidianFileLoader(filePath, options),
|
|
}, true, UnknownHandling.Ignore);
|
|
}
|
|
}
|