import ignore, { Ignore } from "ignore";
import binaryExtensions from "binary-extensions";

import { Document } from "../../document.js";
import { BaseDocumentLoader } from "../base.js";
import { UnknownHandling } from "../fs/directory.js";
import { extname } from "../../util/extname.js";
import { getEnvironmentVariable } from "../../util/env.js";

/** Extensions (without the leading dot) listed by the `binary-extensions` package. */
const extensions = new Set(binaryExtensions);

/**
 * Tells whether a file name carries an extension listed in `binary-extensions`.
 * The comparison is case-insensitive.
 *
 * NOTE(review): assumes the project's `extname` helper returns the extension
 * including its leading dot (hence the `slice(1)`) — confirm against the helper.
 */
function isBinaryPath(name: string): boolean {
  return extensions.has(extname(name).slice(1).toLowerCase());
}

/**
 * Shape this loader expects for each entry of the JSON array returned by the
 * GitHub "contents" endpoint queried in `fetchRepoFiles`.
 */
interface GithubFile {
  name: string;
  path: string;
  sha: string;
  size: number;
  url: string;
  html_url: string;
  git_url: string;
  download_url: string;
  type: string;
  _links: {
    self: string;
    git: string;
    html: string;
  };
}

/**
 * Options accepted by {@link GithubRepoLoader}.
 */
export interface GithubRepoLoaderParams {
  /** Git ref sent as the `ref` query parameter. Defaults to `"main"`. */
  branch?: string;
  /** Whether sub-directories are descended into. Defaults to `true`. */
  recursive?: boolean;
  /** How failures are reported (ignore, warn or throw). Defaults to `Warn`. */
  unknown?: UnknownHandling;
  /**
   * Token sent as a `Bearer` Authorization header. Defaults to the
   * `GITHUB_ACCESS_TOKEN` environment variable.
   */
  accessToken?: string;
  /**
   * Files to skip: a string must equal the file path exactly, a RegExp is
   * tested against the path. Not consulted when `ignorePaths` is provided.
   */
  ignoreFiles?: (string | RegExp)[];
  /**
   * Patterns handed to the `ignore` package. When provided, they decide
   * whether non-binary files and directories are skipped.
   */
  ignorePaths?: string[];
}

/**
 * Document loader that walks a GitHub repository through the GitHub contents
 * API and turns every non-ignored file into a `Document` whose
 * `metadata.source` is the file's path inside the repository.
 */
export class GithubRepoLoader
  extends BaseDocumentLoader
  implements GithubRepoLoaderParams
{
  private readonly owner: string;

  private readonly repo: string;

  private readonly initialPath: string;

  /** Headers sent with every request; empty unless an access token is set. */
  private headers: Record<string, string> = {};

  public branch: string;

  public recursive: boolean;

  public unknown: UnknownHandling;

  public accessToken?: string;

  public ignoreFiles: (string | RegExp)[];

  /** Matcher built from `ignorePaths`; `undefined` when none were supplied. */
  public ignore?: Ignore;

  /**
   * @param githubUrl - Repository URL of the form
   *   `https://github.com/<owner>/<repo>[/tree/<ref>/<path>]`. The `<ref>`
   *   segment of the URL is not used; the ref comes from the `branch` option.
   * @param params - See {@link GithubRepoLoaderParams}.
   * @throws Error if `githubUrl` does not match the expected format.
   */
  constructor(
    githubUrl: string,
    {
      accessToken = getEnvironmentVariable("GITHUB_ACCESS_TOKEN"),
      branch = "main",
      recursive = true,
      unknown = UnknownHandling.Warn,
      ignoreFiles = [],
      ignorePaths,
    }: GithubRepoLoaderParams = {}
  ) {
    super();
    const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
    this.owner = owner;
    this.repo = repo;
    this.initialPath = path;
    this.branch = branch;
    this.recursive = recursive;
    this.unknown = unknown;
    this.accessToken = accessToken;
    this.ignoreFiles = ignoreFiles;
    if (ignorePaths) {
      this.ignore = ignore.default().add(ignorePaths);
    }
    if (this.accessToken) {
      this.headers = {
        Authorization: `Bearer ${this.accessToken}`,
      };
    }
  }

  /**
   * Splits a GitHub URL into owner, repository and optional sub-path.
   *
   * @param url - URL to parse.
   * @returns The owner, the repository name and the path following
   *   `/tree/<ref>/` (empty string when absent).
   * @throws Error if the URL does not match.
   */
  private extractOwnerAndRepoAndPath(url: string): {
    owner: string;
    repo: string;
    path: string;
  } {
    // The dot is escaped so that only the literal host "github.com" matches.
    const match = url.match(
      /https:\/\/github\.com\/([^/]+)\/([^/]+)(\/tree\/[^/]+\/(.+))?/i
    );

    if (!match) {
      throw new Error("Invalid GitHub URL format.");
    }

    return { owner: match[1], repo: match[2], path: match[4] || "" };
  }

  /**
   * Loads every non-ignored file reachable from the initial path.
   *
   * @returns The collected documents, in traversal order.
   */
  public async load(): Promise<Document[]> {
    const documents: Document[] = [];
    await this.processDirectory(this.initialPath, documents);
    return documents;
  }

  /**
   * Decides whether an entry must be skipped.
   *
   * Order of checks: binary files are always skipped; if an `ignorePaths`
   * matcher exists it alone decides; otherwise files (never directories) are
   * matched against `ignoreFiles`.
   *
   * @param path - Repository-relative path of the entry.
   * @param fileType - Entry type as reported by the API (`"dir"` for directories).
   * @returns `true` when the entry must be skipped.
   * @throws Error if evaluating an `ignoreFiles` pattern fails.
   */
  protected async shouldIgnore(
    path: string,
    fileType: string
  ): Promise<boolean> {
    if (fileType !== "dir" && isBinaryPath(path)) {
      return true;
    }
    if (this.ignore !== undefined) {
      return this.ignore.ignores(path);
    }
    return (
      fileType !== "dir" &&
      this.ignoreFiles.some((pattern) => {
        if (typeof pattern === "string") {
          return path === pattern;
        }

        try {
          // A RegExp carrying the `g` or `y` flag keeps state in `lastIndex`
          // between calls; reset it so each path is tested from the start.
          pattern.lastIndex = 0;
          return pattern.test(path);
        } catch {
          throw new Error(`Unknown ignore file pattern: ${pattern}`);
        }
      })
    );
  }

  /**
   * Lists `path`, appends a document for every non-ignored file and, when
   * `recursive` is set, descends into non-ignored sub-directories. Failures
   * are routed through `handleError`, so whether they abort the walk depends
   * on the `unknown` setting.
   *
   * @param path - Repository-relative directory path ("" for the root).
   * @param documents - Accumulator that receives the loaded documents.
   */
  private async processDirectory(
    path: string,
    documents: Document[]
  ): Promise<void> {
    try {
      const files = await this.fetchRepoFiles(path);

      for (const file of files) {
        if (!(await this.shouldIgnore(file.path, file.type))) {
          if (file.type !== "dir") {
            try {
              const fileContent = await this.fetchFileContent(file);
              const metadata = { source: file.path };
              documents.push(
                new Document({ pageContent: fileContent, metadata })
              );
            } catch (e) {
              this.handleError(
                `Failed to fetch file content: ${file.path}, ${e}`
              );
            }
          } else if (this.recursive) {
            await this.processDirectory(file.path, documents);
          }
        }
      }
    } catch (error) {
      this.handleError(`Failed to process directory: ${path}, ${error}`);
    }
  }

  /**
   * Fetches the listing of a directory from the GitHub contents API.
   *
   * @param path - Repository-relative directory path.
   * @returns The entries of the directory.
   * @throws Error if the response status is not OK or the payload is not an array.
   */
  private async fetchRepoFiles(path: string): Promise<GithubFile[]> {
    // The ref is encoded so that characters such as "#" or "&" in a branch
    // name cannot truncate or alter the query string.
    const url = `https://api.github.com/repos/${this.owner}/${
      this.repo
    }/contents/${path}?ref=${encodeURIComponent(this.branch)}`;
    const response = await fetch(url, { headers: this.headers });
    const data = await response.json();
    if (!response.ok) {
      throw new Error(
        `Unable to fetch repository files: ${response.status} ${JSON.stringify(
          data
        )}`
      );
    }

    if (!Array.isArray(data)) {
      throw new Error("Unable to fetch repository files.");
    }

    return data as GithubFile[];
  }

  /**
   * Downloads the text content of a file through its `download_url`.
   *
   * @param file - Entry whose content is requested.
   * @returns The response body as text.
   * @throws Error if the response status is not OK, so that an HTTP error
   *   body is never stored as document content.
   */
  private async fetchFileContent(file: GithubFile): Promise<string> {
    const response = await fetch(file.download_url, { headers: this.headers });
    if (!response.ok) {
      // The URL is left out of the message on purpose: it may carry a token.
      throw new Error(
        `Unexpected response status: ${response.status} ${response.statusText}`
      );
    }
    return response.text();
  }

  /**
   * Reports a failure according to the `unknown` setting: silently drop it,
   * log a warning, or throw.
   *
   * @param message - Description of the failure.
   * @throws Error when `unknown` is `Error`, or holds an unrecognised value.
   */
  private handleError(message: string): void {
    switch (this.unknown) {
      case UnknownHandling.Ignore:
        break;
      case UnknownHandling.Warn:
        console.warn(message);
        break;
      case UnknownHandling.Error:
        throw new Error(message);
      default:
        throw new Error(`Unknown unknown handling: ${this.unknown}`);
    }
  }
}