import { htmlToText } from "html-to-text";
import { Document } from "../../document.js";
import { BaseDocumentLoader } from "../base.js";

/**
 * Constructor options for {@link ConfluencePagesLoader}.
 */
export interface ConfluencePagesLoaderParams {
  /** URL prefix that `/rest/api/content` and `/spaces/...` are appended to. */
  baseUrl: string;
  /** Key of the space to load; sent as the `spaceKey` query parameter. */
  spaceKey: string;
  /** User name half of the HTTP Basic credentials. */
  username: string;
  /** Secret half of the HTTP Basic credentials. */
  accessToken: string;
  /** Value of the `limit` query parameter on each request. Defaults to 25. */
  limit?: number;
}

/**
 * The subset of a Confluence content item that this loader reads.
 */
export interface ConfluencePage {
  id: string;
  title: string;
  body: {
    storage: {
      /** Page markup; it is fed to `htmlToText` when building a document. */
      value: string;
    };
  };
}

/**
 * The subset of a content-listing response that this loader reads.
 */
export interface ConfluenceAPIResponse {
  /** Added to the `start` offset to request the next batch; 0 ends paging. */
  size: number;
  results: ConfluencePage[];
}

/**
 * Loads every page of one Confluence space and converts each page into a
 * plain-text {@link Document}.
 */
export class ConfluencePagesLoader extends BaseDocumentLoader {
  public readonly baseUrl: string;

  public readonly spaceKey: string;

  public readonly username: string;

  public readonly accessToken: string;

  public readonly limit: number;

  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
  }: ConfluencePagesLoaderParams) {
    super();
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
  }

  /**
   * Fetches all pages in the configured space and maps them to documents.
   *
   * This is best-effort: any error raised while fetching or converting is
   * logged with `console.error` and an empty array is returned instead.
   *
   * @returns One document per fetched page, or `[]` if anything failed.
   */
  public async load(): Promise<Document[]> {
    try {
      const pages = await this.fetchAllPagesInSpace();
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Performs one authenticated GET against `url` and returns the parsed JSON
   * body.
   *
   * @param url - Fully built request URL.
   * @returns The response body, parsed as JSON (not validated).
   * @throws Error when the request fails, the status is not OK, or the body
   * cannot be parsed. The non-OK error is thrown inside the `try`, so it is
   * caught and wrapped again by the `catch` below.
   */
  protected async fetchConfluenceData(
    url: string
  ): Promise<ConfluenceAPIResponse> {
    try {
      // HTTP Basic credentials: base64 of "username:accessToken".
      const authToken = Buffer.from(
        `${this.username}:${this.accessToken}`
      ).toString("base64");

      const response = await fetch(url, {
        headers: {
          Authorization: `Basic ${authToken}`,
          "Content-Type": "application/json",
          Accept: "application/json",
        },
      });

      if (!response.ok) {
        throw new Error(
          `Failed to fetch ${url} from Confluence: ${response.status}`
        );
      }

      return await response.json();
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`);
    }
  }

  /**
   * Collects every page of the space by requesting consecutive batches until
   * a response reports `size === 0`.
   *
   * Implemented as a loop that appends to a single array, so the work per
   * batch does not grow with the number of batches already fetched.
   *
   * @param start - Offset of the first item to request.
   * @returns All pages from `start` onwards, in the order they were returned.
   */
  private async fetchAllPagesInSpace(start = 0): Promise<ConfluencePage[]> {
    const pages: ConfluencePage[] = [];
    let nextStart = start;

    for (;;) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${this.limit}&start=${nextStart}&expand=body.storage`;
      const data = await this.fetchConfluenceData(url);

      // Only an empty batch ends pagination; a short batch does not.
      if (data.size === 0) {
        return pages;
      }

      pages.push(...data.results);
      nextStart += data.size;
    }
  }

  /**
   * Builds a document from one page: the body is converted to text, blank
   * lines are removed, and the title and page URL are stored as metadata.
   *
   * @param page - Page as returned by the content API.
   * @returns A document whose `metadata` holds `title` and `url`.
   */
  private createDocumentFromPage(page: ConfluencePage): Document {
    // Convert the HTML content to plain text
    const plainTextContent = htmlToText(page.body.storage.value, {
      wordwrap: false,
      preserveNewlines: false,
    });

    // Remove empty (whitespace-only) lines
    const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, "");

    // Generate the URL
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    // Return a langchain document
    return new Document({
      pageContent: textWithoutEmptyLines,
      metadata: {
        title: page.title,
        url: pageUrl,
      },
    });
  }
}