/** * Compaction composition: wires all three layers into the transformContext chain. * * Layer 1 (Microcompaction): tool result trimming at threshold crossings * Layer 2 (Compaction): conversation summarization via LLM * Layer 3 (Failsafe): emergency truncation, purely mechanical * * All three layers run inside transformContext, which fires before every LLM * call. Compaction is fully self-contained within Cortex; no external calls * from the backend are needed to trigger it. Layer 2 fires when token usage * exceeds 70% of the context window and a completeFn + source accessors are * provided. Layer 3 fires whenever tokens exceed 90% of the model's context * window. * * References: * - compaction-strategy.md * - phase-5-compaction.md (5.5) */ import type { AgentMessage, AgentContext } from '../context-manager.js'; import type { CortexLogger, CortexCompactionConfig, AdaptiveThresholdConfig, CompactionResult, CompactionTarget, CompactionDegradedInfo, CompactionExhaustedInfo, } from '../types.js'; import { NOOP_LOGGER } from '../noop-logger.js'; import { estimateTokens } from '../token-estimator.js'; import { MicrocompactionEngine, MICROCOMPACTION_DEFAULTS, extractTextContent, isToolResultMessage, capToolResult, extractToolName, getToolCategory, applyBookend } from './microcompaction.js'; import { runCompaction, shouldCompact, COMPACTION_DEFAULTS, } from './compaction.js'; import type { CompleteFn, BeforeCompactionHandler, PostCompactionHandler, CompactionErrorHandler } from './compaction.js'; import { emergencyTruncate, shouldTruncate, FAILSAFE_DEFAULTS, } from './failsafe.js'; import { ObservationalMemoryEngine } from './observational/index.js'; import type { ObservationalMemoryConfig, ObservationalMemoryState, ObservationEvent, ReflectionEvent } from './observational/types.js'; import { PROVIDER_CACHE_CONFIG, type CacheRetention } from '../provider-registry.js'; // --------------------------------------------------------------------------- // Re-exports for consumer convenience // --------------------------------------------------------------------------- export { MicrocompactionEngine, capToolResult } from './microcompaction.js'; export type { TrimAction, TrimState } from './microcompaction.js'; export { runCompaction, shouldCompact, partitionHistory, buildSummaryMessage } from './compaction.js'; export type { CompleteFn } from './compaction.js'; export { emergencyTruncate, shouldTruncate, isContextOverflow } from './failsafe.js'; export type { FailsafeTruncationResult } from './failsafe.js'; export { ObservationalMemoryEngine } from './observational/index.js'; export type { ObservationalMemoryConfig, ObservationalMemoryState, ObservationChunk, ObservationEvent, ReflectionEvent, RecallResult, RecallConfig } from './observational/types.js'; export { createRecallTool } from './observational/recall-tool.js'; // computeAdaptiveThreshold is defined below in this file and exported at the declaration site // --------------------------------------------------------------------------- // Default config // --------------------------------------------------------------------------- export const ADAPTIVE_DEFAULTS: AdaptiveThresholdConfig = { enabled: true, recentWindowMs: 300_000, // 5 minutes idleWindowMs: 1_800_000, // 30 minutes recentReduction: 0.0, // no change when recent moderateReduction: 0.10, // lower threshold by 0.10 when moderately idle idleReduction: 0.20, // lower threshold by 0.20 when fully idle }; export const DEFAULT_COMPACTION_CONFIG: CortexCompactionConfig = { microcompaction: MICROCOMPACTION_DEFAULTS, compaction: COMPACTION_DEFAULTS, failsafe: FAILSAFE_DEFAULTS, adaptive: ADAPTIVE_DEFAULTS, }; /** * Build a full compaction config from partial overrides. */ export function buildCompactionConfig( partial?: Partial, ): CortexCompactionConfig { if (!partial) return DEFAULT_COMPACTION_CONFIG; const config: CortexCompactionConfig = { microcompaction: { ...MICROCOMPACTION_DEFAULTS, ...partial.microcompaction, }, compaction: { ...COMPACTION_DEFAULTS, ...partial.compaction, }, failsafe: { ...FAILSAFE_DEFAULTS, ...partial.failsafe, }, adaptive: { ...ADAPTIVE_DEFAULTS, ...partial.adaptive, }, }; if (partial.strategy !== undefined) { config.strategy = partial.strategy; } if (partial.observational !== undefined) { config.observational = partial.observational; } return config; } // --------------------------------------------------------------------------- // Adaptive threshold calculation // --------------------------------------------------------------------------- /** * Compute the effective Layer 2 compaction threshold adjusted by interaction * recency. When the user has not interacted recently, the threshold is lowered * (i.e., compaction fires sooner), reducing token costs for idle sessions. * * @param baseThreshold - The configured Layer 2 threshold (e.g., 0.70) * @param adaptiveConfig - Adaptive threshold configuration * @param lastInteractionTime - Timestamp (ms) of the last user interaction, or null if never * @param now - Current timestamp (ms), injectable for testing * @returns The adjusted threshold (always >= 0) */ export function computeAdaptiveThreshold( baseThreshold: number, adaptiveConfig: AdaptiveThresholdConfig, lastInteractionTime: number | null, now: number = Date.now(), ): number { if (!adaptiveConfig.enabled) { return baseThreshold; } // No interaction recorded yet: treat as fully idle if (lastInteractionTime === null) { return Math.max(0, baseThreshold - adaptiveConfig.idleReduction); } const elapsed = now - lastInteractionTime; if (elapsed < adaptiveConfig.recentWindowMs) { // Recent interaction: apply recentReduction (default 0, no change) return Math.max(0, baseThreshold - adaptiveConfig.recentReduction); } if (elapsed < adaptiveConfig.idleWindowMs) { // Moderate idle: apply moderateReduction return Math.max(0, baseThreshold - adaptiveConfig.moderateReduction); } // Fully idle: apply idleReduction return Math.max(0, baseThreshold - adaptiveConfig.idleReduction); } // --------------------------------------------------------------------------- // CompactionManager // --------------------------------------------------------------------------- /** * CompactionManager orchestrates all three compaction layers. * * It is stateful: it tracks the current token count and the microcompaction * cache. The CortexAgent creates one instance and delegates all compaction * decisions to it. Compaction is fully autonomous: all three layers run * inside applyInTransformContext(), which fires before every LLM call. */ export class CompactionManager { private readonly config: CortexCompactionConfig; private readonly microcompaction: MicrocompactionEngine; private readonly slotCount: number; private readonly _strategy: 'observational' | 'classic'; private observationalEngine: ObservationalMemoryEngine | null = null; /** Post-hoc current-context token count, updated after each parent LLM call. */ private _currentContextTokenCount = 0; /** Context budget for Layer 1/2 compaction decisions (may be artificially limited). */ private _contextWindow = 0; /** Actual model context window for Layer 3 failsafe (never artificially limited). */ private _modelContextWindow = 0; /** * Timestamp (ms) of the last user interaction. Used by the adaptive * threshold system to decide how aggressively to compact. Updated by * the consumer (backend) when a message-triggered tick fires. * Null means no interaction has been recorded yet. */ private _lastInteractionTime: number | null = null; /** * Timestamp (ms) of the last LLM call. Used by L1 to decide whether the * prompt cache has gone cold. Updated automatically in * updateCurrentContextTokenCount() (which fires after every LLM response). * Null means no LLM call has been recorded yet (treated as cold). */ private _lastLlmCallTimestamp: number | null = null; /** * Effective cache TTL (ms) for the current provider + cache retention. * Zero means caching is unsupported or disabled, in which case L1 treats * the cache as perpetually cold (trim freely). Set via setCacheInfo(). */ private _providerCacheTtlMs = 0; /** Consumer handlers for compaction lifecycle events. */ private beforeCompactionHandlers: BeforeCompactionHandler[] = []; private postCompactionHandlers: PostCompactionHandler[] = []; private compactionErrorHandlers: CompactionErrorHandler[] = []; private compactionResultHandlers: Array<(result: CompactionResult) => void> = []; private compactionDegradedHandlers: Array<(info: CompactionDegradedInfo) => void> = []; private compactionExhaustedHandlers: Array<(info: CompactionExhaustedInfo) => void> = []; /** Consecutive Layer 2 failure count for circuit breaker. Reset on success. */ private _consecutiveLayer2Failures = 0; /** LLM completion function, set by CortexAgent. */ private completeFn: CompleteFn | null = null; /** Logger for compaction diagnostics. */ private logger: CortexLogger = NOOP_LOGGER; constructor( config: CortexCompactionConfig, slotCount: number, ) { this.config = config; this.slotCount = slotCount; this.microcompaction = new MicrocompactionEngine(config.microcompaction); this._strategy = config.strategy ?? 'observational'; if (this._strategy === 'observational') { this.observationalEngine = new ObservationalMemoryEngine( config.observational ?? {}, slotCount - 1, ); } } // ----------------------------------------------------------------------- // Configuration // ----------------------------------------------------------------------- /** Get the compaction strategy. */ get strategy(): 'observational' | 'classic' { return this._strategy; } /** * Set the context budget (the effective limit for Layer 1/2 compaction). * This may be smaller than the model's actual context window when a * user-configured limit is applied. */ setContextWindow(contextWindow: number): void { this._contextWindow = contextWindow; this.observationalEngine?.setContextWindow(contextWindow); } /** * Set the model's actual context window (for Layer 3 failsafe only). * Layer 3 emergency truncation uses this to avoid dropping messages * when the model still has capacity, even if the user-configured * budget has been exceeded. * * Also used as a proxy for the utility model context window until the * actual utility model window is set via setUtilityModelContextWindow(). */ setModelContextWindow(modelContextWindow: number): void { this._modelContextWindow = modelContextWindow; this.observationalEngine?.setUtilityModelContextWindow(modelContextWindow); } /** * Set the LLM completion function for Layer 2 summarization. */ setCompleteFn(fn: CompleteFn): void { this.completeFn = fn; } /** * Set the LLM completion function for observational memory (utility model). */ setObservationalCompleteFn(fn: CompleteFn): void { this.observationalEngine?.setCompleteFn(fn); } /** * Update the utility model context window for observer/reflector clamps. */ setUtilityModelContextWindow(utilityModelContextWindow: number): void { this.observationalEngine?.setUtilityModelContextWindow(utilityModelContextWindow); } /** * Set a logger for compaction diagnostics. */ setLogger(logger: CortexLogger): void { this.logger = logger; this.observationalEngine?.setLogger(logger); } /** * Signal when the user last interacted with the system. * The consumer (backend) calls this during GATHER when a message-triggered * tick fires. For interval ticks, it is not called, so the timestamp * naturally ages. */ setLastInteractionTime(timestamp: number): void { this._lastInteractionTime = timestamp; } /** * Get the timestamp of the last user interaction, or null if none recorded. */ get lastInteractionTime(): number | null { return this._lastInteractionTime; } /** * Set the active provider and cache retention. Resolves the effective * cache TTL from PROVIDER_CACHE_CONFIG and stores it for L1's cache-aware * gating. Called by CortexAgent at construction, on provider changes, and * on cache retention changes. * * @param provider - The active provider name (e.g., "anthropic", "openai") * @param cacheRetention - The configured cache retention ('none' | 'short' | 'long') */ setCacheInfo(provider: string, cacheRetention: CacheRetention): void { const cfg = PROVIDER_CACHE_CONFIG[provider]; if (!cfg || !cfg.supported || cacheRetention === 'none') { this._providerCacheTtlMs = 0; return; } this._providerCacheTtlMs = cacheRetention === 'long' ? cfg.longTtlMs : cfg.shortTtlMs; } /** * Check whether the prompt cache has gone cold (or is unused). * * Returns true when: * - Caching is unsupported / disabled (TTL <= 0), OR * - No LLM call has been recorded yet, OR * - The elapsed time since the last LLM call >= the cache TTL. * * @param now - Current timestamp (ms), injectable for testing */ isCacheCold(now: number = Date.now()): boolean { if (this._providerCacheTtlMs <= 0) return true; if (this._lastLlmCallTimestamp === null) return true; return (now - this._lastLlmCallTimestamp) >= this._providerCacheTtlMs; } /** * Get the effective cache TTL (ms) for the current provider + retention. * Zero means caching is unsupported or disabled. */ get providerCacheTtlMs(): number { return this._providerCacheTtlMs; } /** * Get the timestamp of the last LLM call, or null if none recorded. */ get lastLlmCallTimestamp(): number | null { return this._lastLlmCallTimestamp; } /** * Compute the effective Layer 2 compaction threshold, adjusted for * interaction recency when adaptive thresholds are enabled. * * @param now - Current timestamp (ms), injectable for testing */ getEffectiveThreshold(now?: number): number { return computeAdaptiveThreshold( this.config.compaction.threshold, this.config.adaptive, this._lastInteractionTime, now, ); } // ----------------------------------------------------------------------- // Token Tracking // ----------------------------------------------------------------------- /** * Update the post-hoc current-context token count from LLM usage data. */ updateCurrentContextTokenCount(inputTokens: number): void { const prev = this._currentContextTokenCount; this._currentContextTokenCount = inputTokens; // Track the LLM call timestamp so L1 can decide whether the prompt cache // is still warm. updateCurrentContextTokenCount() is called after every // parent LLM call, so this is the natural point to record it. this._lastLlmCallTimestamp = Date.now(); this.logger.debug('[Compaction] updateCurrentContextTokenCount', { prev, inputTokens }); // Log significant drops to help diagnose token count display issues if (prev > 0 && inputTokens < prev * 0.5) { this.logger.warn('[Compaction] currentContextTokenCount dropped >50%', { prev, inputTokens, drop: `${((1 - inputTokens / prev) * 100).toFixed(1)}%`, }); } } /** * Get the post-hoc current-context token count from the most recent parent turn. */ get currentContextTokenCount(): number { return this._currentContextTokenCount; } /** * Get the context budget (effective limit for Layer 1/2). */ get contextWindow(): number { return this._contextWindow; } /** * Get the model's actual context window (for Layer 3 failsafe). */ get modelContextWindow(): number { return this._modelContextWindow; } /** * Get the current context usage ratio. */ get usageRatio(): number { if (this._contextWindow <= 0) return 0; return this._currentContextTokenCount / this._contextWindow; } /** * Estimate current context tokens from a transformed AgentContext snapshot. * * Returns the larger of: * - the heuristic estimate of the provided context snapshot * - the post-hoc token count from the most recent parent turn * * This mirrors the compaction decision logic so consumers can reason about * context pressure using the same semantics Cortex uses internally. */ estimateCurrentContextTokens(context: AgentContext): number { const estimated = this.estimateContextTokens(context); return this._currentContextTokenCount > 0 ? Math.max(this._currentContextTokenCount, estimated) : estimated; } // ----------------------------------------------------------------------- // Event Handlers // ----------------------------------------------------------------------- /** * Register a handler called before compaction starts (awaited). */ onBeforeCompaction(handler: BeforeCompactionHandler): void { this.beforeCompactionHandlers.push(handler); } /** * Register a handler called after compaction completes. */ onPostCompaction(handler: PostCompactionHandler): void { this.postCompactionHandlers.push(handler); } /** * Register a handler called if compaction fails. */ onCompactionError(handler: CompactionErrorHandler): void { this.compactionErrorHandlers.push(handler); } /** * Register a handler that receives the CompactionResult (for CortexAgent event emission). */ onCompactionResult(handler: (result: CompactionResult) => void): void { this.compactionResultHandlers.push(handler); } /** * Register a handler called when Layer 2 failed and Layer 3 was used as fallback. */ onCompactionDegraded(handler: (info: CompactionDegradedInfo) => void): void { this.compactionDegradedHandlers.push(handler); } /** * Register a handler called when all compaction layers have failed. */ onCompactionExhausted(handler: (info: CompactionExhaustedInfo) => void): void { this.compactionExhaustedHandlers.push(handler); } // ----------------------------------------------------------------------- // Observational Memory // ----------------------------------------------------------------------- /** * Called at turn_end to trigger async buffer checks. */ onTurnEnd(totalTokens: number, contextWindow: number, messages: AgentMessage[], slotCount: number): void { this.observationalEngine?.onTurnEnd(totalTokens, contextWindow, messages, slotCount); } /** * Register observation event handler. */ onObservation(handler: (event: ObservationEvent) => void): void { this.observationalEngine?.onObservation(handler); } /** * Register reflection event handler. */ onReflection(handler: (event: ReflectionEvent) => void): void { this.observationalEngine?.onReflection(handler); } /** * Get observational memory state for persistence. */ getObservationalMemoryState(): ObservationalMemoryState | null { return this.observationalEngine?.getState() ?? null; } /** * Restore observational memory state from a previous session. * * @param state - the persisted observational memory state * @param historyLength - length of the restored post-slot conversation * history, used to clamp the buffer watermark against drift */ restoreObservationalMemoryState(state: ObservationalMemoryState, historyLength?: number): void { this.observationalEngine?.restoreState(state, historyLength); } /** * Force a synchronous observation cycle. */ async triggerObservation(messages: AgentMessage[], slotCount: number): Promise { await this.observationalEngine?.triggerObservation(messages, slotCount); } /** * Get the observation slot content string (for ContextManager.setSlot). */ getObservationSlotContent(): string { return this.observationalEngine?.getSlotContent() ?? ''; } /** * Whether observations have been produced (non-empty observation text). */ hasObservations(): boolean { return (this.observationalEngine?.getObservations() ?? '').length > 0; } /** * Whether the recall tool should be registered. */ hasRecallTool(): boolean { return this.observationalEngine?.hasRecall() ?? false; } /** * Get the recall config if available. */ getRecallConfig() { return this.observationalEngine?.getRecallConfig(); } /** * Current token count of activated observations only. * Returns 0 when not using the observational strategy. */ getObservationTokenCount(): number { return this.observationalEngine?.getObservationTokenCount() ?? 0; } /** * Whether the observer or reflector is currently running in the background. * Returns false when not using the observational strategy. */ isObservationalProcessing(): boolean { return this.observationalEngine?.isProcessing() ?? false; } /** * Whether the observer specifically is in-flight. */ isObserverInFlight(): boolean { return this.observationalEngine?.isObserverInFlight() ?? false; } /** * Whether the reflector specifically is in-flight. */ isReflectorInFlight(): boolean { return this.observationalEngine?.isReflectorInFlight() ?? false; } // ----------------------------------------------------------------------- // Insertion-time cap // ----------------------------------------------------------------------- /** * Cap a tool result at insertion time (before it enters conversation history). */ capToolResult(content: string): string { return this.microcompaction.capAtInsertion(content); } /** * Apply insertion-time cap to all uncapped tool results in the source * messages array (mutates in place). * * Called from the transformContext hook on `agent.state.messages` so that * Tier 1 capping is automatically applied when tool results enter * conversation history through pi-agent-core's internal tool execution * loop. The cap is applied at most once per tool result part; already * capped content (containing the insertion marker) is skipped. * * @param messages - The source messages array (mutated in place) * @param slotCount - Number of slot messages to skip at the start */ async applyInsertionCap(messages: AgentMessage[], slotCount: number): Promise { const config = this.microcompaction.getConfig(); // Phase 1: Individual per-result cap for (let i = slotCount; i < messages.length; i++) { const msg = messages[i]!; if (!isToolResultMessage(msg)) continue; if (typeof msg.content === 'string') continue; let modified = false; const newContent = msg.content.map(part => { const isLegacyToolResult = part.type === 'tool_result' && typeof part.text === 'string'; const isRuntimeToolResultText = msg.role === 'toolResult' && part.type === 'text' && typeof part.text === 'string'; if (!isLegacyToolResult && !isRuntimeToolResultText) { return part; } // Skip already-capped content if ((part.text as string).includes('tokens trimmed at insertion')) { return part; } const capped = capToolResult(part.text as string, config); if (capped !== part.text) { modified = true; return { ...part, text: capped }; } return part; }); if (modified) { messages[i] = { ...msg, content: newContent }; } } // Phase 2: Aggregate per-message budget const aggregateLimit = config.maxAggregateTurnTokens ?? 150_000; if (aggregateLimit <= 0) return; for (let i = slotCount; i < messages.length; i++) { const msg = messages[i]!; if (!isToolResultMessage(msg)) continue; if (typeof msg.content === 'string') continue; const parts = msg.content; const partInfos: Array<{ index: number; tokens: number; text: string; toolName: string }> = []; let totalTokens = 0; for (let p = 0; p < parts.length; p++) { const part = parts[p]!; const isLegacyToolResult = part.type === 'tool_result' && typeof part.text === 'string'; const isRuntimeToolResultText = msg.role === 'toolResult' && part.type === 'text' && typeof part.text === 'string'; if (!isLegacyToolResult && !isRuntimeToolResultText) { continue; } const text = part.text as string; const tokens = estimateTokens(text); const name = (typeof (part as Record)['name'] === 'string' ? (part as Record)['name'] as string : null) ?? extractToolName(msg) ?? 'unknown'; partInfos.push({ index: p, tokens, text, toolName: name }); totalTokens += tokens; } if (totalTokens <= aggregateLimit) continue; const sorted = [...partInfos].sort((a, b) => b.tokens - a.tokens); const newParts = [...parts]; let currentTotal = totalTokens; for (const info of sorted) { if (currentTotal <= aggregateLimit) break; if (info.tokens <= config.maxResultTokens / 2) break; const part = newParts[info.index]!; let replacement: string; if (config.persistResult) { const category = getToolCategory(info.toolName, config.toolCategories); try { const path = await config.persistResult(info.text, { toolName: info.toolName, messageIndex: i, category: category ?? 'rereadable', }); const bookended = applyBookend(info.text, config.bookendMaxChars, config.bookendMaxChars, info.tokens); replacement = `${bookended}\n\n[Full content persisted to ${path} -- use Read to access]`; } catch { replacement = applyBookend(info.text, config.bookendMaxChars, config.bookendMaxChars, info.tokens); } } else { replacement = applyBookend(info.text, config.bookendMaxChars, config.bookendMaxChars, info.tokens); } const newTokens = estimateTokens(replacement); currentTotal = currentTotal - info.tokens + newTokens; newParts[info.index] = { ...part, text: replacement }; } messages[i] = { ...msg, content: newParts }; } } // ----------------------------------------------------------------------- // transformContext hook // ----------------------------------------------------------------------- /** * Apply compaction layers to the context in transformContext. * * This is the main entry point called from CortexAgent.getTransformContextHook(). * It is fully self-contained: all three compaction layers are integrated here, * triggered autonomously based on token thresholds. No external calls from * the backend are needed to trigger compaction. * * Execution order: * 1. Layer 1 (microcompaction): tool result trimming at threshold crossings * 2. Layer 2 (summarization): if tokens exceed 70% after Layer 1, run LLM * summarization on the source transcript, then rebuild context from the * updated messages * 3. Layer 3 (failsafe): if tokens still exceed 90% after Layers 1-2, * emergency truncation drops the oldest turns * * @param context - The AgentContext from transformContext * @param getHistory - Function to get conversation history from the context * @param setHistory - Function to set conversation history in the context * @param getSourceHistory - Function to get the original source transcript history (post-slot) * @param setSourceHistory - Function to replace the original source transcript history * @returns Modified context with compacted history */ async applyInTransformContext( context: AgentContext, getHistory: (ctx: AgentContext) => AgentMessage[], setHistory: (ctx: AgentContext, history: AgentMessage[]) => AgentContext, getSourceHistory?: () => AgentMessage[], setSourceHistory?: (history: AgentMessage[]) => void, ): Promise { if (this._contextWindow <= 0) { // contextWindow not set, skip compaction return context; } let history = getHistory(context); if (history.length === 0) { return context; } // Use the current transformed context estimate as a first-class input. // Post-hoc token tracking from the previous turn is useful, but it can be // stale when transformContext injects large ephemeral content on this turn. const estimatedCurrentTokens = this.estimateContextTokens(context); const currentTokens = this.estimateCurrentContextTokens(context); this.logger.debug('[Compaction] transformContext', { historyLen: history.length, currentContextTokens: this._currentContextTokenCount, heuristic: estimatedCurrentTokens, currentTokens, ctxWindow: this._contextWindow, }); // Compute utilization and slot tokens (shared by both strategies and L3) const originalHistoryTokens = this.estimateHistoryTokens(getHistory(context)); const slotTokens = Math.max(0, currentTokens - originalHistoryTokens); const utilization = this._contextWindow > 0 ? currentTokens / this._contextWindow : 0; let layer2Failed = false; let lastLayer2Error: Error | undefined; let effectiveThreshold = 0; const cacheCold = this.isCacheCold(); if (this._strategy === 'observational' && this.observationalEngine && getSourceHistory && setSourceHistory) { // Observational memory path: observer/reflector handle conversation // compression. L2 summarization is skipped, but L1 still runs in // cache-aware mode on the unobserved tail to trim large tool results // before they hit the LLM. context = await this.observationalEngine.applyInTransformContext( context, utilization, this.slotCount, getHistory, setHistory, getSourceHistory, setSourceHistory, ); history = getHistory(context); // Run L1 on the surviving (post-observation) history. Cache-aware // gating ensures we only trim when the prompt cache has gone cold, // preserving cache hits during active use. Re-estimate from the // updated context so the observation slot's new size is reflected. const postObsTotal = this.estimateCurrentContextTokens(context); const trimmedHistory = await this.microcompaction.apply( history, this._contextWindow, postObsTotal, { cacheCold }, ); if (trimmedHistory !== history) { context = setHistory(context, trimmedHistory); history = trimmedHistory; } } else { // Classic path: L1 + L2 // Layer 1: Microcompaction. Cache-aware gating: only trims when the // prompt cache is cold (or unsupported). When warm, returns history // untouched to preserve cache hits. history = await this.microcompaction.apply( history, this._contextWindow, currentTokens, { cacheCold }, ); // Layer 2: Conversation summarization (70% threshold) // Operates on the original source transcript, not the in-memory // microcompacted context. After Layer 2 modifies the source, we rebuild // the context from the updated messages. const postMicroTokens = this.estimateHistoryTokens(history); const totalAfterMicro = slotTokens + postMicroTokens; effectiveThreshold = this.getEffectiveThreshold(); this.logger.debug('[Compaction] Layer2 evaluation', { totalAfterMicro, threshold: effectiveThreshold, ratio: totalAfterMicro / this._contextWindow, completeFn: !!this.completeFn, srcAccessors: !!getSourceHistory && !!setSourceHistory, shouldCompact: shouldCompact(totalAfterMicro, this._contextWindow, effectiveThreshold), }); if ( this.completeFn && getSourceHistory && setSourceHistory && shouldCompact(totalAfterMicro, this._contextWindow, effectiveThreshold) ) { const maxRetries = this.config.compaction.maxRetries ?? 3; const retryDelayMs = this.config.compaction.retryDelayMs ?? 2000; let succeeded = false; for (let attempt = 1; attempt <= maxRetries; attempt++) { try { const sourceHistory = getSourceHistory(); if (sourceHistory.length === 0) break; const { newHistory: compactedSource, result } = await runCompaction( sourceHistory, this.config.compaction, this.completeFn, { onBeforeCompaction: this.beforeCompactionHandlers, onPostCompaction: this.postCompactionHandlers, onCompactionError: this.compactionErrorHandlers, }, currentTokens, // pass actual full-context token count for accurate reporting ); // Success: update state and reset failure counter setSourceHistory(compactedSource); this.microcompaction.resetCache(); // result.tokensAfter now includes overhead (system prompt, slots, // tool definitions) since we passed actualContextTokens to // runCompaction. Use it directly to prevent the stale low value // that would cause re-triggering compaction on the next call. this._currentContextTokenCount = result.tokensAfter; this._consecutiveLayer2Failures = 0; for (const handler of this.compactionResultHandlers) { try { handler(result); } catch (err) { this.logger.error('[Compaction] compactionResult handler threw', { error: err instanceof Error ? err.message : String(err), }); } } // Rebuild context from updated source. L2 just rewrote history // wholesale, so any existing cache prefix is invalidated; treat as // cold so L1 can trim the rebuilt history if warranted. history = await this.microcompaction.apply( compactedSource, this._contextWindow, this._currentContextTokenCount, { cacheCold: true }, ); succeeded = true; break; } catch (err) { this._consecutiveLayer2Failures++; lastLayer2Error = err instanceof Error ? err : new Error(String(err)); this.logger.warn('[Compaction] Layer2 retry failed', { attempt, maxRetries, error: lastLayer2Error.message, }); if (attempt < maxRetries) { await new Promise(resolve => setTimeout(resolve, retryDelayMs)); } } } if (!succeeded) { layer2Failed = true; } } } // Layer 3: Emergency truncation (90% of model context window) // Uses the MODEL's actual context window, not the budget. Emergency // truncation should only fire when we're near the model's real limit, // not the user's artificial budget. Layer 1/2 handle the budget. // When observational memory is active, L3 operates on the post-slot // history (raw messages only). The observation slot lives in the slot // region and is naturally protected by slotCount. { const failsafeWindow = this._modelContextWindow > 0 ? this._modelContextWindow : this._contextWindow; const postLayerTokens = this.estimateHistoryTokens(history); const totalNow = slotTokens + postLayerTokens; if (shouldTruncate(totalNow, failsafeWindow, this.config.failsafe.threshold)) { // Force sync observation before L3 truncation to capture unobserved // content before it is dropped. The source history from getSourceHistory // is already post-slot, so pass 0 as slotCount. if (this._strategy === 'observational' && this.observationalEngine && getSourceHistory) { const sourceHistory = getSourceHistory(); await this.observationalEngine.triggerObservation(sourceHistory, 0); } const truncResult = emergencyTruncate( history, failsafeWindow, slotTokens, this.config.failsafe.threshold, ); history = truncResult.newHistory; // Emit degraded event if Layer 3 was used as fallback for Layer 2 failure if (layer2Failed) { const failures = this._consecutiveLayer2Failures; this._consecutiveLayer2Failures = 0; for (const handler of this.compactionDegradedHandlers) { try { handler({ layer2Failures: failures, turnsDropped: truncResult.turnsRemoved, }); } catch (err) { this.logger.error('[Compaction] compactionDegraded handler threw', { error: err instanceof Error ? err.message : String(err), }); } } } } else if (layer2Failed) { // Layer 2 failed but Layer 3 didn't need to run. If tokens are still // over the Layer 2 budget, emit exhausted so the consumer can act. const postTokens = this.estimateHistoryTokens(history); const stillOverBudget = shouldCompact(slotTokens + postTokens, this._contextWindow, effectiveThreshold); if (stillOverBudget) { const failures = this._consecutiveLayer2Failures; this._consecutiveLayer2Failures = 0; for (const handler of this.compactionExhaustedHandlers) { try { handler({ error: lastLayer2Error ?? new Error('Layer 2 compaction failed'), layer2Failures: failures, }); } catch (err) { this.logger.error('[Compaction] compactionExhausted handler threw', { error: err instanceof Error ? err.message : String(err), }); } } } } } return setHistory(context, history); } // ----------------------------------------------------------------------- // End-of-tick compaction check // ----------------------------------------------------------------------- /** * Manually check if compaction is needed and run it. * * This is a convenience API for consumers who want to trigger compaction * outside the agentic loop (e.g., for testing or manual maintenance). * The primary compaction trigger is `applyInTransformContext`, which runs * automatically before every LLM call. * * @param getHistory - Get current conversation history * @param setHistory - Replace conversation history * @returns CompactionResult if compaction ran, null otherwise */ async checkAndRunCompaction( getHistory: () => AgentMessage[], setHistory: (history: AgentMessage[]) => void, ): Promise { if (this._contextWindow <= 0) return null; const history = getHistory(); if (history.length === 0) return null; const estimatedTokens = this.estimateHistoryTokens(history); // Use adaptive threshold (adjusts based on interaction recency) const effectiveThreshold = this.getEffectiveThreshold(); // Check Layer 2 threshold if (!shouldCompact(this._currentContextTokenCount, this._contextWindow, effectiveThreshold)) { // Also check using heuristic estimation as fallback if (!shouldCompact(estimatedTokens, this._contextWindow, effectiveThreshold)) { return null; } } // Attempt Layer 2 (summarization) if (this.completeFn) { try { const actualTokens = Math.max(this._currentContextTokenCount, estimatedTokens); const { newHistory, result } = await runCompaction( history, this.config.compaction, this.completeFn, { onBeforeCompaction: this.beforeCompactionHandlers, onPostCompaction: this.postCompactionHandlers, onCompactionError: this.compactionErrorHandlers, }, actualTokens, // pass full-context token count for accurate reporting ); setHistory(newHistory); this.microcompaction.resetCache(); // result.tokensAfter includes overhead since we passed actualTokens this._currentContextTokenCount = result.tokensAfter; // Emit result for (const handler of this.compactionResultHandlers) { try { handler(result); } catch { // Swallow handler errors } } return result; } catch { // Layer 2 failed, fall through to Layer 3 } } // Layer 3 fallback: emergency truncation (uses model's actual window) const failsafeWindow = this._modelContextWindow > 0 ? this._modelContextWindow : this._contextWindow; const slotTokens = this._currentContextTokenCount - estimatedTokens; if (shouldTruncate(this._currentContextTokenCount, failsafeWindow, this.config.failsafe.threshold)) { const result = emergencyTruncate( history, failsafeWindow, Math.max(0, slotTokens), this.config.failsafe.threshold, ); setHistory(result.newHistory); this.microcompaction.resetCache(); this._currentContextTokenCount = result.tokensAfter; } return null; } // ----------------------------------------------------------------------- // Reactive overflow handling // ----------------------------------------------------------------------- /** * Handle a context overflow error by performing emergency truncation. * Called when the API returns a context overflow error. * * @param getHistory - Get current conversation history * @param setHistory - Replace conversation history */ handleOverflowError( getHistory: () => AgentMessage[], setHistory: (history: AgentMessage[]) => void, ): void { const history = getHistory(); if (history.length === 0) return; // API returned overflow error, so use the model's actual window const failsafeWindow = this._modelContextWindow > 0 ? this._modelContextWindow : this._contextWindow; const estimatedTokens = this.estimateHistoryTokens(history); const slotTokens = Math.max(0, this._currentContextTokenCount - estimatedTokens); const result = emergencyTruncate( history, failsafeWindow, slotTokens, this.config.failsafe.threshold, ); setHistory(result.newHistory); this.microcompaction.resetCache(); this._currentContextTokenCount = result.tokensAfter; } // ----------------------------------------------------------------------- // Cleanup // ----------------------------------------------------------------------- /** * Clear all state and handlers. */ destroy(): void { this.microcompaction.resetCache(); this.observationalEngine?.abort(); this.observationalEngine = null; this.beforeCompactionHandlers = []; this.postCompactionHandlers = []; this.compactionErrorHandlers = []; this.compactionResultHandlers = []; this.compactionDegradedHandlers = []; this.compactionExhaustedHandlers = []; this.completeFn = null; this._currentContextTokenCount = 0; this._consecutiveLayer2Failures = 0; this._lastInteractionTime = null; } // ----------------------------------------------------------------------- // Internal helpers // ----------------------------------------------------------------------- /** * Estimate tokens for a set of history messages. */ private estimateHistoryTokens(history: AgentMessage[]): number { return estimateTokens( history.map(m => extractTextContent(m)).join('\n'), ); } /** * Estimate total context tokens from an AgentContext object. */ private estimateContextTokens(context: AgentContext): number { let total = estimateTokens(context.systemPrompt); for (const msg of context.messages) { total += estimateTokens(extractTextContent(msg)); } return total; } }