/**
 * Daemon Manager
 *
 * Manages the single global Photon daemon lifecycle via a state machine.
 * The daemon handles all photons through channel-based isolation.
 *
 * Architecture:
 * - Single daemon process: ~/.photon/daemon.sock
 * - All photons communicate through the same daemon
 * - Channels provide isolation: {photonId}:{itemId}
 * - State machine guards all transitions (stopped → starting → running → stopping)
 *
 * NOTE(review): the `Promise` return types in this declaration had lost their
 * type arguments (a bare `Promise` does not compile). They are restored here
 * as `Promise<void>` for lifecycle operations (mirroring the synchronous
 * `stop(): void`) and `Promise<boolean>` for reachability probes (mirroring
 * `isGlobalDaemonRunning(): boolean`) — confirm against the implementation.
 */
import { DaemonStatus } from './protocol.js';
import { type DaemonState } from './state-machine.js';
import { type PhotonContext } from '../context.js';

/**
 * Thrown when a daemon PID survives SIGTERM and SIGKILL. Callers should
 * NOT retry an `ensure`/`start` automatically — a surviving process keeps
 * the socket bound, so spawning a fresh daemon would race or EADDRINUSE.
 * Surface this to the user instead.
 */
export declare class DaemonOrphanError extends Error {
  /** PIDs that were still alive after SIGTERM and SIGKILL. */
  readonly survivorPids: number[];
  constructor(message: string, survivorPids: number[]);
}

/**
 * DaemonManager — state-machine-guarded daemon lifecycle.
 *
 * All transitions (start, stop, restart) go through the FSM,
 * preventing illegal states like double-start or stop-while-starting.
 */
export declare class DaemonManager {
  private fsm;
  private ctx;
  private logger;

  constructor(ctx?: PhotonContext);

  /** Current FSM state. */
  get status(): DaemonState;

  /** Paths for external consumers. */
  get pidFile(): string;
  get logFile(): string;
  get socketPath(): string;

  private _ensurePromise;

  /**
   * Idempotent ensure — start if needed, restart if binary is stale.
   * Concurrent callers join the in-flight operation instead of racing.
   */
  ensure(quiet?: boolean): Promise<void>;

  private _ensureImpl;

  /**
   * Start the daemon. No-op if already running.
   * Uses a filesystem lock to prevent cross-process races where multiple
   * processes (Beam, CLI, MCP) each decide the daemon is dead and all spawn one.
   */
  start(quiet?: boolean): Promise<void>;

  /**
   * Stop the daemon.
   */
  stop(): void;

  /**
   * Restart the daemon (stop → start).
   */
  restart(): Promise<void>;

  /**
   * Get daemon status info.
   */
  getStatus(): DaemonStatus;

  isReachable(): Promise<boolean>;

  private get lockFile();
  private get ownerFile();

  /**
   * Returns true if the PID is alive and able to hold resources (sockets,
   * file descriptors). Zombie processes — dead but not yet reaped by their
   * parent — appear alive to kill(pid, 0) on POSIX because their PID table
   * entry persists, but they cannot hold sockets or any other resources.
   * Treating a zombie as alive causes cleanupStale() to refuse cleanup even
   * though removing the socket/pid files is perfectly safe.
   */
  private isPidStillAlive;

  /**
   * Approximate process start time in epoch ms via `ps -o etime=`.
   *
   * POSIX-only — Windows daemons skip this check (a separate task tracks
   * full Windows daemon recovery). Returns null if ps is unavailable, the
   * pid is dead, or the etime output is unparseable; callers should treat
   * null as "can't tell" rather than "matches".
   */
  private getProcessStartTimeMs;

  /**
   * Heuristic check: is `pid` actually our daemon, or did the kernel
   * recycle the slot to an unrelated process?
   *
   * Why this matters: getTrackedPids reads pid/owner files; if the daemon
   * died and the kernel reused its PID for an unrelated process, blindly
   * sending SIGTERM/SIGKILL would terminate that innocent process AND
   * cleanupStale would refuse to clean (because the "daemon" is "alive"),
   * leaving the operator wedged with a DaemonOrphanError they can't fix.
   *
   * We compare ps-reported process start time against the daemon owner
   * record's claimedAt timestamp. Daemons claim ownership within ~100ms
   * of spawning, so a 5s window covers normal jitter. A larger gap in
   * either direction strongly suggests the slot was recycled.
   *
   * Returns:
   * - true: evidence supports "this is our daemon"
   * - false: clear evidence of PID reuse — caller should treat as dead
   * - null: can't determine (no owner record, ps failed, etc.) — caller
   *   should fall back to the conservative behavior (assume ours)
   *   rather than risk killing a stranger.
   */
  private isPidOurDaemon;

  /**
   * Synchronously poll until all pids are dead or the timeout expires.
   * Returns true iff every pid is confirmed dead before the deadline.
   */
  private waitForPidsDeadSync;

  /**
   * Async variant used by web-server/request paths. The old synchronous
   * polling loop uses Atomics.wait, which freezes Beam while a daemon is
   * stopping. Keep the synchronous helper for direct CLI stop/tests, but
   * never use it from async daemon recovery paths.
   */
  private waitForPidsDead;

  /**
   * SIGTERM → wait → SIGKILL → wait. Returns whether every tracked pid is
   * confirmed dead (or the slot has been confirmed recycled), plus the
   * list of survivors that the caller should report in DaemonOrphanError.
   * Callers MUST NOT delete socket/pid/owner files unless `allDead` is
   * true — a surviving daemon keeps the socket bound, and unlinking it
   * from underneath leaves the "marked running but socket unreachable"
   * state we're trying to prevent.
   *
   * Filters out PIDs that ps says belong to a stranger (the kernel
   * recycled the slot) — sending signals to them would terminate
   * unrelated processes. Recycled PIDs are treated as "already dead"
   * for the purposes of cleanup and are NOT reported as survivors.
   */
  private terminateTrackedPidsSync;
  private terminateTrackedPids;

  /**
   * Unlink pid file, owner file, and socket file. Caller is responsible for
   * confirming the daemon process is dead first; this helper does not check.
   */
  private removeStateFiles;

  /**
   * Acquire a cross-process startup lock using atomic O_EXCL file creation.
   * Returns true if lock acquired, false if another process holds it.
   * Handles stale locks (lock holder PID is dead).
   */
  private acquireStartupLock;
  private releaseStartupLock;

  /**
   * Returns true if the process recorded in the startup lock file is
   * still alive. Used by waitForDaemon to bail out early when the
   * lock-holding process has crashed mid-spawn.
   */
  private isLockHolderAlive;

  /**
   * Wait for another process to finish starting the daemon.
   *
   * Polls until the socket comes up OR the lock-holding process dies.
   *
   * IMPORTANT: maxWait must exceed spawnDaemon's own readiness timeout.
   * If we time out faster than the lock holder can finish a healthy
   * spawn, we'll force-spawn a second daemon — both daemons race to
   * bind the socket, the loser is detected by the imposter scan and
   * killed. This is the root cause of the imposter-daemon entries
   * seen in the log around imposter PID detections.
   *
   * Also: do NOT force-unlink the lock file here. acquireStartupLock
   * already detects stale locks (holder PID dead) and cleans them up
   * atomically. Force-unlinking from a peer process opens a window
   * where two processes both think they hold the lock.
   */
  private waitForDaemon;

  private isPidAlive;
  private readPid;
  private isSocketAlive;
  private isSocketConfirmedUnreachable;
  private cleanupStaleAsync;
  private killProcess;
  private killProcessAsync;
  private stopAsync;

  /** Size threshold past which the daemon log is rotated at spawn time. */
  private static readonly LOG_ROTATE_BYTES;

  /**
   * Rotate the daemon log on each spawn if it has grown past LOG_ROTATE_BYTES.
   *
   * Cheap, single-generation rotation: rename log to log.1 (overwriting any
   * previous .1). Done at spawn time rather than per-write so it adds zero
   * overhead to the hot path. With the connect/disconnect spam now demoted
   * to debug, the log should grow slowly enough that a 50 MB cap is plenty
   * of headroom; the cap exists as a backstop, not a primary defense.
   */
  private rotateLogIfTooLarge;

  private resolveDaemonScript;
  private isBinaryStale;

  /**
   * File-descriptor exhaustion: EMFILE is the per-process cap, ENFILE the
   * system-wide kernel file table. Both surface from `fs.openSync` and
   * `spawn`. They are transient (another process freeing FDs clears them),
   * so one short retry recovers the common case; if it persists the user
   * needs an actionable message, not a raw posix_spawn stack trace.
   */
  private static isFdExhaustion;

  private spawnDaemon;

  /**
   * Re-sync FSM state from disk (PID file existence).
   * Used when FSM gets out of sync (e.g. daemon killed externally).
   */
  private resyncFromDisk;

  private hasExclusiveOwner;
  private getTrackedPids;
}

// Module-level API for the single global daemon.
// NOTE(review): presumably thin wrappers around a shared DaemonManager
// instance — confirm against the implementation.
export declare const GLOBAL_PID_FILE: string;
export declare const GLOBAL_LOG_FILE: string;
export declare function getGlobalSocketPath(): string;
export declare function isGlobalDaemonRunning(): boolean;
export declare function isGlobalDaemonReachable(): Promise<boolean>;
export declare function startGlobalDaemon(quiet?: boolean): Promise<void>;
export declare function ensureDaemon(quiet?: boolean): Promise<void>;
export declare function stopGlobalDaemon(): void;
export declare function restartGlobalDaemon(): Promise<void>;
//# sourceMappingURL=manager.d.ts.map