{"version":3,"file":"overflow.d.ts","sourceRoot":"","sources":["../../src/utils/overflow.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AA0EpD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiDG;AACH,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,gBAAgB,EAAE,aAAa,CAAC,EAAE,MAAM,GAAG,OAAO,CA6B5F;AAED;;GAEG;AACH,wBAAgB,mBAAmB,IAAI,MAAM,EAAE,CAE9C","sourcesContent":["import type { AssistantMessage } from \"../types.ts\";\n\n/**\n * Regex patterns to detect context overflow errors from different providers.\n *\n * These patterns match error messages returned when the input exceeds\n * the model's context window.\n *\n * Provider-specific patterns (with example error messages):\n *\n * - Anthropic: \"prompt is too long: 213462 tokens > 200000 maximum\"\n * - Anthropic: \"413 {\\\"error\\\":{\\\"type\\\":\\\"request_too_large\\\",\\\"message\\\":\\\"Request exceeds the maximum size\\\"}}\"\n * - OpenAI: \"Your input exceeds the context window of this model\"\n * - OpenAI/LiteLLM: \"Requested token count exceeds the model's maximum context length of 131072 tokens\"\n * - Google: \"The input token count (1196265) exceeds the maximum number of tokens allowed (1048575)\"\n * - xAI: \"This model's maximum prompt length is 131072 but the request contains 537812 tokens\"\n * - Groq: \"Please reduce the length of the messages or completion\"\n * - OpenRouter: \"This endpoint's maximum context length is X tokens. However, you requested about Y tokens\"\n * - OpenRouter/Poolside: \"Input length X exceeds the maximum allowed input length of Y tokens.\"\n * - Together AI: \"The input (X tokens) is longer than the model's context length (Y tokens).\"\n * - llama.cpp: \"the request exceeds the available context size, try increasing it\"\n * - LM Studio: \"tokens to keep from the initial prompt is greater than the context length\"\n * - GitHub Copilot: \"prompt token count of X exceeds the limit of Y\"\n * - MiniMax: \"invalid params, context window exceeds limit\"\n * - Kimi For Coding: \"Your request exceeded model token limit: X (requested: Y)\"\n * - Cerebras: \"400/413 status code (no body)\"\n * - Mistral: \"Prompt contains X tokens ... too large for model with Y maximum context length\"\n * - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow\n * - Xiaomi MiMo: Truncates input to fill contextWindow exactly, then returns finish_reason \"length\"\n *   with output=0 (no room left to generate). Detected via stopReason \"length\" + zero output +\n *   input filling the context window.\n * - Ollama: Some deployments truncate silently, others return errors like \"prompt too long; exceeded max context length by X tokens\"\n */\nconst OVERFLOW_PATTERNS = [\n\t/prompt is too long/i, // Anthropic token overflow\n\t/request_too_large/i, // Anthropic request byte-size overflow (HTTP 413)\n\t/input is too long for requested model/i, // Amazon Bedrock\n\t/exceeds the context window/i, // OpenAI (Completions & Responses API)\n\t/exceeds (?:the )?(?:model'?s )?maximum context length of [\\d,]+ tokens?/i, // OpenAI-compatible proxies (LiteLLM)\n\t/input token count.*exceeds the maximum/i, // Google (Gemini)\n\t/maximum prompt length is \\d+/i, // xAI (Grok)\n\t/reduce the length of the messages/i, // Groq\n\t/maximum context length is \\d+ tokens/i, // OpenRouter (most backends)\n\t/exceeds (?:the )?maximum allowed input length of [\\d,]+ tokens?/i, // OpenRouter/Poolside\n\t/input \\(\\d+ tokens\\) is longer than the model'?s context length \\(\\d+ tokens\\)/i, // Together AI\n\t/exceeds the limit of \\d+/i, // GitHub Copilot\n\t/exceeds the available context size/i, // llama.cpp server\n\t/greater than the context length/i, // LM Studio\n\t/context window exceeds limit/i, // MiniMax\n\t/exceeded model token limit/i, // Kimi For Coding\n\t/too large for model with \\d+ maximum context length/i, // Mistral\n\t/model_context_window_exceeded/i, // z.ai non-standard finish_reason surfaced as error text\n\t/prompt too long; exceeded (?:max )?context length/i, // Ollama explicit overflow error\n\t/context[_ ]length[_ ]exceeded/i, // Generic fallback\n\t/too many tokens/i, // Generic fallback\n\t/token limit exceeded/i, // Generic fallback\n\t/^4(?:00|13)\\s*(?:status code)?\\s*\\(no body\\)/i, // Cerebras: 400/413 with no body\n];\n\n/**\n * Patterns that indicate non-overflow errors (e.g. rate limiting, server errors).\n * Error messages matching any of these are excluded from overflow detection\n * even if they also match an OVERFLOW_PATTERN.\n *\n * Example: Bedrock formats throttling errors as \"ThrottlingException: Too many tokens,\n * please wait before trying again.\" which would match the /too many tokens/i overflow\n * pattern without this exclusion.\n */\nconst NON_OVERFLOW_PATTERNS = [\n\t/^(Throttling error|Service unavailable):/i, // AWS Bedrock non-overflow errors (human-readable prefixes from formatBedrockError)\n\t/rate limit/i, // Generic rate limiting\n\t/too many requests/i, // Generic HTTP 429 style\n];\n\n/**\n * Check if an assistant message represents a context overflow error.\n *\n * This handles two cases:\n * 1. Error-based overflow: Most providers return stopReason \"error\" with a\n *    specific error message pattern.\n * 2. Silent overflow: Some providers accept overflow requests and return\n *    successfully. For these, we check if usage.input exceeds the context window.\n *\n * ## Reliability by Provider\n *\n * **Reliable detection (returns error with detectable message):**\n * - Anthropic: \"prompt is too long: X tokens > Y maximum\" or \"request_too_large\"\n * - OpenAI (Completions & Responses): \"exceeds the context window\" or \"exceeds the model's maximum context length of X tokens\"\n * - Google Gemini: \"input token count exceeds the maximum\"\n * - xAI (Grok): \"maximum prompt length is X but request contains Y\"\n * - Groq: \"reduce the length of the messages\"\n * - Cerebras: 400/413 status code (no body)\n * - Mistral: \"Prompt contains X tokens ... too large for model with Y maximum context length\"\n * - OpenRouter (most backends): \"maximum context length is X tokens\"\n * - OpenRouter/Poolside: \"Input length X exceeds the maximum allowed input length of Y tokens.\"\n * - Together AI: \"The input (X tokens) is longer than the model's context length (Y tokens).\"\n * - llama.cpp: \"exceeds the available context size\"\n * - LM Studio: \"greater than the context length\"\n * - Kimi For Coding: \"exceeded model token limit: X (requested: Y)\"\n *\n * **Unreliable detection:**\n * - z.ai: Sometimes accepts overflow silently (detectable via usage.input > contextWindow),\n *   sometimes returns rate limit errors. Pass contextWindow param to detect silent overflow.\n * - Xiaomi MiMo: Truncates input to fit contextWindow then returns stopReason \"length\" with\n *   output=0. Pass contextWindow param to detect via the \"filled context + zero output\" signal.\n * - Ollama: May truncate input silently for some setups, but may also return explicit\n *   overflow errors that match the patterns above. Silent truncation still cannot be\n *   detected here because we do not know the expected token count.\n *\n * ## Custom Providers\n *\n * If you've added custom models via settings.json, this function may not detect\n * overflow errors from those providers. To add support:\n *\n * 1. Send a request that exceeds the model's context window\n * 2. Check the errorMessage in the response\n * 3. Create a regex pattern that matches the error\n * 4. The pattern should be added to OVERFLOW_PATTERNS in this file, or\n *    check the errorMessage yourself before calling this function\n *\n * @param message - The assistant message to check\n * @param contextWindow - Optional context window size for detecting silent overflow (z.ai)\n * @returns true if the message indicates a context overflow\n */\nexport function isContextOverflow(message: AssistantMessage, contextWindow?: number): boolean {\n\t// Case 1: Check error message patterns\n\tif (message.stopReason === \"error\" && message.errorMessage) {\n\t\t// Skip messages matching known non-overflow patterns (e.g. throttling / rate-limit)\n\t\tconst isNonOverflow = NON_OVERFLOW_PATTERNS.some((p) => p.test(message.errorMessage!));\n\t\tif (!isNonOverflow && OVERFLOW_PATTERNS.some((p) => p.test(message.errorMessage!))) {\n\t\t\treturn true;\n\t\t}\n\t}\n\n\t// Case 2: Silent overflow (z.ai style) - successful but usage exceeds context\n\tif (contextWindow && message.stopReason === \"stop\") {\n\t\tconst inputTokens = message.usage.input + message.usage.cacheRead;\n\t\tif (inputTokens > contextWindow) {\n\t\t\treturn true;\n\t\t}\n\t}\n\n\t// Case 3: Length-stop overflow (Xiaomi MiMo style) - server truncates oversized input\n\t// to fit the context window, leaving no room for output. Returns stopReason \"length\"\n\t// with output=0 and input+cacheRead filling the context window.\n\tif (contextWindow && message.stopReason === \"length\" && message.usage.output === 0) {\n\t\tconst inputTokens = message.usage.input + message.usage.cacheRead;\n\t\tif (inputTokens >= contextWindow * 0.99) {\n\t\t\treturn true;\n\t\t}\n\t}\n\n\treturn false;\n}\n\n/**\n * Get the overflow patterns for testing purposes.\n */\nexport function getOverflowPatterns(): RegExp[] {\n\treturn [...OVERFLOW_PATTERNS];\n}\n"]}