import {
  type CompletionClientParams,
  type CompletionRun,
  type McpClientInput,
  type Tool,
  type RPCOptions,
} from "../../schemas/index";
import { type ToolInput } from "../../utils/tool-helpers";

/**
 * Parameters accepted by {@link completion}.
 *
 * Built from `CompletionClientParams` with the SDK-level fields below
 * re-declared on top of it. Every re-declared key is omitted from the base
 * type first so the declarations here are the effective ones (an
 * intersection would otherwise narrow, e.g., `tools` back to the base type).
 *
 * NOTE(review): the type arguments of `Omit` were missing from this
 * declaration (a bare `Omit` does not compile). The omitted key list was
 * reconstructed from the properties overridden below — confirm against the
 * original `.ts` source.
 */
type CompletionParams = Omit<
  CompletionClientParams,
  "tools" | "mcp" | "rpcOptions" | "captureThinking" | "emitRawDeltas"
> & {
  /** Tools offered to the model: full `Tool` objects or simpler `ToolInput` definitions. */
  tools?: Tool[] | ToolInput[];
  /** MCP client inputs used for tool integration. */
  mcp?: McpClientInput[];
  /** Optional `RPCOptions`, as declared in the schemas module. */
  rpcOptions?: RPCOptions;
  /** Enables best-effort extraction of model thinking into `thinkingDelta` events. */
  captureThinking?: boolean;
  /** When true, every raw model token is also emitted as a `rawDelta` event. */
  emitRawDeltas?: boolean;
};

/**
 * Generates completion from a language model based on conversation history.
 *
 * Returns a `CompletionRun` whose canonical surfaces are:
 *
 * - `events` — `AsyncIterable` of ordered, typed events.
 * - `final` — `Promise` with aggregated results once the
 *   stream ends (content, thinking, tool calls, stats, raw text).
 *
 * Legacy convenience fields (`tokenStream`, `text`, `toolCallStream`,
 * `toolCalls`, `stats`) are still available but deprecated — they derive
 * from `events` / `final` internally.
 *
 * @param params - The completion parameters
 * @param params.modelId - The identifier of the model to use for completion
 * @param params.history - Array of conversation messages with role, content, and optional attachments
 * @param params.stream - Whether to stream tokens (true) or return complete response (false). Defaults to true
 * @param params.tools - Optional array of tools (can be simple ToolInput with Zod schemas or full Tool objects)
 * @param params.mcp - Optional array of MCP client inputs for tool integration
 * @param params.rpcOptions - Optional `RPCOptions`, as declared in the schemas module
 * @param params.captureThinking - Best-effort parsing of model thinking blocks (e.g. `<think>…</think>`) into `thinkingDelta` events; `final.raw.fullText` always preserves the original output
 * @param params.emitRawDeltas - When true, every raw model token is also emitted as a `rawDelta` event
 * @param params.toolDialect - Override the SDK's name-based dialect detection. Supported values: `"hermes"`, `"pythonic"`, `"json"`, `"harmony"`, `"qwen35"` (Qwen3.5/3.6), `"gemma4"`. Use when the auto-router doesn't recognise your model name. Drives both streaming frame detection and finalization parsing.
 *   Common override case: Llama 3.x tool-calling fine-tunes that emit the native pythonic header (`<|start_header_id|>tool_call<|end_header_id|>...<|eot_id|>`).
 * @param params.responseFormat - Optional structured-output constraint applied to the model's output:
 *   - `{ type: "text" }` — no constraint (default behavior)
 *   - `{ type: "json_object" }` — output must be a JSON object
 *   - `{ type: "json_schema", json_schema: { name, schema, description?, strict? } }` — output must validate against `schema`
 *
 *   The schema is converted to GBNF natively by llama.cpp and applied for the
 *   duration of the request only. `json_schema.name` and `json_schema.description`
 *   are accepted for OpenAI compatibility but only used at the API boundary —
 *   they do not affect generation. **`json_schema.strict` is currently accepted
 *   for compatibility but does NOT trigger OpenAI's auto-tightening semantics**
 *   (implicit `additionalProperties: false`, all properties required). The
 *   schema is forwarded to the addon as-is, so callers who want strict
 *   validation must encode it explicitly in `schema`.
 *
 *   Cannot be combined with `tools` (tools already constrain output via their parameter schema).
 * @param params.kvCache - Optional KV cache configuration. Cache files are organized hierarchically:
 *   - Structure: `{kvCacheKey}/{modelId}/{configHash}.bin`
 *   - The configHash includes model config + system prompt to ensure cache isolation
 *   - `true`: Auto-generate cache key based on conversation history
 *   - `"custom-key"`: Use provided string as cache key for manual session management
 *   - `false` or `undefined`: No caching
 *   - ⚡ Performance: When cache exists, only the last message is sent to the model (includes multimodal attachments)
 *   - 🗑️ Cleanup: Use `deleteCache({ kvCacheKey })` to remove cached sessions
 *
 *   **Auto-cache (`kvCache: true`) — assistant turn contract.** When
 *   pushing the assistant turn back into `history` for the next call,
 *   use `(await run.final).cacheableAssistantContent`. That's the exact
 *   string the SDK persisted to the cache key on this turn, so re-using
 *   it verbatim guarantees the next-turn lookup hits.
 *   - Any post-processing of the assistant text (rewriting, summarizing,
 *     stripping model stop tokens like `<|im_end|>`) before pushing it
 *     back will miss the cache. Push the canonical string unchanged.
 *   - `cacheableAssistantContent` is omitted on tool-call turns - those
 *     can't be auto-cached today.
 * @returns A CompletionRun — consume via `events` / `final`.
 * @example
 * ```typescript
 * import { z } from "zod";
 *
 * const run = completion({
 *   modelId: "llama-2",
 *   history: [
 *     { role: "user", content: "What's the weather in Tokyo?" }
 *   ],
 *   stream: true,
 *   captureThinking: true,
 *   tools: [{
 *     name: "get_weather",
 *     description: "Get current weather",
 *     parameters: z.object({
 *       city: z.string().describe("City name"),
 *     }),
 *     handler: async (args) => {
 *       return { temperature: 22, condition: "sunny" };
 *     }
 *   }]
 * });
 *
 * for await (const event of run.events) {
 *   if (event.type === "contentDelta") process.stdout.write(event.text);
 *   if (event.type === "toolCall") console.log(event.call.name, event.call.arguments);
 * }
 *
 * const result = await run.final;
 * for (const toolCall of await result.toolCalls) {
 *   if (toolCall.invoke) {
 *     const toolResult = await toolCall.invoke();
 *     console.log(toolResult);
 *   }
 * }
 * ```
 */
export declare function completion(params: CompletionParams): CompletionRun;
export {};
//# sourceMappingURL=completion-stream.d.ts.map