import { type CompletionClientParams, type CompletionRun, type McpClientInput, type Tool, type RPCOptions } from "../../schemas/index";
import { type ToolInput } from "../../utils/tool-helpers";
/**
 * Parameters accepted by `completion`.
 *
 * Carries every field of `CompletionClientParams` except `tools`, which is
 * re-declared below so that either `Tool` or `ToolInput` arrays are accepted,
 * plus the optional fields listed here.
 */
type CompletionParams = Omit<CompletionClientParams, "tools"> & {
    /** Best-effort parsing of `<think>` blocks into `thinkingDelta` events. */
    captureThinking?: boolean;
    /** When true, every raw model token is also emitted as a `rawDelta` event. */
    emitRawDeltas?: boolean;
    /** Optional MCP client inputs for tool integration. */
    mcp?: Array<McpClientInput>;
    /** Optional RPC options; see the `RPCOptions` schema type for its fields. */
    rpcOptions?: RPCOptions;
    /** Optional tools: an array of full `Tool` objects or an array of simple `ToolInput` entries (Zod schemas). */
    tools?: Array<Tool> | Array<ToolInput>;
};
/**
 * Generates a completion from a language model based on the conversation history.
 *
 * Returns a `CompletionRun` whose canonical surfaces are:
 *
 *  - `events`  — `AsyncIterable<CompletionEvent>` of ordered, typed events.
 *  - `final`   — `Promise<CompletionFinal>` with aggregated results once the
 *                stream ends (content, thinking, tool calls, stats, raw text).
 *
 * Legacy convenience fields (`tokenStream`, `text`, `toolCallStream`,
 * `toolCalls`, `stats`) are still available but deprecated — they derive
 * from `events` / `final` internally.
 *
 * @param params - The completion parameters
 * @param params.modelId - The identifier of the model to use for completion
 * @param params.history - Array of conversation messages with role, content, and optional attachments
 * @param params.stream - Whether to stream tokens (true) or return complete response (false). Defaults to true
 * @param params.tools - Optional array of tools (can be simple ToolInput with Zod schemas or full Tool objects)
 * @param params.mcp - Optional array of MCP client inputs for tool integration
 * @param params.rpcOptions - Optional RPC options; see the `RPCOptions` schema type for the available fields
 * @param params.captureThinking - Best-effort parsing of `<think>` blocks into `thinkingDelta` events; `final.raw.fullText` always preserves the original output
 * @param params.emitRawDeltas - When true, every raw model token is also emitted as a `rawDelta` event
 * @param params.toolDialect - Override the SDK's name-based dialect detection. Supported values: `"hermes"`, `"pythonic"`, `"json"`, `"harmony"`, `"qwen35"` (Qwen3.5/3.6), `"gemma4"`. Use when the auto-router doesn't recognise your model name. Drives both streaming frame detection and finalization parsing.
 * Common override case: Llama 3.x tool-calling fine-tunes that emit the native pythonic header (`<|start_header_id|>tool_call<|end_header_id|>...<|eot_id|>`).
 * @param params.responseFormat - Optional structured-output constraint applied to the model's output:
 *   - `{ type: "text" }` — no constraint (default behavior)
 *   - `{ type: "json_object" }` — output must be a JSON object
 *   - `{ type: "json_schema", json_schema: { name, schema, description?, strict? } }` — output must validate against `schema`
 *
 *   The schema is converted to GBNF natively by llama.cpp and applied for the
 *   duration of the request only. `json_schema.name` and `json_schema.description`
 *   are accepted for OpenAI compatibility but only used at the API boundary —
 *   they do not affect generation. **`json_schema.strict` is currently accepted
 *   for compatibility but does NOT trigger OpenAI's auto-tightening semantics**
 *   (implicit `additionalProperties: false`, all properties required). The
 *   schema is forwarded to the addon as-is, so callers who want strict
 *   validation must encode it explicitly in `schema`.
 *
 *   Cannot be combined with `tools` (tools already constrain output via their parameter schema).
 * @param params.kvCache - Optional KV cache configuration. Cache files are organized hierarchically:
 *   - Structure: `{kvCacheKey}/{modelId}/{configHash}.bin`
 *   - The configHash includes model config + system prompt to ensure cache isolation
 *   - `true`: Auto-generate cache key based on conversation history
 *   - `"custom-key"`: Use provided string as cache key for manual session management
 *   - `false` or `undefined`: No caching
 *   - ⚡ Performance: When cache exists, only the last message is sent to the model (includes multimodal attachments)
 *   - 🗑️ Cleanup: Use `deleteCache({ kvCacheKey })` to remove cached sessions
 *
 *   **Auto-cache (`kvCache: true`) — assistant turn contract.** When
 *   pushing the assistant turn back into `history` for the next call,
 *   use `(await run.final).cacheableAssistantContent`. That's the exact
 *   string the SDK persisted to the cache key on this turn, so re-using
 *   it verbatim guarantees the next-turn lookup hits.
 *   - Any post-processing of the assistant text (rewriting, summarizing,
 *     stripping model stop tokens like `<|im_end|>`) before pushing it
 *     back will miss the cache. Push the canonical string unchanged.
 *   - `cacheableAssistantContent` is omitted on tool-call turns — those
 *     can't be auto-cached today.
 * @returns A CompletionRun — consume via `events` / `final`.
 * @example
 * ```typescript
 * import { z } from "zod";
 *
 * const run = completion({
 *   modelId: "llama-2",
 *   history: [
 *     { role: "user", content: "What's the weather in Tokyo?" }
 *   ],
 *   stream: true,
 *   captureThinking: true,
 *   tools: [{
 *     name: "get_weather",
 *     description: "Get current weather",
 *     parameters: z.object({
 *       city: z.string().describe("City name"),
 *     }),
 *     handler: async (args) => {
 *       return { temperature: 22, condition: "sunny" };
 *     }
 *   }]
 * });
 *
 * for await (const event of run.events) {
 *   if (event.type === "contentDelta") process.stdout.write(event.text);
 *   if (event.type === "toolCall") console.log(event.call.name, event.call.arguments);
 * }
 *
 * const result = await run.final;
 * for (const toolCall of await result.toolCalls) {
 *   if (toolCall.invoke) {
 *     const toolResult = await toolCall.invoke();
 *     console.log(toolResult);
 *   }
 * }
 * ```
 */
export declare function completion(params: CompletionParams): CompletionRun;
export {};
//# sourceMappingURL=completion-stream.d.ts.map