import { AssetLibrary } from "./asset";
import { Chat } from "./chat";
import { CompletionsAPI } from "./completions";
import { FineTuningClient } from "./tune";

/**
 * Handle for an inference that is running asynchronously on the server.
 *
 * @remarks
 * This type is used in conjunction with the {@link Client.inferAsync} method.
 * Please reference that method's remarks for more information.
 * {@link Client.inferAsync} returns an InferenceFuture, which can then be
 * used with {@link Client.isFutureReady} to see the status. Once it returns
 * `true`, you can use {@link Client.getFutureResult} to get the response
 * for your InferenceFuture.
 */
export interface InferenceFuture {
  /** A unique URL to poll for the status of the inference. */
  poll_url: string;
  /** Unique identifier for the inference. */
  response_id: string;
}

/**
 * Represents the response returned from the `poll_url` of an
 * {@link InferenceFuture}.
 *
 * @remarks
 * This type is used in conjunction with the {@link Client.inferAsync} method.
 * Please reference that method's remarks for more information.
 */
export interface InferenceFutureResponse {
  /** Inference status: one of pending, running, or completed. */
  status: string;
  /** URL to get the results from once they are ready. */
  response_url: string;
}

/**
 * A client that allows inferences from existing OctoAI endpoints. Sets
 * various headers, establishes clients for {@link Chat} under `Client.chat`,
 * {@link AssetLibrary} under `Client.asset`, {@link FineTuningClient} under
 * `Client.tune`, {@link CompletionsAPI} under `Client.completions`, and will
 * check for `OCTOAI_TOKEN` from environment variable if no token is provided.
 *
 * @throws {@link OctoAIClientError} - For client-side failures (throttled, no token)
 * @throws {@link OctoAIServerError} - For server-side failures (unreachable, etc)
 *
 * @remarks
 * You can create an OctoAI API token by following the guide at
 * {@link https://docs.octoai.cloud/docs/how-to-create-an-octoai-access-token |
 * How to Create an OctoAI Access Token}
 */
export declare class Client {
  /**
   * Headers used to interact with OctoAI servers. Communicates authorization
   * and request type.
   */
  readonly headers: {
    Authorization: string;
    "Content-Type": string;
    "User-Agent": string;
    "X-OctoAI-Async": string;
    Accept: string;
  };

  /**
   * The {@link Chat} client, accessible with `Client.chat`.
   */
  readonly chat: Chat;

  /**
   * The {@link AssetLibrary} client, accessible with `Client.asset`.
   */
  readonly asset: AssetLibrary;

  /**
   * The {@link FineTuningClient}, accessible with `Client.tune`.
   */
  readonly tune: FineTuningClient;

  /**
   * The {@link CompletionsAPI} client, accessible with `Client.completions`.
   */
  readonly completions: CompletionsAPI;

  /**
   * Set to true to use the SecureLink API.
   */
  readonly secureLink: boolean;

  /**
   * Constructor for the Client class.
   *
   * @param token - OctoAI token. If none is set, checks for an `OCTOAI_TOKEN`
   * envvar, or will default to null.
   * @param secureLink - Set to true to use SecureLink API instead of public API
   */
  constructor(token?: string | null, secureLink?: boolean);

  // NOTE(review): the type parameter defaults to `any` so that callers that
  // pass an explicit result type and callers that read properties off an
  // untyped result both keep compiling. Confirm against the implementation.
  /**
   * Send a request to the given endpoint with inputs as request body.
   * For LLaMA2 LLMs, this requires `"stream": false` in the inputs. To stream
   * for LLMs, please see the {@link inferStream} method.
   *
   * @param endpointUrl - Target URL to run inference
   * @param inputs - Necessary inputs for the endpointURL to run inference
   *
   * @returns JSON outputs from the endpoint
   */
  infer<T = any>(endpointUrl: string, inputs: Record<string, any>): Promise<T>;

  // NOTE(review): the return type is reconstructed from the documented usage
  // below (the awaited value is used directly with `getReader()`). Confirm it
  // against the implementation before publishing.
  /**
   * Stream text event response body for supporting endpoints. This is an
   * alternative to loading all response body into memory at once. Recommended
   * for use with LLM models. Requires `"stream": true` in the inputs for
   * LLaMA2 LLMs.
   *
   * @param endpointUrl - Target URL to run inference
   * @param inputs - Necessary inputs for the endpointURL to run inference
   * @returns Compatible with getReader method.
   *
   * @remarks
   * This allows you to stream back tokens from the LLMs. Below is an example
   * on how to do this with a LLaMA2 LLM using a completions style API.
   *
   * HuggingFace style APIs will usually use the variable `done` below to
   * indicate the end of the stream. OpenAI style APIs will often send a
   * string in the stream `"data: [DONE]\n"` to indicate the stream is complete.
   *
   * This example concatenates all values from the tokens into a single text
   * variable. How you choose to use the tokens will likely be different, so
   * please modify the code.
   *
   * This example assumes:
   * 1) You've followed the guide at
   * {@link https://docs.octoai.cloud/docs/how-to-create-an-octoai-access-token |
   * How to Create an OctoAI Access Token} to create and set your OctoAI access
   * token
   * 2) Either that you will set this token as an OCTOAI_TOKEN envvar
   * or edit the snippet to pass it as a value in the {@link Client.constructor}.
   * 3) You have assigned your endpoint URL and inputs into variables named
   * llamaEndpoint and streamInputs.
   *
   * ```ts
   * const client = new Client();
   * const readableStream = await client.inferStream(
   *   llamaEndpoint,
   *   streamInputs
   * );
   * let text = ``;
   * const streamReader = readableStream.getReader();
   * for (
   *   let { value, done } = await streamReader.read();
   *   !done;
   *   { value, done } = await streamReader.read()
   * ) {
   *   if (done) break;
   *   const decoded = new TextDecoder().decode(value);
   *   if (
   *     decoded === "data: [DONE]\n" ||
   *     decoded.includes('"finish_reason": "')
   *   ) {
   *     break;
   *   }
   *   const token = JSON.parse(decoded.substring(5));
   *   if (token.object === "chat.completion.chunk") {
   *     text += token.choices[0].delta.content;
   *   }
   * }
   * console.log(text);
   * ```
   * The `const token = JSON.parse(decoded.substring(5))` line strips the
   * leading `data:` from the returned text/event-stream then parses the token
   * as an object.
   */
  inferStream(
    endpointUrl: string,
    inputs: Record<string, any>
  ): Promise<ReadableStream>;

  private coldStartWarning;

  /**
   * Check health of an endpoint using a get request. Try until timeout.
   *
   * @param endpointUrl - Target URL to run the health check.
   * @param timeoutMS - Milliseconds before request times out. Default is 15
   * minutes.
   * @param intervalMS - Interval in milliseconds before the healthCheck method
   * queries
   * @returns HTTP status code.
   *
   * @remarks
   * The default timeout is set to 15 minutes to allow for potential cold start.
   *
   * For custom containers, please follow
   * {@link https://docs.octoai.cloud/docs/health-check-paths-in-custom-containers
   * | Health Check Paths in Custom Containers} to set a health check endpoint.
   *
   * Information about health check endpoint URLs are available on relevant
   * QuickStart Templates.
   */
  healthCheck(
    endpointUrl: string,
    timeoutMS?: number, // 15 minutes for cold start
    intervalMS?: number
  ): Promise<number>;

  /**
   * Execute an inference in the background on the server.
   *
   * @param endpointUrl - Target URL to send inference request.
   * @param inputs - Contains necessary inputs for endpoint to run inference.
   * @returns Future allows checking if results are ready then accessing them.
   *
   * @remarks
   * Please read the {@link https://docs.octoai.cloud/reference/inference |
   * Async Inference Reference} for more information.
   * {@link Client.inferAsync} returns an {@link InferenceFuture},
   * which can then be used with {@link Client.isFutureReady} to see the
   * status. Once it returns `true`, you can use
   * {@link Client.getFutureResult} to get the response for your
   * InferenceFuture.
   *
   * Assuming you have a variable with your target endpoint URL and the inputs
   * the model needs, and an `OCTOAI_TOKEN` set as an environment variable, you
   * can run a server-side asynchronous inference from
   * {@link https://docs.octoai.cloud/docs/welcome-to-the-octoai-compute-service-copy | QuickStart Template}
   * endpoints with something like the below.
   *
   * ```ts
   * const client = new Client();
   * const future = await client.inferAsync(url, inputs);
   * if (await client.isFutureReady(future) === true) {
   *   return await client.getFutureResult(future);
   * }
   * ```
   */
  inferAsync(
    endpointUrl: string,
    inputs: Record<string, any>
  ): Promise<InferenceFuture>;

  private pollFuture;

  /**
   * Return whether the {@link InferenceFuture} generated from
   * {@link Client.inferAsync} has been computed and can return results.
   *
   * @param future - Created from {@link Client.inferAsync}.
   * @returns True if the {@link InferenceFuture} inference is completed and
   * you are able to use {@link Client.getFutureResult}. Else returns false.
   */
  isFutureReady(future: InferenceFuture): Promise<boolean>;

  /**
   * Return the result of an {@link InferenceFuture} generated from
   * {@link Client.inferAsync} as long as {@link Client.isFutureReady} returned
   * `true`.
   *
   * @param future - An {@link InferenceFuture} generated from
   * {@link Client.inferAsync}
   * @returns JSON outputs from the endpoint.
   */
  getFutureResult(future: InferenceFuture): Promise<Record<string, any>>;
}