import process from "node:process";
import { isBooleanLike, isNumberLike } from "inferred-types";
import "dotenv/config";

export const HUGGINGFACE_API_URL = "https://huggingface.co/api";
export const AUTH_TOKEN = process.env.HUGGINGFACE_AUTH_TOKEN;
export const HOME_DIR = process.env.HOME;
export const CONFIG_BASE_DIR = `${HOME_DIR}/.config` as const;
export const CONFIG_DIR = `${CONFIG_BASE_DIR}/llm-model` as const;
export const MODEL_CACHE = ".models.json" as const;
export const LAST_MODEL_SET = ".last-model-result.json" as const;
export const VARIANT_CACHE = ".model-variant.json" as const;
export const BENCH_CACHE = ".bench.json" as const;
export const JOBS_CACHE = ".jobs.json" as const;
export const DEFAULT_MODEL_DIR = process.env.HF_HOME || process.env.PWD || ".";

// LLAMA PARAMs

/** number of layers to the GPU */
export const NGL = isNumberLike(process.env.NGL) ? Number(process.env.NGL) : 99;
/** number of layers to the GPU (for the draft model) */
export const NGLD = isNumberLike(process.env.NGLD) ? Number(process.env.NGLD) : 99;
/** context size for the LLM model  */
export const CONTEXT_SIZE = isNumberLike(process.env.CONTEXT_SIZE)
  ? Number(process.env.CONTEXT_SIZE)
  : 16384;

/** context size for the LLM _draft_ model  */
export const CONTEXT_SIZE_DRAFT = isNumberLike(process.env.CONTEXT_SIZE_DRAFT)
  ? Number(process.env.CONTEXT_SIZE_DRAFT)
  : 16384;

export const TEMP = isNumberLike(process.env.TEMP)
  ? Number(process.env.TEMP)
  : 0.6;

export const HOST = process.env.HOST || "0.0.0.0";
export const PORT = isNumberLike(process.env.PORT)
  ? Number(process.env.PORT)
  : 8087;

/** the maximum number of draft tokens to generate (default: 16) */
export const DRAFT_MAX = isNumberLike(process.env.DRAFT_MAX)
  ? Number(process.env.DRAFT_MAX)
  : 16;

/** the minimum number of draft tokens to generate (default: 1) */
export const DRAFT_MIN = isNumberLike(process.env.DRAFT_MIN)
  ? Number(process.env.DRAFT_MIN)
  : 1;

/** the minimum probability threshold for draft tokens (default: 0.05) */
export const DRAFT_P_MIN = isNumberLike(process.env.DRAFT_P_MIN)
  ? Number(process.env.DRAFT_P_MIN)
  : 1;

/** enable Flash Attention (default: false) */
export const FLASH_ATTN = isBooleanLike(process.env.FLASH_ATTN)
  ? Boolean(process.env.FLASH_ATTN)
  : false;

export const LLAMA_CPP_BOOLEAN = [
  ["--help", "-h", "Print usage and exit."],
  ["--usage", "-h", "Print usage and exit."],
  ["--version", "", "Show version and build info."],
  ["--verbose-prompt", "", "Print a verbose prompt before generation (default: false)."],
  ["--cpu-strict", "", "Use strict CPU placement (default: 0)."],
  ["--poll-batch", "", "Use polling to wait for work (default: same as --poll)."],
  ["--flash-attn", "-fa", "Enable Flash Attention (default: disabled)."],
  ["--no-perf", "", "Disable internal libllama performance timings (default: false)."],
  ["--escape", "-e", "Process escape sequences (\\n, \\r, \\t, ', \", \\) (default: true)."],
  ["--no-escape", "", "Do not process escape sequences."],
  ["--dump-kv-cache", "-dkvc", "Verbose print of the KV cache."],
  ["--no-kv-offload", "-nkvo", "Disable KV offload."],
  ["--mlock", "", "Force system to keep model in RAM rather than swapping or compressing."],
  ["--no-mmap", "", "Do not memory-map model (slower load but may reduce pageouts if not using mlock)."],
  ["--list-devices", "", "Print list of available devices and exit."],
  ["--check-tensors", "", "Check model tensor data for invalid values (default: false)."],
  ["--log-disable", "", "Disable logging."],
  ["--log-colors", "", "Enable colored logging."],
  ["--verbose", "-v", "Set verbosity level to infinity (i.e., log all messages, useful for debugging)."],
  ["--log-verbose", "-v", "Set verbosity level to infinity (i.e., log all messages, useful for debugging)."],
  ["--log-prefix", "", "Enable prefix in log messages."],
  ["--log-timestamps", "", "Enable timestamps in log messages."],
  ["--ignore-eos", "", "Ignore end of stream token and continue generating (implies --logit-bias EOS-inf)."],
  ["--no-context-shift", "", "Disables context shift on infinite text generation (default: disabled)."],
  ["--special", "-sp", "Special tokens output enabled (default: false)."],
  ["--no-warmup", "", "Skip warming up the model with an empty run."],
  ["--spm-infill", "", "Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this (default: disabled)."],
  ["--cont-batching", "-cb", "Enable continuous batching (a.k.a dynamic batching) (default: enabled)."],
  ["--no-cont-batching", "-nocb", "Disable continuous batching."],
  ["--no-webui", "", "Disable the Web UI (default: enabled)."],
  ["--embedding", "", "Restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)."],
  ["--embeddings", "", "Restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)."],
  ["--reranking", "", "Enable reranking endpoint on server (default: disabled)."],
  ["--rerank", "", "Enable reranking endpoint on server (default: disabled)."],
  ["--metrics", "", "Enable Prometheus-compatible metrics endpoint (default: disabled)."],
  ["--slots", "", "Enable slots monitoring endpoint (default: disabled)."],
  ["--props", "", "Enable changing global properties via POST /props (default: disabled)."],
  ["--no-slots", "", "Disables slots monitoring endpoint."],
  ["--lora-init-without-apply", "", "Load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)."],
] as const;

export const LLAMA_CPP_STRING = [
  ["--cpu-mask", "-C", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")."],
  ["--cpu-range", "-Cr", "Range of CPUs for affinity. Complements --cpu-mask."],
  ["--cpu-mask-batch", "-Cb", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)."],
  ["--cpu-range-batch", "-Crb", "Ranges of CPUs for affinity. Complements --cpu-mask-batch."],
  ["--rope-scaling", "", "RoPE frequency scaling method, defaults to linear unless specified by the model."],
  ["--cache-type-k", "-ctk", "KV cache data type for K (allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, default: f16)."],
  ["--cache-type-v", "-ctv", "KV cache data type for V (allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, default: f16)."],
  ["--numa", "", "Attempt optimizations that help on some NUMA systems."],
  ["--device", "-dev", "Comma-separated list of devices to use for offloading (none = don't offload)."],
  ["--split-mode", "-sm", "How to split the model across multiple GPUs (none, layer, row, default: layer)."],
  ["--tensor-split", "-ts", "Fraction of the model to offload to each GPU, comma-separated list of proportions."],
  ["--override-kv", "", "Advanced option to override model metadata by key. May be specified multiple times."],
  ["--lora", "", "Path to LoRA adapter (can be repeated to use multiple adapters)."],
  ["--lora-scaled", "", "Path to LoRA adapter with user-defined scaling (can be repeated to use multiple adapters)."],
  ["--control-vector", "", "Add a control vector. Can be repeated to add multiple control vectors."],
  ["--control-vector-scaled", "", "Add a control vector with user-defined scaling. Can be repeated to add multiple scaled control vectors."],
  ["--control-vector-layer-range", "", "Layer range to apply the control vector(s) to, start and end inclusive."],
  ["--model", "-m", "Model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)."],
  ["--model-url", "-mu", "Model download URL (default: unused)."],
  ["--hf-repo", "-hfr", "Hugging Face model repository (default: unused)."],
  ["--hf-file", "-hff", "Hugging Face model file (default: unused)."],
  ["--hf-token", "-hft", "Hugging Face access token (default: value from HF_TOKEN environment variable)."],
  ["--log-file", "", "Log to file."],
  ["--samplers", "", "Samplers that will be used for generation in the order, separated by ';'."],
  ["--sampling-seq", "", "Simplified sequence for samplers that will be used (default: dkypmxt)."],
  ["--dry-sequence-breaker", "", "Add sequence breaker for DRY sampling, clearing out default breakers ('\\n', ':', '\"', '*') in the process; use \"none\" to not use any sequence breakers."],
  ["--logit-bias", "-l", "Modifies the likelihood of token appearing in the completion."],
  ["--grammar", "", "BNF-like grammar to constrain generations (default: '')."],
  ["--grammar-file", "", "File to read grammar from."],
  ["--json-schema", "-j", "JSON schema to constrain generations."],
  ["--pooling", "", "Pooling type for embeddings, use model default if unspecified."],
  ["--alias", "-a", "Set alias for model name (to be used by REST API)."],
  ["--host", "", "IP address to listen (default: 127.0.0.1)."],
  ["--path", "", "Path to serve static files from (default: )."],
  ["--api-key", "", "API key to use for authentication (default: none)."],
  ["--api-key-file", "", "Path to file containing API keys (default: none)."],
  ["--ssl-key-file", "", "Path to file a PEM-encoded SSL private key."],
  ["--ssl-cert-file", "", "Path to file a PEM-encoded SSL certificate."],
  ["--slot-save-path", "", "Path to save slot KV cache (default: disabled)."],
  ["--chat-template", "", "Set custom Jinja chat template (default: template taken from model's metadata)."],
  ["--device-draft", "-devd", "Comma-separated list of devices to use for offloading the draft model (none = don't offload)."],
  ["--model-draft", "-md", "Draft model for speculative decoding (default: unused)."],
] as const;

export const LLAMA_CPP_NUMBER = [
  ["--threads", "-t", "Number of threads to use during generation (default: -1)."],
  ["--threads-batch", "-tb", "Number of threads to use during batch and prompt processing (default: same as --threads)."],
  ["--prio", "", "Set process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: 0)."],
  ["--poll", "", "Use polling level to wait for work (0 - no polling, default: 50)."],
  ["--prio-batch", "", "Set process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: 0)."],
  ["--ctx-size", "-c", "Size of the prompt context (default: 4096, 0 = loaded from model)."],
  ["--predict", "-n", "Number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)."],
  ["--n-predict", "-n", "Number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)."],
  ["--batch-size", "-b", "Logical maximum batch size (default: 2048)."],
  ["--ubatch-size", "-ub", "Physical maximum batch size (default: 512)."],
  ["--keep", "", "Number of tokens to keep from the initial prompt (default: 0, -1 = all)."],
  ["--rope-scale", "", "RoPE context scaling factor, expands context by a factor of N."],
  ["--rope-freq-base", "", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)."],
  ["--rope-freq-scale", "", "RoPE frequency scaling factor, expands context by a factor of 1/N."],
  ["--yarn-orig-ctx", "", "YaRN: original context size of model (default: 0 = model training context size)."],
  ["--yarn-ext-factor", "", "YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)."],
  ["--yarn-attn-factor", "", "YaRN: scale sqrt(t) or attention magnitude (default: 1.0)."],
  ["--yarn-beta-slow", "", "YaRN: high correction dim or alpha (default: 1.0)."],
  ["--yarn-beta-fast", "", "YaRN: low correction dim or beta (default: 32.0)."],
  ["--defrag-thold", "-dt", "KV cache defragmentation threshold (default: 0.1, < 0 - disabled)."],
  ["--parallel", "-np", "Number of parallel sequences to decode (default: 1)."],
  ["--gpu-layers", "-ngl", "Number of layers to store in VRAM."],
  ["--n-gpu-layers", "-ngl", "Number of layers to store in VRAM."],
  ["--main-gpu", "-mg", "The GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)."],
  ["--verbosity", "-lv", "Set the verbosity threshold. Messages with a higher verbosity will be ignored."],
  ["--log-verbosity", "-lv", "Set the verbosity threshold. Messages with a higher verbosity will be ignored."],
  ["--seed", "-s", "RNG seed (default: -1, use random seed for -1)."],
  ["--temp", "", "Temperature (default: 0.8)."],
  ["--top-k", "", "Top-k sampling (default: 40, 0 = disabled)."],
  ["--top-p", "", "Top-p sampling (default: 0.9, 1.0 = disabled)."],
  ["--min-p", "", "Min-p sampling (default: 0.1, 0.0 = disabled)."],
  ["--xtc-probability", "", "XTC probability (default: 0.0, 0.0 = disabled)."],
  ["--xtc-threshold", "", "XTC threshold (default: 0.1, 1.0 = disabled)."],
  ["--typical", "", "Locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)."],
  ["--repeat-last-n", "", "Last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)."],
  ["--repeat-penalty", "", "Penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)."],
  ["--presence-penalty", "", "Repeat alpha presence penalty (default: 0.0, 0.0 = disabled)."],
  ["--frequency-penalty", "", "Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)."],
  ["--dry-multiplier", "", "Set DRY sampling multiplier (default: 0.0, 0.0 = disabled)."],
  ["--dry-base", "", "Set DRY sampling base value (default: 1.75)."],
  ["--dry-allowed-length", "", "Set allowed length for DRY sampling (default: 2)."],
  ["--dry-penalty-last-n", "", "Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size)."],
  ["--dynatemp-range", "", "Dynamic temperature range (default: 0.0, 0.0 = disabled)."],
  ["--dynatemp-exp", "", "Dynamic temperature exponent (default: 1.0)."],
  ["--mirostat", "", "Use Mirostat sampling (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)."],
  ["--mirostat-lr", "", "Mirostat learning rate, parameter eta (default: 0.1)."],
  ["--mirostat-ent", "", "Mirostat target entropy, parameter tau (default: 5.0)."],
  ["--port", "", "Port to listen (default: 8080)."],
  ["--timeout", "-to", "Server read/write timeout in seconds (default: 600)."],
  ["--threads-http", "", "Number of threads used to process HTTP requests (default: -1)."],
  ["--cache-reuse", "", "Min chunk size to attempt reusing from the cache via KV shifting (default: 0)."],
  ["--slot-prompt-similarity", "-sps", "How much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)."],
  ["--draft-max", "", "Number of tokens to draft for speculative decoding (default: 16)."],
  ["--draft", "", "Number of tokens to draft for speculative decoding (default: 16)."],
  ["--draft-n", "", "Number of tokens to draft for speculative decoding (default: 16)."],
  ["--draft-min", "", "Minimum number of draft tokens to use for speculative decoding (default: 5)."],
  ["--draft-n-min", "", "Minimum number of draft tokens to use for speculative decoding (default: 5)."],
  ["--draft-p-min", "", "Minimum speculative decoding probability (greedy) (default: 0.9)."],
  ["--ctx-size-draft", "-cd", "Size of the prompt context for the draft model (default: 0, 0 = loaded from model)."],
  ["--gpu-layers-draft", "-ngld", "Number of layers to store in VRAM for the draft model."],
  ["--n-gpu-layers-draft", "-ngld", "Number of layers to store in VRAM for the draft model."],
] as const;