import { open } from "node:fs/promises";
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import { NodeHtmlMarkdown } from "node-html-markdown";
import { exec } from "./utils.js";

export type ExtractionStatus = "success" | "failed" | "unsupported";

export interface ExtractedContent {
  extracted: string;
  title?: string;
  extractor?: string;
  extraction_status?: ExtractionStatus;
  content_type?: string;
}

export interface FileExtractor {
  format: string;
  shouldReadText: boolean;
  extractorName?: string;
  content_type?: string;
  matches(filePath: string): boolean;
  extract(args: FileExtractArgs): Promise<string> | string;
}

interface FileExtractArgs {
  pi: ExtensionAPI;
  filePath: string;
  content: string;
  signal?: AbortSignal;
}

interface UrlExtractor {
  matches(url: string): boolean;
  extract(args: UrlExtractArgs): Promise<ExtractedContent>;
}

interface UrlExtractArgs {
  pi: ExtensionAPI;
  url: string;
  signal?: AbortSignal;
}

// ---------------------------------------------------------------------------
// Binary magic byte detection
// ---------------------------------------------------------------------------

const BINARY_SIGNATURES: Array<{ bytes: number[]; format: string }> = [
  // Archives & documents
  { bytes: [0x50, 0x4b, 0x03, 0x04], format: "zip" }, // ZIP / DOCX / XLSX / PPTX / JAR
  { bytes: [0x25, 0x50, 0x44, 0x46], format: "pdf" }, // %PDF
  { bytes: [0x37, 0x7a, 0xbc, 0xaf], format: "7z" }, // 7-Zip
  { bytes: [0x1f, 0x8b], format: "gzip" }, // gzip / .tar.gz
  // Images
  { bytes: [0x89, 0x50, 0x4e, 0x47], format: "png" }, // PNG
  { bytes: [0xff, 0xd8, 0xff], format: "jpeg" }, // JPEG
  { bytes: [0x47, 0x49, 0x46, 0x38], format: "gif" }, // GIF8
  { bytes: [0x42, 0x4d], format: "bmp" }, // BMP
  { bytes: [0x49, 0x49, 0x2a, 0x00], format: "tiff" }, // TIFF (little-endian)
  { bytes: [0x4d, 0x4d, 0x00, 0x2a], format: "tiff" }, // TIFF (big-endian)
  { bytes: [0x52, 0x49, 0x46, 0x46], format: "riff" }, // RIFF (WAV / AVI / WebP)
  // Executables & binaries
  { bytes: [0x4d, 0x5a], format: "exe" }, // Windows PE (EXE / DLL)
  { bytes: [0xcf, 0xfa, 0xed, 0xfe], format: "macho" }, // Mach-O 64-bit LE
  { bytes: [0xce, 0xfa, 0xed, 0xfe], format: "macho" }, // Mach-O 32-bit LE
  { bytes: [0xfe, 0xed, 0xfa, 0xcf], format: "macho" }, // Mach-O 64-bit BE
  { bytes: [0xfe, 0xed, 0xfa, 0xce], format: "macho" }, // Mach-O 32-bit BE
  { bytes: [0xca, 0xfe, 0xba, 0xbe], format: "class" }, // Java .class / Mach-O FAT
  { bytes: [0x7f, 0x45, 0x4c, 0x46], format: "elf" }, // ELF binary
  { bytes: [0x00, 0x61, 0x73, 0x6d], format: "wasm" }, // WebAssembly
  // Data & media
  { bytes: [0x53, 0x51, 0x4c, 0x69], format: "sqlite" }, // SQLite
  { bytes: [0x49, 0x44, 0x33], format: "mp3" }, // MP3 (ID3 tag)
];

/**
 * Reads the first 8 bytes of `filePath` and checks them against known binary
 * magic byte signatures. Returns the detected format name or `null` for text.
 */
export async function detectBinaryMagicBytes(filePath: string): Promise<string | null> {
  let handle: import("node:fs/promises").FileHandle | undefined;
  try {
    handle = await open(filePath, "r");
    const buf = Buffer.alloc(8);
    const { bytesRead } = await handle.read(buf, 0, 8, 0);
    const header = buf.subarray(0, bytesRead);

    for (const { bytes, format } of BINARY_SIGNATURES) {
      if (bytes.every((b, i) => header[i] === b)) return format;
    }
    return null;
  } catch {
    return null; // Unreadable file — let the extractor deal with it
  } finally {
    await handle?.close();
  }
}

export function binaryExtractionFailureMessage(format: string): string {
  return `_Binary file could not be converted to markdown (detected format: ${format}).\nCapture a text-based version or a URL pointing to readable content instead._\n`;
}

// ---------------------------------------------------------------------------

const DEFAULT_MARKITDOWN_TIMEOUT_MS = 180_000;
const DEFAULT_CURL_TIMEOUT_SECONDS = 30;

const FILE_EXTRACTORS: FileExtractor[] = [
  {
    format: "pdf",
    shouldReadText: false,
    extractorName: "markitdown",
    content_type: "application/pdf",
    matches: hasExtension(".pdf"),
    extract: ({ pi, filePath, signal }) => extractPdf(pi, filePath, signal),
  },
  textFileExtractor("markdown", [".md"], "text/markdown"),
  textFileExtractor("text", [".txt"], "text/plain"),
  textFileExtractor("html", [".html", ".htm"], "text/html"),
  {
    format: "xml",
    shouldReadText: true,
    extractorName: "xmlToMarkdown",
    content_type: "application/xml",
    matches: hasExtension(".xml"),
    extract: ({ content }) => xmlToMarkdown(content),
  },
  {
    format: "json",
    shouldReadText: true,
    extractorName: "jsonToMarkdown",
    content_type: "application/json",
    matches: hasExtension(".json"),
    extract: ({ content }) => jsonToMarkdown(content),
  },
  {
    format: "docx",
    shouldReadText: false,
    extractorName: "markitdown",
    content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    matches: hasExtension(".docx"),
    extract: ({ pi, filePath, signal }) => extractDocx(pi, filePath, signal),
  },
  textFileExtractor("file", []),
];

const URL_EXTRACTORS: UrlExtractor[] = [
  {
    matches: isPdfUrl,
    extract: ({ pi, url, signal }) => extractPdfUrl(pi, url, signal),
  },
  {
    matches: () => true,
    extract: ({ pi, url, signal }) => extractTextUrl(pi, url, signal),
  },
];

export function fileExtractorFor(filePath: string): FileExtractor {
  return (
    FILE_EXTRACTORS.find((extractor) => extractor.matches(filePath)) ?? FILE_EXTRACTORS.at(-1)!
  );
}

export function extractUrlContent(
  pi: ExtensionAPI,
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent> {
  const extractor =
    URL_EXTRACTORS.find((candidate) => candidate.matches(url)) ?? URL_EXTRACTORS.at(-1)!;
  return extractor.extract({ pi, url, signal });
}

export function pdfExtractionFailureMessage(source: string): string {
  return `_PDF content could not be converted to markdown from ${source}. Try increasing WIKI_MARKITDOWN_TIMEOUT_MS._\n`;
}

function textFileExtractor(
  format: string,
  extensions: string[],
  contentType?: string,
): FileExtractor {
  return {
    format,
    shouldReadText: true,
    extractorName: "passthrough",
    content_type: contentType,
    matches: extensions.length ? hasAnyExtension(extensions) : () => true,
    extract: ({ content }) => content,
  };
}

function hasExtension(extension: string): (path: string) => boolean {
  return (path) => path.toLowerCase().endsWith(extension);
}

function hasAnyExtension(extensions: string[]): (path: string) => boolean {
  return (path) => extensions.some((extension) => hasExtension(extension)(path));
}

async function extractPdf(pi: ExtensionAPI, source: string, signal?: AbortSignal): Promise<string> {
  const extracted = await extractWithMarkItDown(pi, source, signal);
  return extracted || pdfExtractionFailureMessage(source);
}

export function docxExtractionFailureMessage(source: string): string {
  return `_DOCX content could not be converted to markdown from ${source}. Ensure uvx and markitdown are installed._\n`;
}

async function extractDocx(
  pi: ExtensionAPI,
  source: string,
  signal?: AbortSignal,
): Promise<string> {
  const extracted = await extractWithMarkItDown(pi, source, signal);
  return extracted || docxExtractionFailureMessage(source);
}

async function extractPdfUrl(
  pi: ExtensionAPI,
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent> {
  const extracted = await extractPdf(pi, url, signal);
  const failed = extracted.includes("could not be converted");
  return {
    extracted,
    title: titleFromMarkdown(extracted),
    extractor: "markitdown",
    extraction_status: failed ? "failed" : "success",
    content_type: "application/pdf",
  };
}

async function extractTextUrl(
  pi: ExtensionAPI,
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent> {
  const markitdownExtracted = await extractWithMarkItDown(pi, url, signal);
  if (markitdownExtracted) {
    return {
      extracted: markitdownExtracted,
      title: titleFromMarkdown(markitdownExtracted),
      extractor: "markitdown",
      extraction_status: "success",
    };
  }

  const curlExtracted = await fetchTextUrl(pi, url, signal);
  if (!curlExtracted) return { extracted: "", extractor: "none", extraction_status: "failed" };
  if (looksLikePdf(curlExtracted)) {
    return {
      extracted: pdfExtractionFailureMessage(url),
      extractor: "curl",
      extraction_status: "failed",
      content_type: "application/pdf",
    };
  }
  const normalized = htmlToMarkdown(curlExtracted);
  return {
    extracted: normalized,
    title: titleFromMarkdown(normalized) ?? titleFromHtml(curlExtracted),
    extractor: "htmlToMarkdown",
    extraction_status: "success",
  };
}

async function extractWithMarkItDown(
  pi: ExtensionAPI,
  source: string,
  signal?: AbortSignal,
): Promise<string> {
  if (!(await hasMarkItDown(pi, signal))) return "";

  try {
    const mdResult = await exec(
      pi,
      "sh",
      ["-c", `uvx --from 'markitdown[docx,pdf]' markitdown "${source}" 2>/dev/null || echo ""`],
      { signal, timeout: markitdownTimeoutMs() },
    );
    return mdResult.stdout.trim() ? mdResult.stdout : "";
  } catch {
    return "";
  }
}

async function hasMarkItDown(pi: ExtensionAPI, signal?: AbortSignal): Promise<boolean> {
  const markitdown = await exec(
    pi,
    "sh",
    ["-c", `which uvx >/dev/null 2>&1 && echo "yes" || echo "no"`],
    { signal },
  );
  return markitdown.stdout.trim() === "yes";
}

async function fetchTextUrl(pi: ExtensionAPI, url: string, signal?: AbortSignal): Promise<string> {
  try {
    const curlResult = await exec(
      pi,
      "curl",
      ["-sL", "--max-time", String(DEFAULT_CURL_TIMEOUT_SECONDS), url],
      {
        signal,
        timeout: (DEFAULT_CURL_TIMEOUT_SECONDS + 5) * 1_000,
      },
    );
    return curlResult.stdout || "";
  } catch {
    return "";
  }
}

function markitdownTimeoutMs(): number {
  return positiveIntegerFromEnv("WIKI_MARKITDOWN_TIMEOUT_MS", DEFAULT_MARKITDOWN_TIMEOUT_MS);
}

function positiveIntegerFromEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (!raw) return fallback;
  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

function isPdfUrl(url: string): boolean {
  try {
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return url.toLowerCase().split(/[?#]/, 1)[0].endsWith(".pdf");
  }
}

function looksLikePdf(content: string): boolean {
  return content.trimStart().startsWith("%PDF-");
}

function titleFromMarkdown(markdown: string): string | undefined {
  return markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
}

function titleFromHtml(html: string): string | undefined {
  return html.match(/<title>([^<]*)<\/title>/i)?.[1]?.trim();
}

/** Decode common HTML/XML entities. Shared by xmlToMarkdown and htmlToMarkdown. */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(?:amp|lt|gt|quot|apos|#\d+);/gi, (entity) => {
    const map: Record<string, string> = {
      "&amp;": "&",
      "&lt;": "<",
      "&gt;": ">",
      "&quot;": '"',
      "&apos;": "'",
    };
    const lower = entity.toLowerCase();
    if (map[lower]) return map[lower];
    if (lower.startsWith("&#")) return String.fromCodePoint(Number.parseInt(entity.slice(2, -1)));
    return entity;
  });
}

/** Basic XML to markdown conversion: strip tags while preserving text structure. */
function xmlToMarkdown(xml: string): string {
  let title = "";
  const titleMatch = xml.match(/<title[^>]*>([^<]*)<\/title>/i);
  if (titleMatch) title = titleMatch[1].trim();

  let text = xml.replace(/<\?xml[^>]*\?>\s*/gi, "");
  text = text.replace(/<!DOCTYPE[^>]*>\s*/gi, "");
  text = text.replace(/<\/(p|div|section|article|li|h\d|tr|blockquote|pre)>/gi, "\n");
  text = text.replace(/<br\s*\/?>/gi, "\n");

  let prev = "";
  while (prev !== text) {
    prev = text;
    text = text.replace(/<[a-zA-Z\/!?][^>]*>/g, "");
  }
  text = text.replace(/</g, "");

  text = decodeHtmlEntities(text);
  text = text.replace(/\n{3,}/g, "\n\n").trim();
  if (!text) return xml;

  const lines = [];
  if (title) lines.push(`# ${title}\n`);
  lines.push(text);
  return lines.join("\n\n");
}

/**
 * Lightweight HTML-to-markdown normalizer for the curl fallback path.
 *
 * Pre-strips page chrome (nav, header, footer, script, style) that
 * node-html-markdown does not remove, then delegates full conversion —
 * bold, italic, code blocks, tables, ordered lists, image alt text — to
 * node-html-markdown. Prepends the <title> as a # heading when the body
 * has no <h1> of its own.
 *
 * Falls back to the original HTML if conversion yields an empty string.
 */
export function htmlToMarkdown(input: string): string {
  // 1. Extract <title> from original before stripping head
  const title = input.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]?.trim() ?? "";

  // 2. Strip <head> and noise blocks that node-html-markdown won't remove
  let html = input.replace(/<head[\s\S]*?<\/head>/gi, "");
  let previousHtml = "";
  while (previousHtml !== html) {
    previousHtml = html;
    html = html.replace(/<(script|style|nav|header|footer|noscript)[\s\S]*?<\/\1>/gi, "");
  }

  // 3. Delegate to node-html-markdown for full semantic conversion
  const converted = NodeHtmlMarkdown.translate(html).trim();
  if (!converted) return input;

  // 4. Prepend <title> as # heading only if body has no <h1> of its own
  const hasBodyH1 = /<h1[^>]*>[\s\S]*?<\/h1>/i.test(html);
  const lines: string[] = [];
  if (title && !hasBodyH1) lines.push(`# ${title}\n`);
  lines.push(converted);
  return lines.join("\n");
}

function jsonToMarkdown(json: string): string {
  let value: unknown;
  try {
    value = JSON.parse(json);
  } catch {
    return json;
  }

  const lines: string[] = [];
  const title = titleFromValue(value) || "JSON Extract";
  lines.push(`# ${title}`, "");
  renderJsonValue(value, lines, 0);

  const markdown = lines
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
  return markdown || json;
}

function titleFromValue(value: unknown): string | undefined {
  if (!isRecord(value)) return undefined;
  for (const key of ["title", "name", "id"]) {
    const candidate = value[key];
    if (typeof candidate === "string" && candidate.trim()) return candidate.trim();
  }
  return undefined;
}

function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

function renderJsonValue(value: unknown, lines: string[], depth: number, label?: string): void {
  if (Array.isArray(value)) {
    renderJsonArray(value, lines, depth, label);
    return;
  }

  if (isRecord(value)) {
    renderJsonObject(value, lines, depth, label);
    return;
  }

  if (label) lines.push(`${indent(depth)}- **${humanizeKey(label)}:** ${formatJsonScalar(value)}`);
  else lines.push(`${indent(depth)}- ${formatJsonScalar(value)}`);
}

function renderJsonObject(
  object: Record<string, unknown>,
  lines: string[],
  depth: number,
  label?: string,
): void {
  if (label) {
    lines.push(`${heading(depth)} ${humanizeKey(label)}`, "");
  }

  for (const [key, value] of Object.entries(object)) {
    if (Array.isArray(value) || isRecord(value)) {
      const childDepth = label ? depth + 1 : depth;
      renderJsonValue(value, lines, childDepth, key);
    } else {
      lines.push(`${indent(depth)}- **${humanizeKey(key)}:** ${formatJsonScalar(value)}`);
    }
  }
  lines.push("");
}

function renderJsonArray(array: unknown[], lines: string[], depth: number, label?: string): void {
  if (label) lines.push(`${heading(depth)} ${humanizeKey(label)}`, "");

  if (array.length === 0) {
    lines.push(`${indent(depth)}- _(empty)_`, "");
    return;
  }

  for (const [index, item] of array.entries()) {
    if (isRecord(item)) {
      const itemTitle = titleFromValue(item) || `Item ${index + 1}`;
      const itemDepth = label ? depth + 1 : depth;
      lines.push(`${heading(itemDepth)} ${itemTitle}`, "");
      renderJsonObject(item, lines, itemDepth);
    } else if (Array.isArray(item)) {
      lines.push(`${indent(depth)}- Item ${index + 1}:`);
      renderJsonArray(item, lines, depth + 1);
    } else {
      lines.push(`${indent(depth)}- ${formatJsonScalar(item)}`);
    }
  }
  lines.push("");
}

function formatJsonScalar(value: unknown): string {
  if (value === null) return "null";
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean") return String(value);
  return String(value);
}

function humanizeKey(key: string): string {
  return key
    .replace(/[_-]+/g, " ")
    .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
    .replace(/\s+/g, " ")
    .trim()
    .replace(/^./, (char) => char.toUpperCase());
}

function heading(depth: number): string {
  return "#".repeat(Math.min(depth + 2, 6));
}

function indent(depth: number): string {
  return "  ".repeat(Math.max(0, depth));
}