"""Ingestion pipeline — orchestrates all 7 phases.

Phase 1: Structure (walk filesystem)
Phase 2: Parsing (Tree-sitter AST extraction)
Phase 3: Resolution (cross-file references)
Phase 4: Clustering (community detection)
Phase 5: Processes (execution flow tracing)
Phase 6: Documents (chunk & link docs to code)
Phase 7: Embeddings (vector embeddings for hybrid search)
"""

from __future__ import annotations

import time

from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn

from ..models import NodeType, ProjectIndex
from ._console import console
from .walker import walk_project
from .parser import parse_all_files
from .resolver import resolve_references
from .cluster import compute_clusters
from .process_tracer import trace_processes
from .doc_chunker import ingest_documents



class PipelineInterrupted(Exception):
    """Signal that the ingestion pipeline was cancelled by the user (Ctrl+C).

    ``run_pipeline`` raises this in place of the ``KeyboardInterrupt`` it
    intercepts while the phases are running.
    """


def run_pipeline(root_path: str) -> ProjectIndex:
    """Run the full ingestion pipeline on a project directory.

    The seven phases run sequentially under a single progress display.
    Phases 1-6 accumulate nodes and edges into two shared lists; phase 7
    then generates embeddings from the assembled index.

    Args:
        root_path: Project directory handed to the structure walk and to
            document ingestion.

    Returns:
        A ProjectIndex holding the walked files and all nodes and edges
        produced by phases 1-6.

    Raises:
        PipelineInterrupted: If a KeyboardInterrupt (Ctrl+C) is raised at
            any point while the phases are running.
    """
    console.print(f"\n[bold blue]Indexing:[/bold blue] {root_path}\n")
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments while the pipeline runs.
    start = time.perf_counter()

    phases = (
        "Phase 1: Structure",
        "Phase 2: Parsing",
        "Phase 3: Resolution",
        "Phase 4: Clustering",
        "Phase 5: Processes",
        "Phase 6: Documents",
        "Phase 7: Embeddings",
    )

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            main_task = progress.add_task("Indexing...", total=len(phases))

            # Phase 1: Structure
            progress.update(main_task, description=phases[0])
            files, structure_nodes, structure_edges = walk_project(root_path)
            progress.advance(main_task)

            # Phase 2: Parsing
            progress.update(main_task, description=f"{phases[1]} ({len(files)} files)")
            symbol_nodes, symbol_edges = parse_all_files(files)
            progress.advance(main_task)

            # Combine: reuse the phase-1 lists and extend them in place so
            # the node/edge collections are never copied.
            all_nodes = structure_nodes
            all_nodes.extend(symbol_nodes)
            all_edges = structure_edges
            all_edges.extend(symbol_edges)

            # Phase 3: Resolution
            progress.update(main_task, description=phases[2])
            new_edges = resolve_references(files, all_nodes, all_edges)
            # Drop every edge whose target uid still carries the
            # "unresolved:" prefix, then add the edges the resolver returned.
            # Slice assignment keeps the same list object.
            all_edges[:] = [
                e for e in all_edges if not e.target_uid.startswith("unresolved:")
            ]
            all_edges.extend(new_edges)
            progress.advance(main_task)

            # Free cached source text so it is not kept alive through the
            # remaining phases; the returned index carries the files with an
            # empty source_text.
            for f in files:
                f.source_text = ""

            # Phase 4: Clustering
            progress.update(main_task, description=phases[3])
            cluster_nodes, cluster_edges = compute_clusters(all_nodes, all_edges)
            all_nodes.extend(cluster_nodes)
            all_edges.extend(cluster_edges)
            progress.advance(main_task)

            # Phase 5: Process Tracing
            progress.update(main_task, description=phases[4])
            process_nodes, process_edges = trace_processes(all_nodes, all_edges)
            all_nodes.extend(process_nodes)
            all_edges.extend(process_edges)
            progress.advance(main_task)

            # Phase 6: Document Ingestion (only function/class/method nodes
            # are offered to the document linker).
            progress.update(main_task, description=phases[5])
            _code_types = frozenset((NodeType.FUNCTION, NodeType.CLASS, NodeType.METHOD))
            code_symbols = [n for n in all_nodes if n.node_type in _code_types]
            doc_nodes, doc_edges = ingest_documents(root_path, code_symbols)
            all_nodes.extend(doc_nodes)
            all_edges.extend(doc_edges)
            progress.advance(main_task)

            index = ProjectIndex(
                root_path=root_path,
                files=files,
                nodes=all_nodes,
                edges=all_edges,
            )

            # Phase 7: Embeddings
            progress.update(main_task, description=phases[6])
            embed_count = _run_embedding_phase(index)
            progress.advance(main_task)

    except KeyboardInterrupt as exc:
        console.print("\n[bold red]Cancelled![/bold red] Pipeline interrupted by user.")
        # Chain explicitly so the original interrupt stays attached as the cause.
        raise PipelineInterrupted("Cancelled by user") from exc

    elapsed = time.perf_counter() - start
    stats = index.stats
    console.print(f"\n[bold green]Done![/bold green] {elapsed:.1f}s")
    console.print(f"  Nodes: {stats['nodes']}  Edges: {stats['edges']}")
    console.print(f"  Types: {stats['node_types']}")
    console.print(f"  Edges: {stats['edge_types']}")
    # A count of 0 means the embedding phase was skipped or stored nothing.
    if embed_count:
        console.print(f"  Vectors: {embed_count}")

    return index


def _run_embedding_phase(index: ProjectIndex) -> int:
    """Phase 7: Generate and store embeddings for code symbols.

    Only FUNCTION, CLASS and METHOD nodes from ``index.nodes`` are embedded.
    The text for each symbol is its name, signature and docstring joined by
    single spaces. The existing collection is deleted before anything else,
    so every run is a fresh re-index.

    The phase is best-effort: any failure is reported on the console and
    converted into a return value of 0 instead of aborting the pipeline.

    Args:
        index: The assembled project index whose nodes are scanned for
            embeddable symbols.

    Returns:
        The vector count reported by the store after the upsert, or 0 when
        the phase was skipped (import failure, no embeddable symbols, or any
        other error).
    """
    try:
        console.print("[blue]  Phase 7:[/blue] Generating embeddings...")
        try:
            # Imported lazily and inside the ImportError guard: a failure to
            # import the search modules must skip this phase rather than
            # crash the pipeline after the earlier phases succeeded.
            from ..search.embedder import Embedder
            from ..search.vector_store import VectorStore

            embedder = Embedder()
            store = VectorStore()
        except ImportError as exc:
            console.print(f"[dim]  Skipping embeddings ({exc})[/dim]")
            console.print("[dim]  For hybrid search: pip install sentence-transformers qdrant-client[/dim]")
            return 0
        store.delete_collection()  # Fresh re-index

        # Collect embeddable symbols
        embeddable_types = frozenset((NodeType.FUNCTION, NodeType.CLASS, NodeType.METHOD))
        symbols = [n for n in index.nodes if n.node_type in embeddable_types]

        if not symbols:
            return 0

        # Build texts, ids, and payloads in a single pass so the three lists
        # stay index-aligned.
        texts = []
        ids = []
        payloads = []
        for sym in symbols:
            parts = [sym.name]
            if sym.signature:
                parts.append(sym.signature)
            if sym.docstring:
                parts.append(sym.docstring)
            texts.append(" ".join(parts))
            ids.append(sym.uid)
            payloads.append({
                "name": sym.name,
                "node_type": sym.node_type.value,
                "file_path": sym.file_path,
                "line_start": sym.line_start,
                "signature": sym.signature or "",
                # The payload keeps only the first 200 characters of the docstring.
                "docstring": (sym.docstring or "")[:200],
            })

        # Embed
        vectors = embedder.embed_texts(texts)

        # Store
        store.upsert(ids, vectors, payloads)
        count = store.count()
        console.print(f"  [green]✓[/green] Stored {count} vectors in Qdrant")
        return count

    except Exception as exc:
        # Deliberate best-effort boundary: embeddings are optional, so report
        # the failure and let the pipeline finish without vectors.
        console.print(f"[yellow]  Embedding phase skipped:[/yellow] {exc}")
        return 0
