import {describe, it, expect, vi, beforeEach} from 'vitest';

// Mock xrblocks so importing `Script` doesn't trigger the Core singleton,
// which constructs a real AudioContext (jsdom can't provide one). We
// only need Script (as a generic Object3D base), the VisemeWeights type,
// and ZERO_VISEME (used by dispose to reset the target).
vi.mock('xrblocks', async () => {
  const T = await import('three');
  return {
    Script: T.Object3D,
    core: {camera: undefined},
    ZERO_VISEME: {
      jawOpen: 0,
      aa: 0,
      oo: 0,
      oh: 0,
      ee: 0,
      consonant: 0,
    },
  };
});

import {LipsyncMouth, VisemeTarget} from './LipsyncMouth';
import type {VisemeWeights} from './BlendshapeReducer';
import {ZERO_VISEME} from './BlendshapeReducer';

/**
 * Stand-in for an xrblocks `StylizedFace`: captures the most recent
 * viseme weights so tests can assert against them. Real consumers
 * pass `user.avatar.face` or a freshly constructed `StylizedFace`.
 */
class FakeTarget implements VisemeTarget {
  visemes: VisemeWeights = {...ZERO_VISEME};
  setVisemes = vi.fn((v: VisemeWeights) => {
    this.visemes = {...v};
  });
}

// Minimal in-memory Web Audio mock: only what LipsyncMouth touches.
// AnalyserNode emits fixed-shape (silent) buffers; tests stub
// freqData/timeData via `(node as any).__setSpectrum()` to drive the mapper.
class MockAnalyserNode {
  fftSize = 1024;
  frequencyBinCount = 512;
  smoothingTimeConstant = 0.4;
  private _freq = new Uint8Array(this.frequencyBinCount);
  private _freqDb = new Float32Array(this.frequencyBinCount).fill(-120);
  private _time = new Uint8Array(this.fftSize).fill(128);
  connect = vi.fn();
  disconnect = vi.fn();
  getByteFrequencyData(out: Uint8Array) {
    out.set(this._freq);
  }
  getFloatFrequencyData(out: Float32Array) {
    out.set(this._freqDb);
  }
  getByteTimeDomainData(out: Uint8Array) {
    out.set(this._time);
  }
  __setLoudVoiced() {
    // Strong low-band + a F1/F2-shaped pair, plus non-silent time domain.
    for (let i = 0; i < 30; i++) this._freq[i] = 200;
    this._freq[20] = 255;
    this._freq[48] = 230;
    for (let i = 0; i < this._time.length; i++) {
      this._time[i] = 128 + Math.round(64 * Math.sin((i / 8) * Math.PI));
    }
  }
  __setSilent() {
    this._freq.fill(0);
    this._freqDb.fill(-120);
    this._time.fill(128);
  }
  /** Set the time-domain buffer so computeAudioFeatures sees ~targetRms. */
  __setRms(targetRms: number) {
    // sin wave amplitude a → RMS = a / sqrt(2). Solve for the byte
    // amplitude needed: int 128 + a*128 produces amplitude a in
    // [-1,1] space, hence RMS = a/sqrt(2). So a = targetRms * sqrt(2).
    const a = Math.min(0.99, targetRms * Math.SQRT2);
    for (let i = 0; i < this._time.length; i++) {
      this._time[i] = 128 + Math.round(a * 127 * Math.sin((i / 8) * Math.PI));
    }
    this._freq.fill(0);
    this._freqDb.fill(-120);
  }
}

class MockMediaStreamSource {
  connect = vi.fn();
  disconnect = vi.fn();
}

class MockAudioContext {
  sampleRate = 48000;
  state = 'running';
  createAnalyser = vi.fn(() => new MockAnalyserNode());
  createMediaStreamSource = vi.fn(() => new MockMediaStreamSource());
  resume = vi.fn(() => Promise.resolve());
  close = vi.fn(() => Promise.resolve());
}

function makeStream(): MediaStream {
  // jsdom provides a MediaStream shim sufficient for our needs.
  return new (globalThis.MediaStream ??
    (class {} as unknown as typeof MediaStream))();
}

let ctx: MockAudioContext;

beforeEach(() => {
  ctx = new MockAudioContext();
});

describe('LipsyncMouth', () => {
  it('is a THREE.Object3D suitable for parenting to a head pivot', () => {
    const m = new LipsyncMouth(makeStream(), {
      target: new FakeTarget(),
      audioContext: ctx as unknown as AudioContext,
    });
    expect(m.isObject3D).toBe(true);
  });

  it('constructor + init() builds the audio graph from the injected AudioContext', async () => {
    const m = new LipsyncMouth(makeStream(), {
      target: new FakeTarget(),
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    expect(ctx.createMediaStreamSource).toHaveBeenCalled();
    expect(ctx.createAnalyser).toHaveBeenCalled();
  });

  it('drives the supplied target instead of owning a visual itself', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    // No visual children — the target is the only thing rendering.
    expect(m.children.length).toBe(0);
    expect(m.target).toBe(target);
  });

  it('update() drives the target visemes when audio is loud / voiced', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    // Drive enough frames to overcome the smoothing time constant.
    const analyser = ctx.createAnalyser.mock.results[0]
      .value as MockAnalyserNode;
    analyser.__setLoudVoiced();
    for (let i = 0; i < 50; i++) m.update(i * 0.016);
    expect(target.visemes.jawOpen).toBeGreaterThan(0.05);
  });

  it('silent input → target stays at rest', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    for (let i = 0; i < 50; i++) m.update(i * 16);
    expect(target.visemes.jawOpen).toBeLessThan(0.05);
  });

  it('loud then silent: brief silence holds visemes; sustained silence decays them', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
      // Default silenceHoldMs is 150; keep default for this test.
    });
    await m.init();
    const analyser = ctx.createAnalyser.mock.results[0]
      .value as MockAnalyserNode;
    analyser.__setLoudVoiced();
    for (let i = 0; i < 60; i++) m.update(i * 16);
    const peakJaw = target.visemes.jawOpen;
    expect(peakJaw).toBeGreaterThan(0.05);

    // First silent frames within the 150 ms hold window: target held in
    // place, no decay started yet. Brief gaps (~one frame) between
    // syllables should not cause any jitter — verified by the call
    // count not advancing during the hold.
    analyser.__setSilent();
    const callsBeforeHold = target.setVisemes.mock.calls.length;
    m.update(60 * 16 + 16);
    m.update(60 * 16 + 80);
    expect(target.setVisemes.mock.calls.length).toBe(callsBeforeHold);
    expect(target.visemes.jawOpen).toBe(peakJaw);

    // Past the hold window: mapper smoothing starts pulling toward zero.
    for (let i = 0; i < 40; i++) m.update(60 * 16 + 200 + i * 16);
    expect(target.visemes.jawOpen).toBeLessThan(0.02);
    expect(target.visemes.aa).toBeLessThan(0.02);
    expect(target.visemes.ee).toBeLessThan(0.02);
    expect(target.visemes.oo).toBeLessThan(0.02);
    expect(target.visemes.consonant).toBeLessThan(0.02);
  });

  it('voiced resumes mid-hold: silence timer resets, target never began decaying', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    const analyser = ctx.createAnalyser.mock.results[0]
      .value as MockAnalyserNode;
    analyser.__setLoudVoiced();
    for (let i = 0; i < 60; i++) m.update(i * 16);
    const peakJaw = target.visemes.jawOpen;

    // 100 ms silent gap (within the 150 ms hold), then voiced again.
    analyser.__setSilent();
    m.update(60 * 16 + 50);
    m.update(60 * 16 + 100);
    expect(target.visemes.jawOpen).toBe(peakJaw);
    analyser.__setLoudVoiced();
    m.update(60 * 16 + 116);
    expect(target.visemes.jawOpen).toBeGreaterThan(peakJaw * 0.8);
  });

  it('Schmitt hysteresis: noise-floor RMS chatter around silenceThreshold still accumulates the hold timer', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
      silenceHoldMs: 100,
    });
    await m.init();
    const analyser = ctx.createAnalyser.mock.results[0]
      .value as MockAnalyserNode;
    analyser.__setLoudVoiced();
    for (let i = 0; i < 60; i++) m.update(i * 16);
    const peakJaw = target.visemes.jawOpen;
    expect(peakJaw).toBeGreaterThan(0.05);

    // Now drop to noise-floor chatter: alternating RMS just below and
    // just above the entry threshold (0.01), but always below the
    // exit threshold (0.0125).
    for (let i = 0; i < 30; i++) {
      analyser.__setRms(i % 2 === 0 ? 0.008 : 0.011);
      m.update(60 * 16 + i * 16);
    }
    // After ~480 ms of chatter we should be past the 100 ms hold and
    // well into mapper decay; the target must have moved off its peak.
    expect(target.visemes.jawOpen).toBeLessThan(peakJaw * 0.4);
  });

  it('dispose() resets the target to ZERO_VISEME so a face never freezes mid-vowel', async () => {
    const target = new FakeTarget();
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    const analyser = ctx.createAnalyser.mock.results[0]
      .value as MockAnalyserNode;
    analyser.__setLoudVoiced();
    for (let i = 0; i < 60; i++) m.update(i * 16);
    // Sanity: the target was actively driven open before dispose.
    expect(target.visemes.jawOpen).toBeGreaterThan(0.05);
    m.dispose();
    expect(target.visemes).toEqual(ZERO_VISEME);
  });

  it('dispose() disconnects analyser + source but does NOT dispose the target (caller owns it)', async () => {
    const target = new FakeTarget();
    const disposeSpy = vi.fn();
    (target as unknown as {dispose: () => void}).dispose = disposeSpy;
    const m = new LipsyncMouth(makeStream(), {
      target,
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    const source = ctx.createMediaStreamSource.mock.results[0]
      .value as MockMediaStreamSource;
    const analyser = ctx.createAnalyser.mock.results[0]
      .value as MockAnalyserNode;
    m.dispose();
    expect(source.disconnect).toHaveBeenCalled();
    expect(analyser.disconnect).toHaveBeenCalled();
    expect(disposeSpy).not.toHaveBeenCalled();
  });

  it('dispose() does NOT close the injected AudioContext (caller owns it)', async () => {
    const m = new LipsyncMouth(makeStream(), {
      target: new FakeTarget(),
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    m.dispose();
    expect(ctx.close).not.toHaveBeenCalled();
  });

  it('dispose() does NOT stop MediaStream tracks (caller owns the stream)', async () => {
    const stream = makeStream();
    const track = {
      stop: vi.fn(),
      kind: 'audio',
      enabled: true,
    } as unknown as MediaStreamTrack;
    // jsdom MediaStream doesn't expose addTrack consistently; monkey-patch
    // getTracks instead since that's what consumers iterate.
    (stream as unknown as {getTracks: () => MediaStreamTrack[]}).getTracks =
      () => [track];
    const m = new LipsyncMouth(stream, {
      target: new FakeTarget(),
      audioContext: ctx as unknown as AudioContext,
    });
    await m.init();
    m.dispose();
    expect(track.stop).not.toHaveBeenCalled();
  });

  it('two LipsyncMouths can share one AudioContext and one target each', async () => {
    const t1 = new FakeTarget();
    const t2 = new FakeTarget();
    const m1 = new LipsyncMouth(makeStream(), {
      target: t1,
      audioContext: ctx as unknown as AudioContext,
    });
    const m2 = new LipsyncMouth(makeStream(), {
      target: t2,
      audioContext: ctx as unknown as AudioContext,
    });
    await m1.init();
    await m2.init();
    expect(ctx.createMediaStreamSource).toHaveBeenCalledTimes(2);
    // Disposing one leaves the other working.
    m1.dispose();
    expect(ctx.close).not.toHaveBeenCalled();
    expect(m2.target).toBe(t2);
    m2.dispose();
  });
});