import React, { useEffect, useRef, useMemo, useState, forwardRef, useImperativeHandle } from 'react'; import { clsx } from 'clsx'; import './remote-control.css'; import { ANDROID_KEYS, AMOTION_EVENT, codeMap } from '../core/constants'; import iphoneFrameImage from '../assets/iphone16pro_black_bg.webp'; import pixelFrameImage from '../assets/pixel9_black.webp'; import pixelFrameImageLandscape from '../assets/pixel9_black_landscape.webp'; import pixelTabletFrameImage from '../assets/pixel_tablet_portrait.webp'; import pixelTabletFrameImageLandscape from '../assets/pixel_tablet_landscape.webp'; import iphoneFrameImageLandscape from '../assets/iphone16pro_black_landscape_bg.webp'; import appleLogoSvg from '../assets/Apple_logo_white.svg'; import androidBootImage from '../assets/android_boot.webp'; import { createTouchControlMessage, createInjectKeycodeMessage, createSetClipboardMessage, createTwoFingerTouchControlMessage, } from '../core/webrtc-messages'; import { AxFetcher, AxStatus } from '../core/ax-fetcher'; import { AxElement, AxSnapshot, axElementAtPoint, axSnapshotsEqual } from '../core/ax-tree'; import { InspectOverlay, InspectOverlayGeometry, InspectMode } from './inspect-overlay'; declare global { interface Window { debugRemoteControl?: boolean; } } interface RemoteControlProps { // url is the URL of the instance to connect to. url: string; // token is used to authenticate the connection to the instance. token: string; // className is the class name to apply to the component // on top of the default styles. className?: string; // sessionId is a unique identifier for the WebRTC session // with the source to prevent conflicts between other // users connected to the same source. // If empty, the component will generate a random one. sessionId?: string; // openUrl is the URL to open in the instance when the // component is ready. // // If not provided, the component will not open any URL. 
openUrl?: string; // showFrame controls whether to display the device frame // around the video. Defaults to true. showFrame?: boolean; // When true, drops after a working session auto-reconnect instead of // surfacing the manual "Retry" button. Defaults to false. autoReconnect?: boolean; /** * Enable the inspect overlay. When set, the component starts polling the * accessibility tree and draws boxes over each element on top of the * video stream. * * - `true` — Select mode. Boxes are clickable, click pins a selection * with action buttons (Tap / Copy selector / Copy id), ESC clears. * Device input is blocked while in this mode. * - `'hover-only'` — Boxes follow the cursor as a visual preview. Device * input still passes through, so you can drive the simulator while * inspecting. * - `undefined` / `false` (default) — overlay disabled, no polling. */ inspectMode?: boolean | 'hover-only'; /** * Fires whenever a fresh accessibility snapshot is delivered. * * Customers use this to drive their own side panels, agent prompts, * analytics, etc. The built-in overlay does not require this callback — * it renders from internal state regardless. * * Identical-to-previous snapshots (per `axSnapshotsEqual`) are NOT * re-emitted, so a stable UI doesn't generate callback noise. * * Invoked in a microtask so customer code doesn't run synchronously * inside React's commit phase. */ onAxSnapshotChange?: (snapshot: AxSnapshot | null) => void; /** * Fires when the user clicks an overlay element (only emitted when * `inspectMode === true`). `null` indicates a deselection (ESC, click * outside any box, or programmatic clear). * * The `snapshot` field is the snapshot active at the moment of the * click — useful for capturing context without races against the next * poll cycle. */ onInspectSelectionChange?: (selection: { element: AxElement; snapshot: AxSnapshot } | null) => void; /** * Fires whenever the accessibility subsystem changes coarse-grained * status. 
Useful for rendering readiness indicators or error banners in * a customer-built side panel. * * Transitions are deduplicated; no self-loops are emitted. The `error` * argument is populated when status is `error` or `unavailable`. * * Lifecycle: `idle` → `starting` → `ready` (or `unavailable` / `error`). * Recovery from `error` / `unavailable` is automatic — the fetcher * keeps polling and transitions back to `ready` on the next success. */ onAxStatusChange?: (status: AxStatus, error?: string) => void; /** * Base interval (ms) between successful AX-tree fetches. * * The fetcher will: * - Wait `axPollIntervalMs` after a successful fetch with NEW data. * - Double the wait (up to `axMaxBackoffMs`) when consecutive snapshots * are byte-identical (e.g. static screen). * - Wait 5 s when the server reports AX is unavailable. * * In addition, after user input (taps, scrolls, keypresses, openUrl, * terminateApp, orientation flips), the fetcher enters a short * "activity boost" window (~1.2 s) during which fetches happen at * ~250 ms regardless of this setting. This captures mid-animation UI * changes without you having to manually call `refreshAxTree`. * * @default 500 */ axPollIntervalMs?: number; /** * Maximum backoff (ms) for the AX-tree polling loop when consecutive * snapshots are unchanged. * * @default 2000 */ axMaxBackoffMs?: number; } interface ScreenshotData { dataUri: string; } export interface ImperativeKeyboardEvent { type: 'keydown' | 'keyup'; code: string; // e.g., "KeyA", "Enter", "ShiftLeft" shiftKey?: boolean; altKey?: boolean; ctrlKey?: boolean; metaKey?: boolean; } export interface RemoteControlHandle { openUrl: (url: string) => void; sendKeyEvent: (event: ImperativeKeyboardEvent) => void; screenshot: () => Promise; terminateApp: (bundleId: string) => Promise; reconnect: () => void; // Inspect-mode helpers. These are no-ops when inspect mode is disabled or // the WebSocket isn't open. 
// Force a fresh accessibility-tree fetch outside the normal poll cadence. refreshAxTree: () => Promise; // Pull-based access to the most recent snapshot (the same one passed to // onAxSnapshotChange). Returns null when no snapshot has arrived yet or // when inspect mode is off. getAxSnapshot: () => AxSnapshot | null; // Programmatically drive the overlay highlight/selection — useful when a // customer's own side panel wants to cross-highlight with the overlay. // Pass `null` to clear. setInspectHighlight: (element: AxElement | null) => void; setInspectSelection: (element: AxElement | null) => void; // Pull-based access to the current AX subsystem status. Mirrors what // onAxStatusChange reports, for customers that don't want to subscribe. getAxStatus: () => AxStatus; } const debugLog = (...args: any[]) => { if (window.debugRemoteControl) { console.log(...args); } }; const debugWarn = (...args: any[]) => { if (window.debugRemoteControl) { console.warn(...args); } }; // Invokes a customer-provided callback in isolation. A throw from the // customer's code must NOT propagate back into our state-update flow — that // would risk corrupting React reconciliation. We log the error to the // console so the customer can still debug, but otherwise swallow. const safeInvoke = ( label: string, fn: ((...args: Args) => unknown) | undefined, ...args: Args ): void => { if (!fn) return; try { fn(...args); } catch (err) { // Surface to the developer regardless of debug flag — this is a bug // in the customer's handler and they'll want to see it. // eslint-disable-next-line no-console console.error(`[RemoteControl] customer callback "${label}" threw:`, err); } }; const motionActionToString = (action: number): string => { // AMOTION_EVENT is a constants object; find the matching ACTION_* key if present const match = Object.entries(AMOTION_EVENT).find( ([key, value]) => key.startsWith('ACTION_') && value === action, ); return match?.[0] ?? 
String(action); }; type DevicePlatform = 'ios' | 'android'; const detectPlatform = (url: string): DevicePlatform => { if (url.includes('/android_')) { return 'android'; } // Default to iOS if no Android pattern is found return 'ios'; }; type DeviceConfig = { videoBorderRadiusMultiplier: number; loadingLogo: string; loadingLogoSize: string; videoPosition: { portrait: { heightMultiplier?: number; widthMultiplier?: number }; landscape: { heightMultiplier?: number; widthMultiplier?: number }; }; frame: { image: string; imageLandscape: string; }; }; const ANDROID_TABLET_VIDEO_WIDTH = 1920; const ANDROID_TABLET_VIDEO_HEIGHT = 1200; const MAX_CONNECTION_ATTEMPTS = 3; const CONNECTION_RETRY_DELAY_MS = 1000; const CONNECTION_SUCCESS_TIMEOUT_MS = 15000; const ICE_DISCONNECTED_GRACE_MS = 3000; const isAndroidTabletVideo = (width: number, height: number): boolean => (width === ANDROID_TABLET_VIDEO_WIDTH && height === ANDROID_TABLET_VIDEO_HEIGHT) || (width === ANDROID_TABLET_VIDEO_HEIGHT && height === ANDROID_TABLET_VIDEO_WIDTH); // Device-specific configuration for frame sizing and video positioning // Video position percentages are relative to the frame image dimensions const deviceConfig: Record = { ios: { frame: { image: iphoneFrameImage, imageLandscape: iphoneFrameImageLandscape, }, videoBorderRadiusMultiplier: 0.15, loadingLogo: appleLogoSvg, loadingLogoSize: '20%', // Video position as percentage of frame dimensions videoPosition: { portrait: { heightMultiplier: 0.9678 }, landscape: { widthMultiplier: 0.9678 }, }, }, android: { frame: { image: pixelFrameImage, imageLandscape: pixelFrameImageLandscape, }, videoBorderRadiusMultiplier: 0.13, loadingLogo: androidBootImage, loadingLogoSize: '40%', // Video position as percentage of frame dimensions videoPosition: { portrait: { heightMultiplier: 0.967 }, landscape: { widthMultiplier: 0.962 }, }, }, }; function getAndroidKeycodeAndMeta(event: React.KeyboardEvent): { keycode: number; metaState: number } | null { const code = 
event.code; const keycode = codeMap[code]; if (!keycode) { // Use the wrapper for conditional warning debugWarn(`Unknown event.code: ${code}, key: ${event.key}`); return null; } let metaState = ANDROID_KEYS.META_NONE; const isLetter = code >= 'KeyA' && code <= 'KeyZ'; const isCapsLock = event.getModifierState('CapsLock'); const isShiftPressed = event.shiftKey; // Determine effective shift state let effectiveShift = isShiftPressed; if (isLetter) { effectiveShift = isShiftPressed !== isCapsLock; // Logical XOR for booleans } // Apply meta states if (effectiveShift) metaState |= ANDROID_KEYS.META_SHIFT_ON; if (event.ctrlKey) metaState |= ANDROID_KEYS.META_CTRL_ON; if (event.altKey) metaState |= ANDROID_KEYS.META_ALT_ON; if (event.metaKey) metaState |= ANDROID_KEYS.META_META_ON; // Command on Mac, Windows key on Win return { keycode, metaState }; } export const RemoteControl = forwardRef( ( { className, url, token, sessionId: propSessionId, openUrl, showFrame = true, autoReconnect = false, inspectMode, onAxSnapshotChange, onInspectSelectionChange, onAxStatusChange, axPollIntervalMs, axMaxBackoffMs, }: RemoteControlProps, ref, ) => { const containerRef = useRef(null); const videoRef = useRef(null); const frameRef = useRef(null); const [videoLoaded, setVideoLoaded] = useState(false); const [retryExhausted, setRetryExhausted] = useState(false); const [isLandscape, setIsLandscape] = useState(false); const [useAndroidTabletFrame, setUseAndroidTabletFrame] = useState(false); const [videoStyle, setVideoStyle] = useState({}); const wsRef = useRef(null); const peerConnectionRef = useRef(null); const dataChannelRef = useRef(null); const keepAliveIntervalRef = useRef(undefined); const retryTimeoutRef = useRef(undefined); const connectionSuccessTimeoutRef = useRef(undefined); const requestFrameIntervalRef = useRef(undefined); const iceDisconnectedGraceRef = useRef(undefined); const connectionGenerationRef = useRef(0); const connectionAttemptRef = useRef(0); const 
controlChannelOpenedRef = useRef(false); // Mirrored to a ref so stale closures in event handlers see the latest value. const autoReconnectRef = useRef(autoReconnect); autoReconnectRef.current = autoReconnect; const firstFrameShownRef = useRef(false); const pendingScreenshotResolversRef = useRef< Map) => void> >(new Map()); const pendingScreenshotRejectersRef = useRef void>>(new Map()); const pendingTerminateAppResolversRef = useRef void>>(new Map()); const pendingTerminateAppRejectersRef = useRef void>>(new Map()); // Map to track active pointers for real touch/mouse single-finger events. // Key: pointerId (-1 for mouse, touch.identifier for touch), Value: { x: number, y: number } const activePointers = useRef>(new Map()); // Alt/Option modifier state for pinch emulation. // We use a ref as the source of truth (for synchronous event handler access) // and state only to trigger re-renders for the visual indicators. const isAltHeldRef = useRef(false); const [isAltHeld, setIsAltHeld] = useState(false); // State for any two-finger gesture (Alt+mouse simulated or real two-finger touch). // Tracks positions, video size, source, and pointer IDs (for Android protocol). type TwoFingerState = { finger0: { x: number; y: number }; finger1: { x: number; y: number }; videoSize: { width: number; height: number }; // Track source so we know when to clear (Alt release vs touch end) source: 'alt-mouse' | 'real-touch'; // Pointer IDs for Android (real touch.identifier or simulated -1/-2) pointerId0: number; pointerId1: number; }; const twoFingerStateRef = useRef(null); // Hover point for rendering two-finger indicators when Alt is held. // Only computed/set when Alt is held to avoid unnecessary re-renders. 
type HoverPoint = {
  // Cursor position in CSS pixels relative to the container's top-left
  // corner, clamped to the rendered video area.
  containerX: number;
  containerY: number;
  // The same point mirrored through the centre of the rendered video area
  // (container-relative CSS pixels).
  mirrorContainerX: number;
  mirrorContainerY: number;
  // Cursor position in video-stream pixels, clamped to [0, videoWidth] /
  // [0, videoHeight].
  videoX: number;
  videoY: number;
  // Mirrored position in video-stream pixels:
  // (videoWidth - videoX, videoHeight - videoY).
  mirrorVideoX: number;
  mirrorVideoY: number;
  // Intrinsic size of the video stream the coordinates refer to.
  videoWidth: number;
  videoHeight: number;
};
// Type arguments are spelled out on every hook below: a bare
// `useState(null)` / `useRef(null)` infers the type `null`, so the setter
// (or `.current` assignment) would reject every real value.
const [hoverPoint, setHoverPoint] = useState<HoverPoint | null>(null);

// Inspect-mode state.
//
// Lifecycle of `axFetcherRef`:
//   - Created in dataChannel.onopen (last step of WebRTC handshake)
//     so we know the signaling WS is healthy and the device is
//     responsive to control messages.
//   - Started immediately if `inspectMode` is already enabled, or
//     started later via the sibling useEffect when inspectMode flips on.
//   - Stopped + nulled in teardownConnection (WS close / unmount).
//
// Customers can observe readiness via the `onAxStatusChange` callback:
// `starting` fires when start() runs but no snapshot has landed yet;
// `ready` once the first snapshot arrives. Status falls back to
// `unavailable` / `error` if the server can't satisfy AX requests.
const axFetcherRef = useRef<AxFetcher | null>(null);
const [axSnapshot, setAxSnapshot] = useState<AxSnapshot | null>(null);
// Ids are typed off `AxElement` so they follow whatever id type the AX tree
// declares instead of hard-coding one here.
const [axHighlightedId, setAxHighlightedId] = useState<AxElement['id'] | null>(null);
// NOTE(review): assumed to hold an element id like `axHighlightedId` — the
// writes to this state are outside this excerpt; confirm.
const [axSelectedId, setAxSelectedId] = useState<AxElement['id'] | null>(null);
// NOTE(review): assumed to be the geometry handed to <InspectOverlay> — confirm
// against the render section.
const [overlayGeometry, setOverlayGeometry] = useState<InspectOverlayGeometry | null>(null);

// Viewport-space cursor position used to anchor the inspect InfoCard.
// Throttled to one update per animation frame to avoid React reconciling
// on every native mousemove (~60–120Hz).
const [axCursorPosition, setAxCursorPosition] = useState<{ x: number; y: number } | null>(null);
// Most recently requested cursor position; read when the scheduled frame fires.
const cursorPositionRef = useRef<{ x: number; y: number } | null>(null);
// Handle of the pending requestAnimationFrame callback, or `undefined` when
// none is queued. Typed explicitly: `useRef(undefined)` would infer
// `undefined` and reject the numeric handle assigned below.
const cursorRafIdRef = useRef<number | undefined>(undefined);

/**
 * Records `next` as the latest cursor position and publishes it to React
 * state on the next animation frame.
 *
 * Calls made while a frame is already pending only overwrite the stored
 * position, so at most one `setAxCursorPosition` runs per frame and it always
 * sees the newest value.
 *
 * @param next Viewport-space position, or `null` to clear it.
 */
const scheduleCursorFlush = (next: { x: number; y: number } | null) => {
  cursorPositionRef.current = next;
  if (cursorRafIdRef.current !== undefined) return;
  cursorRafIdRef.current = window.requestAnimationFrame(() => {
    cursorRafIdRef.current = undefined;
    setAxCursorPosition(cursorPositionRef.current);
  });
};

// Position captured at click-time so the InfoCard "freezes" near where
// the user clicked, even as they move the cursor around afterward. The
// action buttons (Tap / Copy) stay reachable because the card no longer
// chases the cursor while the click target is the active selection.
const [axFrozenCursorPosition, setAxFrozenCursorPosition] = useState<{
  x: number;
  y: number;
} | null>(null);

// Mirrors for synchronous access from event handlers without stale closures.
// Each ref is re-assigned on every render so it always holds the latest value.
const inspectModeRef = useRef(inspectMode);
inspectModeRef.current = inspectMode;
// Typed explicitly: `useRef(null)` would infer `null` and the assignment of a
// real snapshot on the next line would not type-check.
const axSnapshotRef = useRef<AxSnapshot | null>(null);
axSnapshotRef.current = axSnapshot;
const onAxSnapshotChangeRef = useRef(onAxSnapshotChange);
onAxSnapshotChangeRef.current = onAxSnapshotChange;
const onInspectSelectionChangeRef = useRef(onInspectSelectionChange);
onInspectSelectionChangeRef.current = onInspectSelectionChange;
const onAxStatusChangeRef = useRef(onAxStatusChange);
onAxStatusChangeRef.current = onAxStatusChange;

// True for both overlay modes (`true` → select, `'hover-only'`).
const inspectActive = inspectMode === true || inspectMode === 'hover-only';
// Overlay mode to render with; anything other than 'hover-only' resolves to 'select'.
const inspectModeResolved: InspectMode = inspectMode === 'hover-only' ? 'hover-only' : 'select';

// Session id: the caller-supplied prop when truthy, otherwise two concatenated
// base-36 random fragments. Recomputed only when the prop changes.
const sessionId = useMemo(
  () =>
    propSessionId ||
    Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15),
  [propSessionId],
);

// Platform is derived from the instance URL; it selects the device config.
const platform = useMemo(() => detectPlatform(url), [url]);
const config = deviceConfig[platform];

/** Reports a status message; currently this only goes to the debug log. */
const updateStatus = (message: string) => {
  // Use the wrapper for conditional logging
  debugLog(message);
};

/**
 * Sends a binary control message over the data channel.
 *
 * The message is silently dropped when the channel does not exist or is not
 * in the `open` state.
 *
 * @param data Encoded control message.
 */
const sendBinaryControlMessage = (data: ArrayBuffer) => {
  if (!dataChannelRef.current || dataChannelRef.current.readyState !== 'open') {
    return;
  }
  dataChannelRef.current.send(data);
  // Any binary control message is an input event. Bump the AX poller so
  // we get a fresh snapshot quickly — the UI almost certainly changed.
  axFetcherRef.current?.bumpActivity();
};

// Pointer ID used by inspect-driven taps. Distinct from human pointers
// (-1 mouse, -2 alt-mirror) and our touch identifiers so they never
// interfere with an in-progress drag.
const AX_TAP_POINTER_ID = -10;

// Send a down+up tap at a viewport-space (clientX/Y) position. The point
// is mapped through the current video letterbox geometry so the
// simulator receives the correct in-stream coordinates regardless of
// how the device frame is sized in the DOM.
const sendTapAtClient = (clientX: number, clientY: number) => {
  const ctx = computeVideoMappingContext();
  if (!ctx) return;
  const geometry = mapClientPointToVideo(ctx, clientX, clientY);
  if (!geometry) return;
  const { videoX, videoY, videoWidth, videoHeight } = geometry;
  const down = createTouchControlMessage(
    AMOTION_EVENT.ACTION_DOWN,
    AX_TAP_POINTER_ID,
    videoWidth,
    videoHeight,
    videoX,
    videoY,
    1.0,
    AMOTION_EVENT.BUTTON_PRIMARY,
    AMOTION_EVENT.BUTTON_PRIMARY,
  );
  if (down) sendBinaryControlMessage(down);
  // Release 60 ms later at the same coordinates, with zero pressure.
  window.setTimeout(() => {
    const up = createTouchControlMessage(
      AMOTION_EVENT.ACTION_UP,
      AX_TAP_POINTER_ID,
      videoWidth,
      videoHeight,
      videoX,
      videoY,
      0,
      AMOTION_EVENT.BUTTON_PRIMARY,
      AMOTION_EVENT.BUTTON_PRIMARY,
    );
    if (up) sendBinaryControlMessage(up);
  }, 60);
};

// Center-of-bounds fallback for programmatic taps when there's no
// user-aimed click position (e.g. customer calls `setInspectSelection`
// followed by their own "tap selected" handler without forwarding a
// pointer position). Maps the element's frame center through the AX
// screen-coordinate space to viewport coords, then delegates to
// sendTapAtClient.
const sendTapAtElementCenter = (element: AxElement, snapshot: AxSnapshot) => {
  const ctx = computeVideoMappingContext();
  if (!ctx) return;
  // A non-positive screen size would make the fractions below meaningless.
  if (snapshot.screen.width <= 0 || snapshot.screen.height <= 0) return;
  const cxAx = element.frame.x + element.frame.width / 2;
  const cyAx = element.frame.y + element.frame.height / 2;
  // AX screen-fraction → in-video pixel offset → viewport client coord.
  const inVideoX = (cxAx / snapshot.screen.width) * ctx.actualWidth;
  const inVideoY = (cyAx / snapshot.screen.height) * ctx.actualHeight;
  const clientX = ctx.videoRect.left + ctx.offsetX + inVideoX;
  const clientY = ctx.videoRect.top + ctx.offsetY + inVideoY;
  sendTapAtClient(clientX, clientY);
};

// Fixed pointer IDs for Alt-simulated two-finger gestures
const ALT_POINTER_ID_PRIMARY = -1;
const ALT_POINTER_ID_MIRROR = -2;

// Helper to send a single-touch control message (used by both single-finger and Android two-finger paths)
const sendSingleTouch = (
  action: number,
  pointerId: number,
  videoWidth: number,
  videoHeight: number,
  x: number,
  y: number,
) => {
  const message = createTouchControlMessage(
    action,
    pointerId,
    videoWidth,
    videoHeight,
    x,
    y,
    1.0, // pressure
    AMOTION_EVENT.BUTTON_PRIMARY,
    AMOTION_EVENT.BUTTON_PRIMARY,
  );
  if (message) {
    debugLog('[rc-touch] sendSingleTouch', {
      action,
      actionName: motionActionToString(action),
      pointerId,
      x,
      y,
      video: { width: videoWidth, height: videoHeight },
    });
    sendBinaryControlMessage(message);
  }
};

// Minimal geometry for single-finger touch events (no mirror/container coords needed).
type PointerGeometry = {
  videoX: number;
  videoY: number;
  videoWidth: number;
  videoHeight: number;
};

/**
 * Translates one pointer transition into a touch control message and keeps
 * `activePointers` in sync.
 *
 * - `down` registers the pointer; the first active pointer sends ACTION_DOWN,
 *   additional ones send ACTION_POINTER_DOWN.
 * - `move` is ignored for pointers that are not currently registered.
 * - `up` / `cancel` are sent at the pointer's last recorded position and
 *   unregister it.
 *
 * @param pointerId -1 for the mouse, otherwise the touch identifier.
 * @param eventType Transition being applied.
 * @param geometry Mapped position for this event; when `null` the call is a no-op.
 */
const applyPointerEvent = (
  pointerId: number,
  eventType: 'down' | 'move' | 'up' | 'cancel',
  geometry: PointerGeometry | null,
) => {
  if (!geometry) return;
  const { videoX, videoY, videoWidth, videoHeight } = geometry;

  let action: number | null = null;
  let positionToSend: { x: number; y: number } | null = null;
  const pressure = 1.0; // Default pressure
  const buttons = AMOTION_EVENT.BUTTON_PRIMARY; // Assume primary button

  switch (eventType) {
    case 'down': {
      // For multi-touch: use ACTION_DOWN for first pointer, ACTION_POINTER_DOWN for additional pointers
      const currentPointerCount = activePointers.current.size;
      action =
        currentPointerCount === 0
          ? AMOTION_EVENT.ACTION_DOWN
          : AMOTION_EVENT.ACTION_POINTER_DOWN;
      positionToSend = { x: videoX, y: videoY };
      activePointers.current.set(pointerId, positionToSend);
      if (pointerId === -1) {
        // Focus on mouse down
        videoRef.current?.focus();
      }
      break;
    }
    case 'move': {
      if (activePointers.current.has(pointerId)) {
        action = AMOTION_EVENT.ACTION_MOVE;
        positionToSend = { x: videoX, y: videoY };
        // Update the last known position for this active pointer
        activePointers.current.set(pointerId, positionToSend);
      }
      break;
    }
    case 'up':
    case 'cancel': {
      // Treat cancel like up, but use ACTION_CANCEL
      if (activePointers.current.has(pointerId)) {
        // IMPORTANT: Send the UP/CANCEL at the *last known position* inside the video
        positionToSend = activePointers.current.get(pointerId)!;
        activePointers.current.delete(pointerId); // Remove pointer as it's no longer active
        if (eventType === 'cancel') {
          action = AMOTION_EVENT.ACTION_CANCEL;
        } else {
          // For multi-touch: use ACTION_UP for last pointer, ACTION_POINTER_UP for non-last pointers
          const remainingPointerCount = activePointers.current.size;
          action =
            remainingPointerCount === 0
              ? AMOTION_EVENT.ACTION_UP
              : AMOTION_EVENT.ACTION_POINTER_UP;
        }
      }
      break;
    }
  }

  // Send message if action and position determined
  if (action !== null && positionToSend !== null) {
    debugLog('[rc-touch][mouse->touch] sending', {
      pointerId,
      eventType,
      action,
      actionName: motionActionToString(action),
      positionToSend,
      video: { width: videoWidth, height: videoHeight },
      altHeld: isAltHeldRef.current,
      activePointersAfter: Array.from(activePointers.current.entries()).map(([id, pos]) => ({
        id,
        x: pos.x,
        y: pos.y,
      })),
    });
    const message = createTouchControlMessage(
      action,
      pointerId,
      videoWidth,
      videoHeight,
      positionToSend.x,
      positionToSend.y,
      pressure,
      buttons,
      buttons,
    );
    if (message) {
      debugLog('[rc-touch][mouse->touch] buffer', {
        pointerId,
        actionName: motionActionToString(action),
        byteLength: message.byteLength,
      });
      sendBinaryControlMessage(message);
    }
  } else if (eventType === 'up' || eventType === 'cancel') {
    // Nothing was sent; make sure the pointer is not left registered.
    activePointers.current.delete(pointerId);
  }
};

// Update Alt modifier state. Only iOS Simulator uses Indigo modifier injection.
const updateAltHeld = (nextHeld: boolean) => {
  // No transition → nothing to do (also avoids sending duplicate key events).
  if (isAltHeldRef.current === nextHeld) {
    return;
  }
  isAltHeldRef.current = nextHeld;
  setIsAltHeld(nextHeld);
  // Clear hover point when Alt is released to hide indicators immediately.
  if (!nextHeld) {
    setHoverPoint(null);
  }
  debugLog('[rc-touch][alt] updateAltHeld', {
    nextHeld,
    activePointerIds: Array.from(activePointers.current.keys()),
  });
  // iOS Simulator pinch (Option/Alt+drag) behavior depends on the Option modifier being
  // active on the Indigo HID side. Send Alt key down/up immediately on toggle so the
  // sequence matches Simulator.app (Alt down -> mouse down/drag -> mouse up -> Alt up).
  // This is iOS-specific; Android doesn't use this modifier injection.
  if (platform === 'ios' && dataChannelRef.current && dataChannelRef.current.readyState === 'open') {
    const action = nextHeld ? ANDROID_KEYS.ACTION_DOWN : ANDROID_KEYS.ACTION_UP;
    const message = createInjectKeycodeMessage(
      action,
      ANDROID_KEYS.KEYCODE_ALT_LEFT,
      0,
      ANDROID_KEYS.META_NONE,
    );
    debugLog('[rc-touch][alt] sending Indigo modifier keycode', {
      action,
      keycode: ANDROID_KEYS.KEYCODE_ALT_LEFT,
    });
    if (message) {
      sendBinaryControlMessage(message);
    }
  }
};

// Mapping context computed once per DOM event, then reused for each pointer.
type VideoMappingContext = {
  // Intrinsic size of the video stream (HTMLVideoElement.videoWidth/Height).
  videoWidth: number;
  videoHeight: number;
  // Bounding client rects of the <video> element and of the container.
  videoRect: DOMRect;
  containerRect: DOMRect;
  // Size, in CSS pixels, of the aspect-ratio-preserving picture area centred
  // inside the video element's box.
  actualWidth: number;
  actualHeight: number;
  // Margins between the element's box and that picture area.
  offsetX: number;
  offsetY: number;
};

/**
 * Compute mapping context from current video/container state (once per event).
 *
 * NOTE(review): the letterbox maths assumes the picture is rendered
 * aspect-fit and centred inside the element (object-fit: contain style) —
 * the CSS is not visible here; confirm.
 *
 * @returns The context, or `null` when the video/container is not mounted,
 *   the stream has no intrinsic size yet, or the element currently has no
 *   on-screen size.
 */
const computeVideoMappingContext = (): VideoMappingContext | null => {
  const video = videoRef.current;
  const container = containerRef.current;
  if (!video || !container) return null;

  const videoWidth = video.videoWidth;
  const videoHeight = video.videoHeight;
  if (!videoWidth || !videoHeight) return null;

  const videoRect = video.getBoundingClientRect();
  const containerRect = container.getBoundingClientRect();
  const displayWidth = videoRect.width;
  const displayHeight = videoRect.height;
  // A collapsed/hidden element would give a zero-sized picture area; every
  // consumer divides by actualWidth/actualHeight, which would turn pointer
  // coordinates into NaN. Report "no mapping available" instead.
  if (displayWidth <= 0 || displayHeight <= 0) return null;

  const videoAspectRatio = videoWidth / videoHeight;
  const containerAspectRatio = displayWidth / displayHeight;
  let actualWidth = displayWidth;
  let actualHeight = displayHeight;
  if (videoAspectRatio > containerAspectRatio) {
    // Stream is wider than the box: full width, bars above and below.
    actualHeight = displayWidth / videoAspectRatio;
  } else {
    // Stream is taller than (or matches) the box: full height, bars left and right.
    actualWidth = displayHeight * videoAspectRatio;
  }
  const offsetX = (displayWidth - actualWidth) / 2;
  const offsetY = (displayHeight - actualHeight) / 2;

  return {
    videoWidth,
    videoHeight,
    videoRect,
    containerRect,
    actualWidth,
    actualHeight,
    offsetX,
    offsetY,
  };
};

// Map a client point to video coordinates using a pre-computed context,
// clamping points outside the rendered video to the nearest point on the video.
const mapClientPointToVideo = (
  ctx: VideoMappingContext,
  clientX: number,
  clientY: number,
): PointerGeometry | null => {
  const projected = projectClientPointOntoVideo(ctx, clientX, clientY);
  if (!projected) return null;
  return {
    videoX: projected.videoX,
    videoY: projected.videoY,
    videoWidth: ctx.videoWidth,
    videoHeight: ctx.videoHeight,
  };
};

/**
 * Shared projection used by `mapClientPointToVideo` and
 * `computeFullHoverPoint`.
 *
 * Converts a viewport point to (a) an offset inside the rendered picture
 * area, clamped to that area, and (b) the corresponding video-stream pixel,
 * clamped to the stream size.
 *
 * @returns `null` when the picture area has no positive size — dividing by it
 *   would otherwise produce NaN coordinates.
 */
const projectClientPointOntoVideo = (
  ctx: VideoMappingContext,
  clientX: number,
  clientY: number,
): { clampedRelativeX: number; clampedRelativeY: number; videoX: number; videoY: number } | null => {
  if (!(ctx.actualWidth > 0) || !(ctx.actualHeight > 0)) return null;

  const relativeX = clientX - ctx.videoRect.left - ctx.offsetX;
  const relativeY = clientY - ctx.videoRect.top - ctx.offsetY;
  const clampedRelativeX = Math.max(0, Math.min(ctx.actualWidth, relativeX));
  const clampedRelativeY = Math.max(0, Math.min(ctx.actualHeight, relativeY));
  const videoX = Math.max(
    0,
    Math.min(ctx.videoWidth, (clampedRelativeX / ctx.actualWidth) * ctx.videoWidth),
  );
  const videoY = Math.max(
    0,
    Math.min(ctx.videoHeight, (clampedRelativeY / ctx.actualHeight) * ctx.videoHeight),
  );
  return { clampedRelativeX, clampedRelativeY, videoX, videoY };
};

// Compute full hover point with mirror/container coordinates (for Alt indicator rendering),
// clamping points outside the rendered video to the nearest point on the video.
// Returns null when the mapping context has a zero-sized picture area.
const computeFullHoverPoint = (
  ctx: VideoMappingContext,
  clientX: number,
  clientY: number,
): HoverPoint | null => {
  const projected = projectClientPointOntoVideo(ctx, clientX, clientY);
  if (!projected) return null;
  const { clampedRelativeX, clampedRelativeY, videoX, videoY } = projected;

  // Mirror through the centre of the stream / picture area.
  const mirrorVideoX = ctx.videoWidth - videoX;
  const mirrorVideoY = ctx.videoHeight - videoY;

  // Top-left of the picture area in viewport coordinates.
  const contentLeft = ctx.videoRect.left + ctx.offsetX;
  const contentTop = ctx.videoRect.top + ctx.offsetY;

  // Re-express both points relative to the container's top-left corner.
  const containerX = contentLeft - ctx.containerRect.left + clampedRelativeX;
  const containerY = contentTop - ctx.containerRect.top + clampedRelativeY;
  const mirrorContainerX = contentLeft - ctx.containerRect.left + (ctx.actualWidth - clampedRelativeX);
  const mirrorContainerY = contentTop - ctx.containerRect.top + (ctx.actualHeight - clampedRelativeY);

  return {
    containerX,
    containerY,
    mirrorContainerX,
    mirrorContainerY,
    videoX,
    videoY,
    mirrorVideoX,
    mirrorVideoY,
    videoWidth: ctx.videoWidth,
    videoHeight: ctx.videoHeight,
  };
};

// Helper to send a two-finger touch message (iOS-specific type=18 message).
const sendTwoFingerMessage = (
  action: number,
  videoWidth: number,
  videoHeight: number,
  x0: number,
  y0: number,
  x1: number,
  y1: number,
) => {
  const msg = createTwoFingerTouchControlMessage(action, videoWidth, videoHeight, x0, y0, x1, y1);
  debugLog('[rc-touch2] sendTwoFingerMessage (iOS)', {
    actionName: motionActionToString(action),
    video: { width: videoWidth, height: videoHeight },
    p0: { x: x0, y: y0 },
    p1: { x: x1, y: y1 },
    byteLength: msg.byteLength,
  });
  sendBinaryControlMessage(msg);
};

// Generic two-finger event handler - sends platform-appropriate messages.
// iOS: uses special two-finger message (type=18)
// Android: sends two separate single-touch messages with proper action sequencing
const applyTwoFingerEvent = (
  eventType: 'down' | 'move' | 'up',
  videoWidth: number,
  videoHeight: number,
  x0: number,
  y0: number,
  x1: number,
  y1: number,
  pointerId0: number,
  pointerId1: number,
) => {
  debugLog('[rc-touch2] applyTwoFingerEvent', {
    platform,
    eventType,
    video: { width: videoWidth, height: videoHeight },
    p0: { x: x0, y: y0, id: pointerId0 },
    p1: { x: x1, y: y1, id: pointerId1 },
  });

  if (platform === 'ios') {
    // iOS: use special two-finger message (type=18)
    const action =
      eventType === 'down'
        ? AMOTION_EVENT.ACTION_DOWN
        : eventType === 'move'
          ? AMOTION_EVENT.ACTION_MOVE
          : AMOTION_EVENT.ACTION_UP;
    sendTwoFingerMessage(action, videoWidth, videoHeight, x0, y0, x1, y1);
    return;
  }

  // Android: send two separate single-touch messages with proper action codes
  // Per scrcpy protocol, each finger is a separate INJECT_TOUCH_EVENT with unique pointerId
  if (eventType === 'down') {
    // First finger down (ACTION_DOWN), then second finger down (ACTION_POINTER_DOWN)
    sendSingleTouch(AMOTION_EVENT.ACTION_DOWN, pointerId0, videoWidth, videoHeight, x0, y0);
    sendSingleTouch(AMOTION_EVENT.ACTION_POINTER_DOWN, pointerId1, videoWidth, videoHeight, x1, y1);
  } else if (eventType === 'move') {
    // Both fingers move (ACTION_MOVE for each)
    sendSingleTouch(AMOTION_EVENT.ACTION_MOVE, pointerId0, videoWidth, videoHeight, x0, y0);
    sendSingleTouch(AMOTION_EVENT.ACTION_MOVE, pointerId1, videoWidth, videoHeight, x1, y1);
  } else {
    // Second finger up (ACTION_POINTER_UP), then first finger up (ACTION_UP)
    sendSingleTouch(AMOTION_EVENT.ACTION_POINTER_UP, pointerId1, videoWidth, videoHeight, x1, y1);
    sendSingleTouch(AMOTION_EVENT.ACTION_UP, pointerId0, videoWidth, videoHeight, x0, y0);
  }
};

// Update hover point only when Alt is held (to avoid re-renders in normal path).
const updateHoverPoint = (ctx: VideoMappingContext, clientX: number, clientY: number) => {
  if (!isAltHeldRef.current) {
    // Don't compute or update when Alt isn't held
    if (hoverPoint !== null) {
      setHoverPoint(null);
    }
    return;
  }
  const fullPoint = computeFullHoverPoint(ctx, clientX, clientY);
  setHoverPoint(fullPoint);
};

// Map clientX/Y to AX screen-coordinate space using the latest snapshot.
// Returns null if there's no snapshot or the click is outside the video.
const hitTestAxAtClient = (
  ctx: VideoMappingContext,
  clientX: number,
  clientY: number,
): AxElement | null => {
  const snapshot = axSnapshotRef.current;
  if (!snapshot || snapshot.screen.width <= 0 || snapshot.screen.height <= 0) return null;
  // A zero-sized rendered video would turn the normalisation below into 0/0.
  if (ctx.actualWidth <= 0 || ctx.actualHeight <= 0) return null;

  // Pointer position relative to the top-left corner of the rendered video content.
  const relX = clientX - ctx.videoRect.left - ctx.offsetX;
  const relY = clientY - ctx.videoRect.top - ctx.offsetY;
  if (relX < 0 || relY < 0 || relX > ctx.actualWidth || relY > ctx.actualHeight) return null;

  // Scale rendered pixels into the snapshot's screen coordinate space.
  const axX = (relX / ctx.actualWidth) * snapshot.screen.width;
  const axY = (relY / ctx.actualHeight) * snapshot.screen.height;
  return axElementAtPoint(snapshot, axX, axY);
};

// Unified handler for both mouse and touch interactions.
//
// Order of operations per event:
//   1. inspect-mode bookkeeping (cursor position + highlighted element),
//   2. early exit when select mode (`inspectMode === true`) blocks device input,
//   3. Alt hover-point maintenance for mouse events,
//   4. translation into device touch messages (requires an open data channel,
//      a video element and a valid mapping context).
const handleInteraction = (event: React.MouseEvent | React.TouchEvent) => {
  event.preventDefault();
  event.stopPropagation();

  // Compute mapping context once per event (reused for all pointers)
  const ctx = computeVideoMappingContext();

  // Inspect-mode handling.
  //
  // We use JS hit-testing (not box-level onMouseEnter/Leave) as the
  // single source of truth for which element is under the cursor — it
  // handles overlapping rectangles deterministically by picking the
  // smallest matching box. The overlay's InspectBox children no longer
  // attach hover handlers; they just paint themselves based on the
  // `highlightedId` prop driven from here.
  //
  // Cursor position is tracked in both modes so the cursor-anchored
  // InfoCard can follow the pointer.
  const isInspecting = inspectModeRef.current === true || inspectModeRef.current === 'hover-only';
  if (isInspecting && !('touches' in event)) {
    if (event.type === 'mousemove') {
      scheduleCursorFlush({ x: event.clientX, y: event.clientY });
      if (ctx) {
        const hit = hitTestAxAtClient(ctx, event.clientX, event.clientY);
        setAxHighlightedId(hit?.id ?? null);
      }
    } else if (event.type === 'mouseleave') {
      scheduleCursorFlush(null);
      setAxHighlightedId(null);
    }
  }

  // Select mode blocks device input — clicks/drags don't reach the
  // simulator. Hover-only mode falls through to the regular path.
  if (inspectModeRef.current === true) {
    return;
  }

  // Handle hover point updates for mouse events (only when Alt is held)
  if (!('touches' in event) && ctx) {
    if (event.type === 'mousemove') {
      updateHoverPoint(ctx, event.clientX, event.clientY);
    } else if (event.type === 'mouseleave') {
      setHoverPoint(null);
    }
    // Note: Alt state is tracked via global keydown/keyup listeners, not event.altKey,
    // to ensure consistent behavior across focus transitions.
  }

  // Nothing below can be delivered without an open control channel and a
  // usable client→video mapping.
  if (
    !dataChannelRef.current ||
    dataChannelRef.current.readyState !== 'open' ||
    !videoRef.current ||
    !ctx
  ) {
    return;
  }

  // --- Event Type Handling ---
  if ('touches' in event) {
    // Touch Events - handle both single-finger and two-finger gestures
    const allTouches = event.touches; // All currently active touches
    const changedTouches = event.changedTouches;
    let eventType: 'down' | 'move' | 'up' | 'cancel';
    switch (event.type) {
      case 'touchstart':
        eventType = 'down';
        break;
      case 'touchmove':
        eventType = 'move';
        break;
      case 'touchend':
        eventType = 'up';
        break;
      case 'touchcancel':
        eventType = 'cancel';
        break;
      default:
        return;
    }

    // Check if we have exactly 2 active touches - route to two-finger logic
    if (allTouches.length === 2) {
      const t0 = allTouches[0];
      const t1 = allTouches[1];
      const g0 = mapClientPointToVideo(ctx, t0.clientX, t0.clientY);
      const g1 = mapClientPointToVideo(ctx, t1.clientX, t1.clientY);
      if (!g0 || !g1) return;

      if (!twoFingerStateRef.current) {
        // Starting a new two-finger gesture
        twoFingerStateRef.current = {
          finger0: { x: g0.videoX, y: g0.videoY },
          finger1: { x: g1.videoX, y: g1.videoY },
          videoSize: { width: g0.videoWidth, height: g0.videoHeight },
          source: 'real-touch',
          pointerId0: t0.identifier,
          pointerId1: t1.identifier,
        };
        applyTwoFingerEvent(
          'down',
          g0.videoWidth,
          g0.videoHeight,
          g0.videoX,
          g0.videoY,
          g1.videoX,
          g1.videoY,
          t0.identifier,
          t1.identifier,
        );
      } else if (twoFingerStateRef.current.source === 'real-touch') {
        // Continuing two-finger gesture (move). The pointer ids recorded at
        // gesture start are reused so the device sees a consistent pair.
        twoFingerStateRef.current.finger0 = { x: g0.videoX, y: g0.videoY };
        twoFingerStateRef.current.finger1 = { x: g1.videoX, y: g1.videoY };
        applyTwoFingerEvent(
          'move',
          g0.videoWidth,
          g0.videoHeight,
          g0.videoX,
          g0.videoY,
          g1.videoX,
          g1.videoY,
          twoFingerStateRef.current.pointerId0,
          twoFingerStateRef.current.pointerId1,
        );
      }
    } else if (allTouches.length < 2 && twoFingerStateRef.current?.source === 'real-touch') {
      // Finger lifted - end two-finger gesture using last known state
      const state = twoFingerStateRef.current;
      applyTwoFingerEvent(
        'up',
        state.videoSize.width,
        state.videoSize.height,
        state.finger0.x,
        state.finger0.y,
        state.finger1.x,
        state.finger1.y,
        state.pointerId0,
        state.pointerId1,
      );
      twoFingerStateRef.current = null;
      // Don't process remaining finger - gesture ended
      return;
    } else if (allTouches.length > 2) {
      // 3+ fingers - not supported, ignore
      return;
    } else {
      // Single finger touch (allTouches is 0 or 1)
      // Note: allTouches=0 happens on touchend when last finger lifts
      const touch = changedTouches[0];
      if (touch) {
        const geometry = mapClientPointToVideo(ctx, touch.clientX, touch.clientY);
        applyPointerEvent(touch.identifier, eventType, geometry);
      }
    }
  } else {
    // Mouse Events
    const pointerId = -1; // Primary mouse pointer
    let eventType: 'down' | 'move' | 'up' | 'cancel' | null = null;

    // Determine if we're in two-finger mode (Alt+mouse drag)
    const inTwoFingerMode = twoFingerStateRef.current?.source === 'alt-mouse';

    switch (event.type) {
      case 'mousedown':
        if (event.button === 0) eventType = 'down';
        break;
      case 'mousemove':
        // Process move if either in two-finger mode or has active pointer (normal drag)
        if (inTwoFingerMode || activePointers.current.has(pointerId)) {
          eventType = 'move';
        }
        break;
      case 'mouseup':
        if (event.button === 0) eventType = 'up';
        break;
      case 'mouseleave':
        // Treat leave like up only if in drag/two-finger mode
        if (inTwoFingerMode || activePointers.current.has(pointerId)) {
          eventType = 'up';
        }
        break;
    }

    if (eventType) {
      const geometry = mapClientPointToVideo(ctx, event.clientX, event.clientY);
      if (!geometry) {
        return;
      }

      debugLog('[rc-touch][mouse] event', {
        domType: event.type,
        eventType,
        button: event.button,
        buttons: (event as React.MouseEvent).buttons,
        client: { x: event.clientX, y: event.clientY },
        altHeldRef: isAltHeldRef.current,
        inTwoFingerMode,
        geometry: {
          videoX: geometry.videoX,
          videoY: geometry.videoY,
          videoWidth: geometry.videoWidth,
          videoHeight: geometry.videoHeight,
        },
        activePointerIds: Array.from(activePointers.current.keys()),
      });

      // Route to two-finger (Alt+mouse) or single-finger path.
      //
      // A single-finger drag that is already in flight stays on the
      // single-finger path even if Alt gets pressed mid-drag. Otherwise its
      // 'move'/'up' would be handed to handleAltMouseGesture, which ignores
      // them when no Alt gesture is active, and this handler would never
      // release the pointer.
      const continuingSingleDrag =
        eventType !== 'down' && !inTwoFingerMode && activePointers.current.has(pointerId);
      if (inTwoFingerMode || (isAltHeldRef.current && !continuingSingleDrag)) {
        // Two-finger mode - Alt simulates second finger at mirror position
        handleAltMouseGesture(eventType, geometry);
      } else {
        // Normal single-finger touch
        applyPointerEvent(pointerId, eventType, geometry);
      }
    }
  }
};

// Handle Alt+mouse gestures (simulated two-finger with mirror position).
// Works on both iOS and Android - applyTwoFingerEvent handles platform differences.
//
// The second finger is placed at the point-mirror of the cursor through the
// centre of the video (videoWidth - x, videoHeight - y). 'move' and
// 'up'/'cancel' are ignored unless an Alt gesture is currently active.
const handleAltMouseGesture = (
  eventType: 'down' | 'move' | 'up' | 'cancel',
  geometry: PointerGeometry,
) => {
  const { videoX, videoY, videoWidth, videoHeight } = geometry;
  const mirrorX = videoWidth - videoX;
  const mirrorY = videoHeight - videoY;

  if (eventType === 'down') {
    // Start two-finger gesture
    twoFingerStateRef.current = {
      finger0: { x: videoX, y: videoY },
      finger1: { x: mirrorX, y: mirrorY },
      videoSize: { width: videoWidth, height: videoHeight },
      source: 'alt-mouse',
      pointerId0: ALT_POINTER_ID_PRIMARY,
      pointerId1: ALT_POINTER_ID_MIRROR,
    };
    videoRef.current?.focus();
    applyTwoFingerEvent(
      'down',
      videoWidth,
      videoHeight,
      videoX,
      videoY,
      mirrorX,
      mirrorY,
      ALT_POINTER_ID_PRIMARY,
      ALT_POINTER_ID_MIRROR,
    );
    return;
  }

  if (eventType === 'move') {
    if (twoFingerStateRef.current?.source === 'alt-mouse') {
      // Update positions
      twoFingerStateRef.current.finger0 = { x: videoX, y: videoY };
      twoFingerStateRef.current.finger1 = { x: mirrorX, y: mirrorY };
      applyTwoFingerEvent(
        'move',
        videoWidth,
        videoHeight,
        videoX,
        videoY,
        mirrorX,
        mirrorY,
        ALT_POINTER_ID_PRIMARY,
        ALT_POINTER_ID_MIRROR,
      );
    }
    return;
  }

  if (eventType === 'up' || eventType === 'cancel') {
    const state = twoFingerStateRef.current;
    if (state?.source === 'alt-mouse') {
      // End gesture at last known positions
      const { finger0, finger1, videoSize } = state;
      applyTwoFingerEvent(
        'up',
        videoSize.width,
        videoSize.height,
        finger0.x,
        finger0.y,
        finger1.x,
        finger1.y,
        ALT_POINTER_ID_PRIMARY,
        ALT_POINTER_ID_MIRROR,
      );
      twoFingerStateRef.current = null;
    }
    return;
  }
};

// Track the Alt key globally for the lifetime of the component. Window blur
// also clears the flag, because the matching keyup may never be delivered
// once the window loses focus.
useEffect(() => {
  const handleKeyDown = (event: KeyboardEvent) => {
    if (event.key === 'Alt') {
      updateAltHeld(true);
    }
  };
  const handleKeyUp = (event: KeyboardEvent) => {
    if (event.key === 'Alt') {
      updateAltHeld(false);
    }
  };
  const handleWindowBlur = () => {
    updateAltHeld(false);
  };

  // Use capture phase so these fire before handleKeyboard's stopPropagation
  window.addEventListener('keydown', handleKeyDown, true);
  window.addEventListener('keyup', handleKeyUp, true);
  window.addEventListener('blur', handleWindowBlur);

  return () => {
    window.removeEventListener('keydown', handleKeyDown, true);
    window.removeEventListener('keyup', handleKeyUp, true);
    window.removeEventListener('blur', handleWindowBlur);
  };
}, []);

// Forwards keyboard events to the device as key-injection messages.
//
// Events are always consumed (preventDefault + stopPropagation). They are
// only forwarded while the video element has focus and the control channel
// is open. Cmd/Ctrl+V and Cmd/Ctrl+M are intercepted on keydown as paste and
// Menu shortcuts; everything else goes through getAndroidKeycodeAndMeta.
const handleKeyboard = (event: React.KeyboardEvent) => {
  event.preventDefault();
  event.stopPropagation();

  // Use the wrapper for conditional logging
  debugLog('Keyboard event:', {
    type: event.type,
    key: event.key,
    keyCode: event.keyCode,
    code: event.code,
    target: (event.target as HTMLElement).tagName,
    focused: document.activeElement === videoRef.current,
  });

  if (document.activeElement !== videoRef.current) {
    // Use the wrapper for conditional warning
    debugWarn('Video element not focused, skipping keyboard event');
    return;
  }

  if (!dataChannelRef.current || dataChannelRef.current.readyState !== 'open') {
    // Use the wrapper for conditional warning
    debugWarn('Data channel not ready for keyboard event:', dataChannelRef.current?.readyState);
    return;
  }

  // Handle special shortcuts first (Paste, Menu)
  if (event.type === 'keydown') {
    // Paste (Cmd+V / Ctrl+V)
    if (event.key.toLowerCase() === 'v' && (event.metaKey || event.ctrlKey)) {
      debugLog('Paste shortcut detected');
      navigator.clipboard
        .readText()
        .then((text) => {
          if (text) {
            debugLog(
              'Pasting text via SET_CLIPBOARD:',
              text.substring(0, 20) + (text.length > 20 ? '...' : ''),
            );
            const message = createSetClipboardMessage(text, true); // paste=true
            sendBinaryControlMessage(message);
          }
        })
        .catch((err) => {
          console.error('Failed to read clipboard contents: ', err);
        });
      return; // Don't process 'v' keycode further
    }

    // Menu (Cmd+M / Ctrl+M) - Send down and up immediately
    if (event.key.toLowerCase() === 'm' && (event.metaKey || event.ctrlKey)) {
      debugLog('Menu shortcut detected');
      const messageDown = createInjectKeycodeMessage(
        ANDROID_KEYS.ACTION_DOWN,
        ANDROID_KEYS.MENU,
        0,
        ANDROID_KEYS.META_NONE, // Modifiers are handled by the shortcut check, not passed down
      );
      sendBinaryControlMessage(messageDown);
      const messageUp = createInjectKeycodeMessage(
        ANDROID_KEYS.ACTION_UP,
        ANDROID_KEYS.MENU,
        0,
        ANDROID_KEYS.META_NONE,
      );
      sendBinaryControlMessage(messageUp);
      return; // Don't process 'm' keycode further
    }
  }

  // Handle general key presses (including Arrows, Enter, Backspace, Delete, Letters, Numbers, Symbols)
  const keyInfo = getAndroidKeycodeAndMeta(event);
  if (keyInfo) {
    const { keycode, metaState } = keyInfo;
    const action = event.type === 'keydown' ? ANDROID_KEYS.ACTION_DOWN : ANDROID_KEYS.ACTION_UP;
    debugLog(`Sending Keycode: key=${event.key}, code=${keycode}, action=${action}, meta=${metaState}`);
    const message = createInjectKeycodeMessage(
      action,
      keycode,
      0, // repeat count, typically 0 for single presses
      metaState,
    );
    sendBinaryControlMessage(message);
  } else {
    debugLog(`Ignoring unhandled key event: type=${event.type}, key=${event.key}`);
  }
};

// Sends one keepAlive message over the signalling WebSocket, if it is open.
const sendKeepAlive = () => {
  if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
    wsRef.current.send(
      JSON.stringify({
        type: 'keepAlive',
        sessionId: sessionId,
      }),
    );
  }
};

// (Re)starts the 10-second keepAlive interval, replacing any running one.
const startKeepAlive = () => {
  if (keepAliveIntervalRef.current) {
    window.clearInterval(keepAliveIntervalRef.current);
  }
  keepAliveIntervalRef.current = window.setInterval(sendKeepAlive, 10000);
};

// Stops the keepAlive interval, if one is running.
const stopKeepAlive = () => {
  if (keepAliveIntervalRef.current) {
    window.clearInterval(keepAliveIntervalRef.current);
    keepAliveIntervalRef.current = undefined;
  }
};

// Cancels a pending retry timer scheduled by scheduleRetry.
const clearScheduledRetry = () => {
  if (retryTimeoutRef.current) {
    window.clearTimeout(retryTimeoutRef.current);
    retryTimeoutRef.current = undefined;
  }
};

// Cancels the "connection did not succeed in time" watchdog of the current attempt.
const clearConnectionSuccessTimeout = () => {
  if (connectionSuccessTimeoutRef.current) {
    window.clearTimeout(connectionSuccessTimeoutRef.current);
    connectionSuccessTimeoutRef.current = undefined;
  }
};

// Stops the periodic requestFrame loop started when the control channel opens.
const stopRequestFrameLoop = () => {
  if (requestFrameIntervalRef.current) {
    window.clearInterval(requestFrameIntervalRef.current);
    requestFrameIntervalRef.current = undefined;
  }
};

// Cancels the grace timer armed while ICE is in the 'disconnected' state.
const clearIceDisconnectedGrace = () => {
  if (iceDisconnectedGraceRef.current !== undefined) {
    window.clearTimeout(iceDisconnectedGraceRef.current);
    iceDisconnectedGraceRef.current = undefined;
  }
};

// Marks the first video frame as shown (idempotent): stops requesting frames
// and flips the videoLoaded state.
const markFirstFrameShown = () => {
  if (firstFrameShownRef.current) {
    return;
  }
  firstFrameShownRef.current = true;
  stopRequestFrameLoop();
  setVideoLoaded(true);
};

// Releases everything owned by the current connection: timers, the AX
// fetcher, the WebSocket, the peer connection, the video source and the data
// channel. Handlers are detached before closing so that closing does not
// re-enter the retry logic through onclose/onerror callbacks.
const teardownConnection = () => {
  clearConnectionSuccessTimeout();
  clearIceDisconnectedGrace();
  stopRequestFrameLoop();

  if (axFetcherRef.current) {
    axFetcherRef.current.stop();
    axFetcherRef.current = null;
  }

  // A scheduled cursor flush would otherwise call setState on a
  // torn-down component once the next frame runs.
  if (cursorRafIdRef.current !== undefined) {
    window.cancelAnimationFrame(cursorRafIdRef.current);
    cursorRafIdRef.current = undefined;
  }

  if (wsRef.current) {
    wsRef.current.onopen = null;
    wsRef.current.onmessage = null;
    wsRef.current.onerror = null;
    wsRef.current.onclose = null;
    wsRef.current.close();
    wsRef.current = null;
  }

  if (peerConnectionRef.current) {
    peerConnectionRef.current.onconnectionstatechange = null;
    peerConnectionRef.current.oniceconnectionstatechange = null;
    peerConnectionRef.current.ontrack = null;
    peerConnectionRef.current.onicecandidate = null;
    peerConnectionRef.current.close();
    peerConnectionRef.current = null;
  }

  if (videoRef.current) {
    videoRef.current.srcObject = null;
  }

  if (dataChannelRef.current) {
    dataChannelRef.current.onopen = null;
    dataChannelRef.current.onclose = null;
    dataChannelRef.current.onerror = null;
    dataChannelRef.current.close();
    dataChannelRef.current = null;
  }
};

// Pauses keepAlive messages while the page is hidden and resumes them when it
// becomes visible again.
const handleVisibilityChange = () => {
  if (document.hidden) {
    stopKeepAlive();
  } else {
    startKeepAlive();
  }
};

// Decides what happens after the attempt identified by `generation` failed.
//
// - Calls from a superseded generation are ignored.
// - If the control channel had already opened (a working session dropped),
//   the failure is surfaced for manual retry unless autoReconnect is enabled,
//   in which case the attempt budget is reset.
// - Otherwise the next attempt is scheduled after CONNECTION_RETRY_DELAY_MS,
//   or the failure is surfaced once MAX_CONNECTION_ATTEMPTS is reached.
const scheduleRetry = (reason: string, generation: number) => {
  if (generation !== connectionGenerationRef.current) {
    return;
  }

  if (controlChannelOpenedRef.current) {
    if (!autoReconnectRef.current) {
      updateStatus(`Connection failed after it was established: ${reason}`);
      setRetryExhausted(true);
      teardownConnection();
      return;
    }
    // Reset so the upcoming retry gets a fresh MAX_CONNECTION_ATTEMPTS budget.
    updateStatus(`Reconnecting after established session dropped: ${reason}`);
    controlChannelOpenedRef.current = false;
    connectionAttemptRef.current = -1;
  }

  clearScheduledRetry();

  const nextAttempt = connectionAttemptRef.current + 1;
  if (nextAttempt >= MAX_CONNECTION_ATTEMPTS) {
    updateStatus(`Connection failed after ${MAX_CONNECTION_ATTEMPTS} attempts: ${reason}`);
    setRetryExhausted(true);
    teardownConnection();
    return;
  }

  updateStatus(`Retrying connection (${nextAttempt + 1}/${MAX_CONNECTION_ATTEMPTS})`);
  teardownConnection();
  retryTimeoutRef.current = window.setTimeout(() => {
    retryTimeoutRef.current = undefined;
    // stop() or a newer attempt may have bumped the generation while waiting.
    if (generation !== connectionGenerationRef.current) {
      return;
    }
    void startAttempt(nextAttempt);
  }, CONNECTION_RETRY_DELAY_MS);
};

// Runs one full connection attempt: signalling WebSocket → RTC configuration
// → peer connection + negotiated control channel → offer.
//
// Every attempt bumps connectionGenerationRef. Each callback and each
// continuation after an `await` re-checks the generation (and the identity
// of the ws / peer connection / data channel it was created for) so that a
// superseded attempt can never touch the state of a newer one.
const startAttempt = async (attemptNumber = 0) => {
  const generation = connectionGenerationRef.current + 1;
  connectionGenerationRef.current = generation;
  connectionAttemptRef.current = attemptNumber;
  controlChannelOpenedRef.current = false;
  setRetryExhausted(false);
  clearScheduledRetry();
  clearConnectionSuccessTimeout();
  stopRequestFrameLoop();
  firstFrameShownRef.current = false;
  setVideoLoaded(false);
  teardownConnection();

  const isCurrentAttempt = () => generation === connectionGenerationRef.current;
  const toReason = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  // Orders codecs for setCodecPreferences: VP9, then H265/HEVC, then
  // H264/AVC, then everything else.
  const getCodecPriority = (codec: { mimeType: string }): number => {
    const mimeType = codec.mimeType.toLowerCase();
    if (mimeType.includes('vp9')) return 1;
    if (mimeType.includes('h265') || mimeType.includes('hevc')) return 2;
    if (mimeType.includes('h264') || mimeType.includes('avc')) return 3;
    return 4; // Everything else
  };

  // Watchdog: if the control channel has not opened in time, retry.
  connectionSuccessTimeoutRef.current = window.setTimeout(() => {
    connectionSuccessTimeoutRef.current = undefined;
    if (!isCurrentAttempt() || controlChannelOpenedRef.current) {
      return;
    }
    scheduleRetry('Connection did not succeed within 15 seconds', generation);
  }, CONNECTION_SUCCESS_TIMEOUT_MS);

  try {
    const ws = new WebSocket(`${url}?token=${token}`);
    wsRef.current = ws;

    // Wait for WebSocket to connect
    await new Promise<void>((resolve, reject) => {
      let settled = false;
      const timeoutId = window.setTimeout(() => reject(new Error('WebSocket connection timeout')), 30000);
      // First outcome wins; later events for the same socket are ignored.
      const settle = (callback: () => void) => {
        if (settled) {
          return;
        }
        settled = true;
        window.clearTimeout(timeoutId);
        callback();
      };

      ws.onopen = () => {
        if (!isCurrentAttempt() || wsRef.current !== ws) {
          return;
        }
        settle(resolve);
      };
      ws.onerror = () => {
        if (!isCurrentAttempt() || wsRef.current !== ws) {
          return;
        }
        // WebSocket error events carry no details; concatenating the event
        // object would only produce "[object Event]".
        updateStatus('WebSocket error');
        settle(() => reject(new Error('WebSocket connection failed')));
      };
      ws.onclose = () => {
        if (!isCurrentAttempt() || wsRef.current !== ws) {
          return;
        }
        updateStatus('WebSocket closed');
        settle(() => reject(new Error('WebSocket closed before connection was established')));
      };
    });

    if (!isCurrentAttempt() || wsRef.current !== ws) {
      return;
    }

    // Connected: from here on, socket errors/closes are only reported.
    ws.onerror = () => {
      if (!isCurrentAttempt() || wsRef.current !== ws) {
        return;
      }
      updateStatus('WebSocket error');
    };
    ws.onclose = () => {
      if (!isCurrentAttempt() || wsRef.current !== ws) {
        return;
      }
      updateStatus('WebSocket closed');
    };

    // Request RTCConfiguration
    const rtcConfigPromise = new Promise<RTCConfiguration>((resolve, reject) => {
      // Detach the temporary listener and timer on every outcome,
      // including the timeout.
      const cleanup = () => {
        window.clearTimeout(timeoutId);
        ws.removeEventListener('message', messageHandler);
      };
      const messageHandler = (event: MessageEvent) => {
        try {
          const message = JSON.parse(event.data);
          if (message.type === 'rtcConfiguration') {
            cleanup();
            resolve(message.rtcConfiguration);
          }
        } catch (e) {
          cleanup();
          console.error('Error handling RTC configuration:', e);
          reject(e);
        }
      };
      const timeoutId = window.setTimeout(() => {
        cleanup();
        reject(new Error('RTCConfiguration timeout'));
      }, 30000);

      ws.addEventListener('message', messageHandler);
      ws.send(
        JSON.stringify({
          type: 'requestRtcConfiguration',
          sessionId: sessionId,
        }),
      );
    });

    const rtcConfig = await rtcConfigPromise;
    if (!isCurrentAttempt() || wsRef.current !== ws) {
      return;
    }

    const peerConnection = new RTCPeerConnection(rtcConfig);
    peerConnectionRef.current = peerConnection;
    peerConnection.addTransceiver('audio', { direction: 'recvonly' });
    const videoTransceiver = peerConnection.addTransceiver('video', { direction: 'recvonly' });

    // As hardware encoder, we use H265 for iOS and VP9 for Android.
    // We make sure these two are the first ones in the list.
    // If not, the fallback is H264 which is also hardware accelerated, although not as good,
    // available on all platforms.
    //
    // The rest is not important.
    if (RTCRtpReceiver.getCapabilities) {
      const capabilities = RTCRtpReceiver.getCapabilities('video');
      if (capabilities && capabilities.codecs) {
        // Sort a copy so the capabilities object is left untouched.
        const sortedCodecs = [...capabilities.codecs].sort(
          (a, b) => getCodecPriority(a) - getCodecPriority(b),
        );
        videoTransceiver.setCodecPreferences(sortedCodecs);
        debugLog('Set codec preferences:', sortedCodecs.map((c) => c.mimeType).join(', '));
      }
    }

    const dataChannel = peerConnection.createDataChannel('control', {
      ordered: true,
      negotiated: true,
      id: 1,
    });
    dataChannelRef.current = dataChannel;

    dataChannel.onopen = () => {
      if (!isCurrentAttempt() || dataChannelRef.current !== dataChannel || wsRef.current !== ws) {
        return;
      }
      // The control channel opening is what counts as "connection succeeded".
      controlChannelOpenedRef.current = true;
      clearConnectionSuccessTimeout();
      updateStatus('Control channel opened');

      // Spin up the AX fetcher now that we have a stable WS + control
      // channel. The fetcher's send function reuses this WS; it stops
      // sending if the WS dies. start() is called lazily based on the
      // inspectMode prop via a sibling useEffect.
      if (!axFetcherRef.current) {
        axFetcherRef.current = new AxFetcher({
          platform,
          baseIntervalMs: axPollIntervalMs,
          maxBackoffMs: axMaxBackoffMs,
          send: (payload) => {
            if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return false;
            try {
              wsRef.current.send(JSON.stringify(payload));
              return true;
            } catch {
              return false;
            }
          },
          onSnapshot: (snapshot) => {
            setAxSnapshot((prev) => (axSnapshotsEqual(prev, snapshot) ? prev : snapshot));
            // Defer to a microtask so customer code (which may DOM-write,
            // start expensive work, or itself call back into ref
            // methods) doesn't run synchronously inside our state-setter
            // path. React then has a chance to schedule its render before
            // the customer handler kicks off side-effects.
            // NOTE(review): the callback is invoked for every snapshot the
            // fetcher delivers; presumably the fetcher de-duplicates
            // identical snapshots itself — confirm against AxFetcher.
            queueMicrotask(() => {
              safeInvoke('onAxSnapshotChange', onAxSnapshotChangeRef.current, snapshot);
            });
          },
          onStatusChange: (status, error) => {
            safeInvoke('onAxStatusChange', onAxStatusChangeRef.current, status, error);
          },
        });
        if (inspectModeRef.current === true || inspectModeRef.current === 'hover-only') {
          axFetcherRef.current.start();
        }
      }

      // True while this attempt still needs frames requested on its behalf.
      const canRequestFrame = () =>
        isCurrentAttempt() &&
        !firstFrameShownRef.current &&
        dataChannelRef.current === dataChannel &&
        wsRef.current === ws &&
        ws.readyState === WebSocket.OPEN;

      const sendRequestFrame = () => {
        if (!canRequestFrame()) {
          return;
        }
        ws.send(JSON.stringify({ type: 'requestFrame', sessionId: sessionId }));
      };

      // Ask for a frame immediately, then every 250 ms until the first frame
      // is shown or this attempt is no longer current.
      sendRequestFrame();
      stopRequestFrameLoop();
      requestFrameIntervalRef.current = window.setInterval(() => {
        if (!canRequestFrame()) {
          stopRequestFrameLoop();
          return;
        }
        sendRequestFrame();
      }, 250);

      // Send openUrl message if the prop is provided
      if (openUrl) {
        // Only the decode step can fail here (malformed escape sequence);
        // fall back to the raw value in that case and send exactly once.
        let urlToOpen = openUrl;
        try {
          urlToOpen = decodeURIComponent(openUrl);
        } catch (error) {
          console.error('Error decoding URL, falling back to the original URL', error);
        }
        updateStatus('Opening URL');
        ws.send(
          JSON.stringify({
            type: 'openUrl',
            url: urlToOpen,
            sessionId: sessionId,
          }),
        );
        // openUrl can take a moment to load the destination — boost
        // AX polling so the overlay refreshes through the transition.
        axFetcherRef.current?.bumpActivity();
      }
    };

    dataChannel.onclose = () => {
      if (!isCurrentAttempt() || dataChannelRef.current !== dataChannel) {
        return;
      }
      updateStatus('Control channel closed');
    };

    dataChannel.onerror = (error) => {
      if (!isCurrentAttempt() || dataChannelRef.current !== dataChannel) {
        return;
      }
      // The event object is logged in full; the status line stays readable
      // instead of showing "[object Event]".
      console.error('Control channel error:', error);
      updateStatus('Control channel error');
    };

    // Set up connection state monitoring
    peerConnection.onconnectionstatechange = () => {
      if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
        return;
      }
      updateStatus('Connection state: ' + peerConnection.connectionState);
      if (peerConnection.connectionState === 'failed') {
        scheduleRetry('WebRTC connection entered failed state', generation);
      }
    };

    peerConnection.oniceconnectionstatechange = () => {
      if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
        return;
      }
      const iceState = peerConnection.iceConnectionState;
      updateStatus('ICE state: ' + iceState);

      if (iceState === 'connected' || iceState === 'completed') {
        clearIceDisconnectedGrace();
        return;
      }
      if (iceState === 'failed') {
        clearIceDisconnectedGrace();
        scheduleRetry('ICE connection entered failed state', generation);
        return;
      }
      if (
        iceState === 'disconnected' &&
        autoReconnectRef.current &&
        iceDisconnectedGraceRef.current === undefined
      ) {
        // Cap the browser's natural disconnected→failed escalation to recover faster.
        iceDisconnectedGraceRef.current = window.setTimeout(() => {
          iceDisconnectedGraceRef.current = undefined;
          if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
            return;
          }
          if (peerConnection.iceConnectionState === 'disconnected') {
            scheduleRetry('ICE stayed disconnected past grace period', generation);
          }
        }, ICE_DISCONNECTED_GRACE_MS);
      }
    };

    // Set up video handling
    peerConnection.ontrack = (event) => {
      if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
        return;
      }
      updateStatus('Received remote track: ' + event.track.kind);
      if (event.track.kind === 'video' && videoRef.current) {
        debugLog(`[${new Date().toISOString()}] Video track received:`, event.track);
        videoRef.current.srcObject = event.streams[0];
      }
    };

    // Handle ICE candidates
    peerConnection.onicecandidate = (event) => {
      if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection || wsRef.current !== ws) {
        return;
      }
      // A null candidate is the end-of-gathering marker.
      if (!event.candidate) {
        updateStatus('ICE candidate gathering completed');
        return;
      }
      // A real candidate that cannot be sent must not be reported as
      // "gathering completed".
      if (ws.readyState !== WebSocket.OPEN) {
        updateStatus('Dropped ICE candidate: WebSocket is not open');
        return;
      }
      const message = {
        type: 'candidate',
        candidate: event.candidate.candidate,
        sdpMid: event.candidate.sdpMid,
        sdpMLineIndex: event.candidate.sdpMLineIndex,
        sessionId: sessionId,
      };
      ws.send(JSON.stringify(message));
      updateStatus('Sent ICE candidate');
    };

    // Handle incoming messages
    ws.onmessage = async (event) => {
      if (!isCurrentAttempt() || wsRef.current !== ws) {
        return;
      }

      let message;
      try {
        message = JSON.parse(event.data);
      } catch (e) {
        debugWarn('Error parsing message:', e);
        return;
      }

      // Inspect-mode responses are routed to the fetcher first so it
      // can resolve in-flight requests regardless of which platform's
      // protocol is in use.
      if (axFetcherRef.current?.handleMessage(message)) {
        return;
      }

      updateStatus('Received: ' + message.type);

      switch (message.type) {
        case 'answer':
          if (!peerConnectionRef.current || peerConnectionRef.current !== peerConnection) {
            updateStatus('No peer connection, skipping answer');
            break;
          }
          // This handler is async and nobody awaits it, so a rejection here
          // would surface only as an unhandled promise rejection. Report it
          // instead; the connection-success watchdog still drives the retry.
          try {
            await peerConnection.setRemoteDescription(
              new RTCSessionDescription({
                type: 'answer',
                sdp: message.sdp,
              }),
            );
          } catch (e) {
            if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
              return;
            }
            console.error('Failed to set remote description:', e);
            updateStatus('Failed to set remote description: ' + toReason(e));
            break;
          }
          if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
            return;
          }
          updateStatus('Set remote description');
          break;

        case 'candidate':
          if (!peerConnectionRef.current || peerConnectionRef.current !== peerConnection) {
            updateStatus('No peer connection, skipping candidate');
            break;
          }
          // Same reasoning as for 'answer': a single bad candidate must not
          // turn into an unhandled rejection.
          try {
            await peerConnection.addIceCandidate(
              new RTCIceCandidate({
                candidate: message.candidate,
                sdpMid: message.sdpMid,
                sdpMLineIndex: message.sdpMLineIndex,
              }),
            );
          } catch (e) {
            if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
              return;
            }
            debugWarn('Failed to add ICE candidate:', e);
            updateStatus('Failed to add ICE candidate: ' + toReason(e));
            break;
          }
          if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
            return;
          }
          updateStatus('Added ICE candidate');
          break;

        case 'screenshot':
        case 'screenshotResult': {
          if (typeof message.id !== 'string') {
            debugWarn('Received invalid screenshot success message:', message);
            break;
          }
          const screenshotError = getScreenshotError(message);
          if (screenshotError) {
            const rejecter = pendingScreenshotRejectersRef.current.get(message.id);
            if (!rejecter) {
              debugWarn(`Received screenshot error for unknown or handled id: ${message.id}`);
              break;
            }
            debugWarn(`Received screenshot error for id ${message.id}: ${screenshotError}`);
            rejecter(new Error(screenshotError));
            pendingScreenshotResolversRef.current.delete(message.id);
            pendingScreenshotRejectersRef.current.delete(message.id);
            break;
          }
          const screenshotData = toScreenshotData(message);
          if (!screenshotData) {
            debugWarn('Received screenshot message without image data:', message);
            break;
          }
          const resolver = pendingScreenshotResolversRef.current.get(message.id);
          if (!resolver) {
            debugWarn(`Received screenshot data for unknown or handled id: ${message.id}`);
            break;
          }
          debugLog(`Received screenshot data for id ${message.id}`);
          resolver(screenshotData);
          pendingScreenshotResolversRef.current.delete(message.id);
          pendingScreenshotRejectersRef.current.delete(message.id);
          break;
        }

        case 'screenshotError': {
          if (typeof message.id !== 'string' || typeof message.message !== 'string') {
            debugWarn('Received invalid screenshot error message:', message);
            break;
          }
          const rejecter = pendingScreenshotRejectersRef.current.get(message.id);
          if (!rejecter) {
            debugWarn(`Received screenshot error for unknown or handled id: ${message.id}`);
            break;
          }
          debugWarn(`Received screenshot error for id ${message.id}: ${message.message}`);
          rejecter(new Error(message.message));
          pendingScreenshotResolversRef.current.delete(message.id);
          pendingScreenshotRejectersRef.current.delete(message.id);
          break;
        }

        case 'terminateAppResult': {
          if (typeof message.id !== 'string') {
            debugWarn('Received invalid terminateApp result message:', message);
            break;
          }
          if (typeof message.error === 'string') {
            const terminateRejecter = pendingTerminateAppRejectersRef.current.get(message.id);
            if (!terminateRejecter) {
              debugWarn(`Received terminateApp error for unknown or handled id: ${message.id}`);
              break;
            }
            debugWarn(`Received terminateApp error for id ${message.id}: ${message.error}`);
            terminateRejecter(new Error(message.error));
            pendingTerminateAppResolversRef.current.delete(message.id);
            pendingTerminateAppRejectersRef.current.delete(message.id);
            break;
          }
          const terminateResolver = pendingTerminateAppResolversRef.current.get(message.id);
          if (!terminateResolver) {
            debugWarn(`Received terminateApp result for unknown or handled id: ${message.id}`);
            break;
          }
          debugLog(`Received terminateApp success for id ${message.id}`);
          terminateResolver();
          pendingTerminateAppResolversRef.current.delete(message.id);
          pendingTerminateAppRejectersRef.current.delete(message.id);
          break;
        }

        default:
          debugWarn(`Received unhandled message type: ${message.type}`, message);
          break;
      }
    };

    // Create and send offer
    if (peerConnectionRef.current === peerConnection) {
      const offer = await peerConnection.createOffer({
        offerToReceiveVideo: true,
        offerToReceiveAudio: false,
      });
      if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
        return;
      }

      await peerConnection.setLocalDescription(offer);
      if (!isCurrentAttempt() || peerConnectionRef.current !== peerConnection) {
        return;
      }

      // Only claim the offer was sent when it actually went out.
      if (wsRef.current === ws && ws.readyState === WebSocket.OPEN) {
        ws.send(
          JSON.stringify({
            type: 'offer',
            sdp: offer.sdp,
            sessionId: sessionId,
          }),
        );
        updateStatus('Sent offer');
      } else {
        updateStatus('Offer not sent: WebSocket is not open');
      }
    }
  } catch (e) {
    if (!isCurrentAttempt()) {
      return;
    }
    const reason = toReason(e);
    updateStatus('Error: ' + reason);
    scheduleRetry(reason, generation);
  }
};

// Starts a fresh connection sequence from attempt 0.
const start = () => {
  void startAttempt(0);
};

// Stops the connection for good: bumping the generation invalidates every
// callback and pending retry belonging to the attempt that was running.
const stop = () => {
  connectionGenerationRef.current += 1;
  connectionAttemptRef.current = 0;
  controlChannelOpenedRef.current = false;
  clearScheduledRetry();
  clearIceDisconnectedGrace();
  teardownConnection();
  updateStatus('Stopped');
};

// Click handler for the manual retry control; stopPropagation keeps the
// click from bubbling to the surrounding handlers.
const handleManualRetry = (event: React.MouseEvent) => {
  event.stopPropagation();
  start();
};

// Connection lifecycle: (re)connect whenever url, token or the sessionId prop
// changes, and tear everything down on cleanup/unmount.
useEffect(() => {
  // Reset video loaded state when connection params change
  setVideoLoaded(false);

  // Start connection when component mounts
  start();

  // Only start keepAlive if page is visible
  if (!document.hidden) {
    startKeepAlive();
  }

  // Add visibility change listener
  document.addEventListener('visibilitychange', handleVisibilityChange);

  // Clean up
  return () => {
    stopKeepAlive();
    stop();
    document.removeEventListener('visibilitychange', handleVisibilityChange);
  };
}, [url, token, propSessionId]);

// Recompute the inspect-overlay geometry (container-local pixel rect of
// the actually-rendered video content) from the current mapping context.
// The InfoCard places itself in viewport coordinates from pointer events
// directly, so no viewport-space origin is needed in the geometry.
const recomputeOverlayGeometry = () => {
    const mapping = computeVideoMappingContext();
    if (!mapping) {
      // No mapping context available: clear the overlay geometry.
      setOverlayGeometry(null);
      return;
    }

    const { videoRect, containerRect, offsetX, offsetY, actualWidth, actualHeight } = mapping;

    // Rect of the rendered video content, expressed relative to the container.
    const geometry: InspectOverlayGeometry = {
      left: videoRect.left - containerRect.left + offsetX,
      top: videoRect.top - containerRect.top + offsetY,
      width: actualWidth,
      height: actualHeight,
    };

    // Hand back the previous object when every field matches, so an unchanged
    // rect keeps the same state reference.
    setOverlayGeometry((current) => {
      if (!current) {
        return geometry;
      }
      const unchanged =
        current.left === geometry.left &&
        current.top === geometry.top &&
        current.width === geometry.width &&
        current.height === geometry.height;
      return unchanged ? current : geometry;
    });
  };

  // Calculate video position and border-radius based on frame dimensions
  useEffect(() => {
    const video = videoRef.current;
    const frame = frameRef.current;
    const container = containerRef.current;

    if (!video) return;

    const updateVideoPosition = () => {
      // If no frame, just refresh overlay geometry; no inset/letterbox math
      // is needed since the video element is its own size.
      if (!showFrame || !frame) {
        setVideoStyle({});
        recomputeOverlayGeometry();
        return;
      }

      const frameWidth = frame.clientWidth;
      const frameHeight = frame.clientHeight;

      if (frameWidth === 0 || frameHeight === 0) return;

      // Determine landscape based on video's intrinsic dimensions
      const landscape = video.videoWidth > video.videoHeight;
      setIsLandscape(landscape);
      setUseAndroidTabletFrame(
        platform === 'android' && isAndroidTabletVideo(video.videoWidth, video.videoHeight),
      );

      const pos = landscape ? config.videoPosition.landscape : config.videoPosition.portrait;
      let newStyle: React.CSSProperties = {};

      if (pos.heightMultiplier) {
        newStyle.height = `${frameHeight * pos.heightMultiplier}px`;
        // Let the other dimension follow the video stream's intrinsic aspect ratio.
        newStyle.width = 'auto';
      } else if (pos.widthMultiplier) {
        newStyle.width = `${frameWidth * pos.widthMultiplier}px`;
        // Let the other dimension follow the video stream's intrinsic aspect ratio.
        newStyle.height = 'auto';
      }

      newStyle.borderRadius = `${
        landscape ?
frameHeight * config.videoBorderRadiusMultiplier : frameWidth * config.videoBorderRadiusMultiplier }px`; setVideoStyle(newStyle); recomputeOverlayGeometry(); }; const resizeObserver = new ResizeObserver(() => { updateVideoPosition(); }); if (frame) resizeObserver.observe(frame); resizeObserver.observe(video); if (container) resizeObserver.observe(container); // Also update when the frame image loads if (frame) frame.addEventListener('load', updateVideoPosition); // Update when video metadata loads (to get correct intrinsic dimensions) video.addEventListener('loadedmetadata', updateVideoPosition); // IMPORTANT: When the WebRTC stream changes orientation, the intrinsic video size // (videoWidth/videoHeight) can change without re-firing 'loadedmetadata'. // The