/**
 * dom.ts — Agent manipulates its own DOM.
 *
 * Bugs fixed vs the original:
 *  #1 dom_observe crashed when passing {types: ['childList','characterData']} without 'attributes' —
 *     code always set attributeOldValue:true / characterDataOldValue:true, which
 *     require the parent flag to also be true. Now: each *OldValue flag is set
 *     ONLY when the corresponding flag is enabled.
 *  #2 dom_mutate default all:false silently worked on 1 of N matches — now the
 *     response explicitly reports `matched` vs `affected` and the agent can
 *     detect partial application.
 *  #3 setStyle ignored kebab-case keys (background-color) — now normalizes to
 *     camelCase, accepting both forms. CSS custom properties (--foo) are set
 *     through style.setProperty because they have no camelCase form.
 *  #4 click() just called element.click() which doesn't fire real MouseEvents
 *     in many frameworks. Now has a dispatch:'synthetic'|'native' option that
 *     fires a bubbling, cancelable MouseEvent with clientX/Y and button.
 *     (Modifier keys are available through dom_dispatch_event.)
 *  #5 No way to dispatch keyboard / input events → agent can't test forms.
 *     New dom_dispatch_event tool supports KeyboardEvent / MouseEvent /
 *     CustomEvent, and a plain Event for everything else (input, change, ...).
 *  #6 No way to pick Nth element by index → dom_query now has
 *     an `nth` param and dom_mutate accepts `nth`.
 *  #7 Added dom_wait_for — wait until a selector appears / becomes visible,
 *     with a timeout (useful before interacting with async-rendered UI).
 */

import { tool } from '@strands-agents/sdk'
import { z } from 'zod'

/* --------------------------------------------------------------------------
 * Shared helpers
 * ------------------------------------------------------------------------ */

/** JSON-friendly snapshot of one element, as produced by serializeElement. */
interface SerializedElement {
  tag: string
  id: string | undefined
  classes: string[] | undefined
  text: string | undefined
  rect: { x: number; y: number; w: number; h: number }
  visible: boolean
  attrs: Record<string, string>
  children?: SerializedElement[]
  /** Number of children that were dropped because of the per-node child limit. */
  truncated?: number
}

/** Maximum number of children serialized per node. */
const MAX_SERIALIZED_CHILDREN = 20
/** Maximum number of text characters kept per node. */
const MAX_TEXT_LENGTH = 200

/** Extracts a printable message from anything that can be thrown. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * Returns items[nth] when nth is an integer inside the array bounds,
 * otherwise undefined (negative, fractional, NaN and too-large values).
 */
function pickNth<T>(items: readonly T[], nth: number): T | undefined {
  if (!Number.isInteger(nth) || nth < 0 || nth >= items.length) return undefined
  return items[nth]
}

/** True when the element has a non-empty box and is neither hidden nor display:none. */
function isBoxVisible(rect: DOMRect, style: CSSStyleDeclaration): boolean {
  return rect.width > 0 && rect.height > 0 && style.visibility !== 'hidden' && style.display !== 'none'
}

/* --------------------------------------------------------------------------
 * Serialization helper
 * ------------------------------------------------------------------------ */

/**
 * Converts an element into a plain object (tag, id, classes, text, rect,
 * visibility, attributes) and recurses into its children.
 *
 * @param el       Element to serialize.
 * @param depth    Current recursion depth (callers pass 0).
 * @param maxDepth Children are included only while depth < maxDepth.
 * @returns A snapshot that is safe to pass to JSON.stringify.
 */
function serializeElement(el: Element, depth: number = 0, maxDepth: number = 2): SerializedElement {
  const rect = el.getBoundingClientRect()
  const style = window.getComputedStyle(el)
  const visible = isBoxVisible(rect, style) && parseFloat(style.opacity || '1') > 0

  // classList works for SVG elements too, where className is an
  // SVGAnimatedString rather than a plain string.
  const classes = Array.from(el.classList)
  const text = (el.textContent ?? '').trim().slice(0, MAX_TEXT_LENGTH)

  const out: SerializedElement = {
    tag: el.tagName.toLowerCase(),
    id: el.id || undefined,
    classes: classes.length > 0 ? classes : undefined,
    text: text || undefined,
    rect: { x: rect.x, y: rect.y, w: rect.width, h: rect.height },
    visible,
    attrs: Object.fromEntries(Array.from(el.attributes).map((a) => [a.name, a.value])),
  }

  if (depth < maxDepth && el.children.length > 0) {
    out.children = Array.from(el.children)
      .slice(0, MAX_SERIALIZED_CHILDREN)
      .map((child) => serializeElement(child, depth + 1, maxDepth))
    if (el.children.length > MAX_SERIALIZED_CHILDREN) {
      out.truncated = el.children.length - MAX_SERIALIZED_CHILDREN
    }
  }
  return out
}

/**
 * kebab-case → camelCase for CSS properties. Leaves already-camel keys alone.
 * Custom properties (leading "--") are returned untouched: they are
 * case-sensitive and have no camelCase equivalent.
 */
function kebabToCamel(k: string): string {
  if (k.startsWith('--') || !k.includes('-')) return k
  return k.replace(/-([a-z])/g, (_match, c: string) => c.toUpperCase())
}

/* --------------------------------------------------------------------------
 * dom_query
 * ------------------------------------------------------------------------ */

/**
 * Tool: runs querySelectorAll (optionally scoped to `root`) and returns the
 * serialized matches as a JSON string. Never throws; failures are reported as
 * `{ status: 'error', error }`.
 */
export const domQueryTool = tool({
  name: 'dom_query',
  description:
    'Query the page DOM with a CSS selector. Returns matching elements with tag, id, classes, text, bounding rect, attrs, and (optionally) children tree. ' +
    'Use `nth: N` (zero-indexed) to pick a single specific match.',
  inputSchema: z.object({
    selector: z.string().describe('CSS selector'),
    limit: z.number().optional().describe('Max results (default 10)'),
    depth: z.number().optional().describe('Child tree depth (default 2, 0 = no children)'),
    all: z.boolean().optional().describe('Return ALL matches (ignores limit)'),
    nth: z.number().optional().describe('Return only the Nth match (zero-indexed). Overrides limit/all.'),
    root: z.string().optional().describe('Optional root selector to scope the query (default: document)'),
  }),
  callback: (input) => {
    try {
      const rootEl = input.root ? document.querySelector(input.root) : null
      const scope: ParentNode = rootEl ?? document
      // The query still falls back to the whole document when the root is
      // missing, but the response says so instead of hiding it.
      const warning =
        input.root && rootEl === null
          ? `root "${input.root}" not found; searched the whole document`
          : undefined

      const nodes = Array.from(scope.querySelectorAll(input.selector))
      const maxDepth = input.depth ?? 2

      if (input.nth !== undefined) {
        const picked = pickNth(nodes, input.nth)
        if (picked === undefined) {
          return JSON.stringify({
            status: 'error',
            error: `nth=${input.nth} out of range (found ${nodes.length})`,
            selector: input.selector,
            count: nodes.length,
          })
        }
        return JSON.stringify({
          status: 'success',
          selector: input.selector,
          count: nodes.length,
          returned: 1,
          nth: input.nth,
          elements: [serializeElement(picked, 0, maxDepth)],
          warning,
        })
      }

      // Clamp so a negative limit cannot turn into "everything but the last N".
      const limit = input.all ? nodes.length : Math.max(0, input.limit ?? 10)
      const results = nodes.slice(0, limit).map((el) => serializeElement(el, 0, maxDepth))
      return JSON.stringify({
        status: 'success',
        selector: input.selector,
        count: nodes.length,
        returned: results.length,
        elements: results,
        warning,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/* --------------------------------------------------------------------------
 * dom_mutate
 * ------------------------------------------------------------------------ */

const domMutateInputSchema = z.object({
  selector: z.string().describe('CSS selector of target element(s)'),
  operation: z.enum([
    'setText',
    'setHTML',
    'setAttr',
    'removeAttr',
    'addClass',
    'removeClass',
    'toggleClass',
    'setStyle',
    'remove',
    'click',
    'focus',
    'blur',
    'scrollIntoView',
    'scrollTo',
    'append',
    'prepend',
    'setValue',
  ]),
  value: z.string().optional().describe('Value for setText/setHTML/append/prepend/setValue/scrollTo (pixels for scrollTo)'),
  attr: z.string().optional().describe('Attribute name for setAttr/removeAttr'),
  style: z.record(z.string(), z.string()).optional().describe('Style object for setStyle; keys accept kebab-case OR camelCase'),
  className: z.string().optional().describe('Class name(s) for addClass/removeClass/toggleClass'),
  all: z.boolean().optional().describe('Apply to ALL matches (default: first only)'),
  nth: z.number().optional().describe('Apply only to the Nth match (zero-indexed). Overrides all.'),
  click_options: z
    .object({
      dispatch: z.enum(['native', 'synthetic']).optional().describe('native: el.click() (default). synthetic: dispatch a real MouseEvent.'),
      clientX: z.number().optional(),
      clientY: z.number().optional(),
      button: z.number().optional(),
    })
    .optional(),
})

type DomMutateInput = z.infer<typeof domMutateInputSchema>

/** Returns `value`, or throws "<field> required for <operation>" when it is missing or empty. */
function requireArg(value: string | undefined, field: string, operation: string): string {
  if (!value) throw new Error(`${field} required for ${operation}`)
  return value
}

/** Splits a whitespace-separated class list into its non-empty tokens. */
function classTokens(className: string): string[] {
  return className.split(/\s+/).filter(Boolean)
}

/**
 * Performs one dom_mutate operation on one element.
 * Throws when the operation cannot be applied (missing argument, wrong
 * element type, non-numeric scroll offset); the caller records the message.
 */
function applyMutation(el: Element, input: DomMutateInput): void {
  const h = el as HTMLElement
  const op = input.operation
  switch (op) {
    case 'setText':
      h.textContent = input.value ?? ''
      return
    case 'setHTML':
      h.innerHTML = input.value ?? ''
      return
    case 'setAttr':
      h.setAttribute(requireArg(input.attr, 'attr', op), input.value ?? '')
      return
    case 'removeAttr':
      h.removeAttribute(requireArg(input.attr, 'attr', op))
      return
    case 'addClass':
      classTokens(requireArg(input.className, 'className', op)).forEach((c) => h.classList.add(c))
      return
    case 'removeClass':
      classTokens(requireArg(input.className, 'className', op)).forEach((c) => h.classList.remove(c))
      return
    case 'toggleClass':
      classTokens(requireArg(input.className, 'className', op)).forEach((c) => h.classList.toggle(c))
      return
    case 'setStyle': {
      if (!input.style) throw new Error('style required for setStyle')
      for (const [key, value] of Object.entries(input.style)) {
        if (key.startsWith('--')) {
          // Custom properties can only be written through setProperty.
          h.style.setProperty(key, value)
        } else {
          // Same as `h.style[camelKey] = value`, without an `any` cast.
          Reflect.set(h.style, kebabToCamel(key), value)
        }
      }
      return
    }
    case 'remove':
      h.remove()
      return
    case 'click': {
      const opts = input.click_options ?? {}
      const mode = opts.dispatch ?? 'native'
      // Elements without a click() method (e.g. SVG) fall back to a synthetic event.
      if (mode === 'native' && typeof h.click === 'function') {
        h.click()
        return
      }
      const rect = h.getBoundingClientRect()
      h.dispatchEvent(
        new MouseEvent('click', {
          bubbles: true,
          cancelable: true,
          view: window,
          // Default to the centre of the element.
          clientX: opts.clientX ?? rect.left + rect.width / 2,
          clientY: opts.clientY ?? rect.top + rect.height / 2,
          button: opts.button ?? 0,
        }),
      )
      return
    }
    case 'focus':
      h.focus()
      return
    case 'blur':
      h.blur()
      return
    case 'scrollIntoView':
      h.scrollIntoView({ behavior: 'smooth', block: 'center' })
      return
    case 'scrollTo': {
      const top = Number.parseInt(input.value ?? '0', 10)
      if (Number.isNaN(top)) {
        throw new Error(`scrollTo requires a numeric pixel value, got "${input.value}"`)
      }
      h.scrollTo({ top, behavior: 'smooth' })
      return
    }
    case 'append':
      if (input.value === undefined) throw new Error('value required for append')
      h.insertAdjacentHTML('beforeend', input.value)
      return
    case 'prepend':
      if (input.value === undefined) throw new Error('value required for prepend')
      h.insertAdjacentHTML('afterbegin', input.value)
      return
    case 'setValue': {
      if (!(h instanceof HTMLInputElement || h instanceof HTMLTextAreaElement || h instanceof HTMLSelectElement)) {
        throw new Error('setValue only works on input/textarea/select')
      }
      // React-compatible value setter: uses the native value setter so
      // React's synthetic event system notices the change.
      const next = input.value ?? ''
      const nativeSetter = Object.getOwnPropertyDescriptor(Object.getPrototypeOf(h), 'value')?.set
      if (nativeSetter) {
        nativeSetter.call(h, next)
      } else {
        h.value = next
      }
      h.dispatchEvent(new Event('input', { bubbles: true }))
      h.dispatchEvent(new Event('change', { bubbles: true }))
      return
    }
    default: {
      // Compile-time exhaustiveness check for the operation enum.
      const unreachable: never = op
      throw new Error(`Unsupported operation: ${String(unreachable)}`)
    }
  }
}

/**
 * Tool: applies one operation to the first match, the Nth match, or all
 * matches of a selector. The JSON response reports `matched`, `targeted` and
 * `affected` so partial application is visible, plus per-element `errors`.
 */
export const domMutateTool = tool({
  name: 'dom_mutate',
  description:
    'Mutate the DOM. Operations: setText, setHTML, setAttr, removeAttr, addClass, removeClass, toggleClass, setStyle, remove, click, focus, blur, scrollIntoView, scrollTo, append, prepend, setValue (for inputs). ' +
    'Default applies to the FIRST matching element; use all:true to apply to all, or nth:N for a specific index. ' +
    'style keys may be kebab-case ("background-color") OR camelCase ("backgroundColor") — both work.',
  inputSchema: domMutateInputSchema,
  callback: (input) => {
    try {
      const nodes = Array.from(document.querySelectorAll(input.selector))
      if (nodes.length === 0) {
        return JSON.stringify({
          status: 'error',
          error: 'No elements matched',
          selector: input.selector,
        })
      }

      let targets: Element[]
      if (input.nth !== undefined) {
        const picked = pickNth(nodes, input.nth)
        if (picked === undefined) {
          return JSON.stringify({ status: 'error', error: `nth=${input.nth} out of range (${nodes.length} matched)` })
        }
        targets = [picked]
      } else {
        targets = input.all ? nodes : nodes.slice(0, 1)
      }

      let affected = 0
      const errors: string[] = []
      for (const el of targets) {
        try {
          applyMutation(el, input)
          affected++
        } catch (e: unknown) {
          errors.push(`${el.tagName.toLowerCase()}: ${errorMessage(e)}`)
        }
      }

      return JSON.stringify({
        status: affected > 0 ? 'success' : 'error',
        operation: input.operation,
        matched: nodes.length,
        targeted: targets.length,
        affected,
        errors: errors.length ? errors : undefined,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/* --------------------------------------------------------------------------
 * dom_observe — FIXED
 * ------------------------------------------------------------------------ */

/** Compact description of one MutationRecord. */
interface MutationSummary {
  type: MutationRecordType
  /** Lower-cased node name: the tag for elements, "#text" for text nodes. */
  target: string
  added: number
  removed: number
  attribute: string | null
  oldValue: string | null
}

/**
 * Tool: observes a subtree with MutationObserver for `duration_ms` and returns
 * a summary of the recorded mutations. `total` counts every mutation seen;
 * only the first `max_mutations` are kept and returned.
 */
export const domObserveTool = tool({
  name: 'dom_observe',
  description:
    'Watch a DOM subtree for changes (via MutationObserver). Returns list of mutations over a duration window. ' +
    'Safe: attributeOldValue / characterDataOldValue are only enabled when their parent flag is enabled (fixed a crash in the original impl).',
  inputSchema: z.object({
    selector: z.string().describe('CSS selector of root to observe'),
    duration_ms: z.number().optional().describe('How long to observe (default 3000)'),
    types: z.array(z.enum(['childList', 'attributes', 'characterData'])).optional().describe('Which mutation types to watch. Default: all three.'),
    subtree: z.boolean().optional().describe('Observe descendants (default true)'),
    attributeFilter: z.array(z.string()).optional().describe('Only report changes to these attributes (e.g. ["class","style"])'),
    max_mutations: z.number().optional().describe('Max mutations to return. Default 100.'),
  }),
  callback: async (input) => {
    try {
      const root = document.querySelector(input.selector)
      if (!root) return JSON.stringify({ status: 'error', error: `Root element not found: ${input.selector}` })

      // Determine which types to observe (no `types` means all three).
      const watchChild = !input.types || input.types.includes('childList')
      const watchAttrs = !input.types || input.types.includes('attributes')
      const watchChar = !input.types || input.types.includes('characterData')
      if (!watchChild && !watchAttrs && !watchChar) {
        return JSON.stringify({ status: 'error', error: 'At least one of types must be enabled' })
      }

      // `??` so that an explicit 0 is respected instead of becoming 3000.
      const durationMs = input.duration_ms ?? 3000
      const cap = Math.max(0, Math.floor(input.max_mutations ?? 100))

      // Only the first `cap` records are stored, so a noisy subtree cannot
      // grow the buffer without bound; `total` still counts everything.
      const mutations: MutationSummary[] = []
      let total = 0
      const record = (records: MutationRecord[]): void => {
        for (const m of records) {
          total++
          if (mutations.length >= cap) continue
          mutations.push({
            type: m.type,
            // nodeName (not tagName) so text-node targets are reported too.
            target: m.target.nodeName.toLowerCase(),
            added: m.addedNodes.length,
            removed: m.removedNodes.length,
            attribute: m.attributeName,
            oldValue: m.oldValue,
          })
        }
      }

      // Build the init object carefully — *OldValue flags REQUIRE their parent flag.
      const initOpts: MutationObserverInit = {
        childList: watchChild,
        attributes: watchAttrs,
        characterData: watchChar,
        subtree: input.subtree ?? true,
      }
      if (watchAttrs) initOpts.attributeOldValue = true
      if (watchChar) initOpts.characterDataOldValue = true
      if (watchAttrs && input.attributeFilter && input.attributeFilter.length > 0) {
        initOpts.attributeFilter = input.attributeFilter
      }

      const obs = new MutationObserver(record)
      try {
        obs.observe(root, initOpts)
        await new Promise<void>((resolve) => setTimeout(resolve, durationMs))
        // Collect anything queued but not yet delivered to the callback.
        record(obs.takeRecords())
      } finally {
        obs.disconnect()
      }

      return JSON.stringify({
        status: 'success',
        root: input.selector,
        duration_ms: durationMs,
        watched: { childList: watchChild, attributes: watchAttrs, characterData: watchChar },
        total,
        returned: mutations.length,
        mutations,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

/* --------------------------------------------------------------------------
 * dom_wait_for — wait for a selector to appear / become visible
 * ------------------------------------------------------------------------ */

/**
 * Returns the element dom_wait_for is waiting for, or null if it is not there yet.
 * Without `requireVisible` this is the first match; with it, the first match
 * that has a non-empty box and is neither hidden nor display:none.
 */
function findWaitTarget(selector: string, requireVisible: boolean): Element | null {
  if (!requireVisible) return document.querySelector(selector)
  const visibleMatch = Array.from(document.querySelectorAll(selector)).find((el) =>
    isBoxVisible(el.getBoundingClientRect(), window.getComputedStyle(el)),
  )
  return visibleMatch ?? null
}

/**
 * Tool: polls the document until the selector matches (optionally: matches a
 * visible element) or the timeout elapses. Always checks at least once, even
 * with timeout_ms = 0. Never rejects; failures are reported as JSON errors.
 */
export const domWaitForTool = tool({
  name: 'dom_wait_for',
  description:
    'Wait until a selector matches (or becomes visible). Returns when found, or times out. Useful before interacting with async-rendered UI (panels, modals, routes).',
  inputSchema: z.object({
    selector: z.string(),
    timeout_ms: z.number().optional().describe('Default 10000'),
    require_visible: z.boolean().optional().describe('Wait until the element is actually visible (width>0, display!=none). Default false.'),
    poll_ms: z.number().optional().describe('Polling interval. Default 100.'),
  }),
  callback: async (input) => {
    try {
      const timeoutMs = input.timeout_ms ?? 10000
      const pollMs = input.poll_ms ?? 100
      const startedAt = Date.now()
      const deadline = startedAt + timeoutMs

      for (;;) {
        const match = findWaitTarget(input.selector, input.require_visible ?? false)
        if (match) {
          return JSON.stringify({
            status: 'success',
            selector: input.selector,
            found_ms: Date.now() - startedAt,
            element: serializeElement(match, 0, 0),
          })
        }

        const remaining = deadline - Date.now()
        if (remaining <= 0) break
        // Never sleep past the deadline.
        const delay = Math.max(0, Math.min(pollMs, remaining))
        await new Promise<void>((resolve) => setTimeout(resolve, delay))
      }

      return JSON.stringify({
        status: 'error',
        error: 'Timeout',
        selector: input.selector,
        timeout_ms: timeoutMs,
      })
    } catch (err: unknown) {
      // e.g. a syntactically invalid selector
      return JSON.stringify({ status: 'error', error: errorMessage(err), selector: input.selector })
    }
  },
})

/* --------------------------------------------------------------------------
 * dom_dispatch_event — fire real user-input events
 * ------------------------------------------------------------------------ */

// Event-name prefixes used to auto-detect the category when none is given.
const KEYBOARD_EVENT_RE = /^key/i
const MOUSE_EVENT_RE = /^(click|mouse|contextmenu|dblclick)/i
const INPUT_EVENT_RE = /^(input|change|focus|blur)/i

/**
 * Tool: constructs an event (KeyboardEvent, MouseEvent, CustomEvent or plain
 * Event) and dispatches it on the selected element. The response reports
 * whether a listener called preventDefault().
 */
export const domDispatchEventTool = tool({
  name: 'dom_dispatch_event',
  description:
    'Dispatch a real DOM event on an element. Supports keyboard (keydown/keypress/keyup), mouse (click/mousedown/mouseup/mousemove), input, change, focus, blur, submit, scroll, or any custom event. ' +
    'Use this to simulate real user interaction (e.g. Enter key on a textarea, typing in a React input).',
  inputSchema: z.object({
    selector: z.string(),
    nth: z.number().optional(),
    event_type: z.string().describe('Event name, e.g. "keydown", "click", "input", "submit", "custom-event"'),
    category: z.enum(['keyboard', 'mouse', 'input', 'custom', 'generic']).optional().describe('Default: auto-detect from event_type'),
    bubbles: z.boolean().optional().describe('Default true'),
    cancelable: z.boolean().optional().describe('Default true'),
    // Keyboard specifics
    key: z.string().optional().describe('e.g. "Enter", "Escape", "ArrowDown"'),
    code: z.string().optional().describe('e.g. "Enter", "KeyA"'),
    ctrlKey: z.boolean().optional(),
    shiftKey: z.boolean().optional(),
    altKey: z.boolean().optional(),
    metaKey: z.boolean().optional(),
    // Mouse specifics
    clientX: z.number().optional(),
    clientY: z.number().optional(),
    button: z.number().optional(),
    // CustomEvent detail
    detail: z.any().optional(),
  }),
  callback: (input) => {
    try {
      const nodes = Array.from(document.querySelectorAll(input.selector))
      if (nodes.length === 0) return JSON.stringify({ status: 'error', error: 'No elements matched' })

      const el = pickNth(nodes, input.nth ?? 0)
      if (el === undefined) {
        return JSON.stringify({ status: 'error', error: `nth=${input.nth} out of range (${nodes.length})` })
      }

      const bubbles = input.bubbles ?? true
      const cancelable = input.cancelable ?? true
      const category =
        input.category ??
        (KEYBOARD_EVENT_RE.test(input.event_type)
          ? 'keyboard'
          : MOUSE_EVENT_RE.test(input.event_type)
            ? 'mouse'
            : INPUT_EVENT_RE.test(input.event_type)
              ? 'input'
              : 'generic')

      const modifiers = {
        ctrlKey: !!input.ctrlKey,
        shiftKey: !!input.shiftKey,
        altKey: !!input.altKey,
        metaKey: !!input.metaKey,
      }

      let ev: Event
      if (category === 'keyboard') {
        ev = new KeyboardEvent(input.event_type, {
          bubbles,
          cancelable,
          view: window,
          key: input.key,
          code: input.code,
          ...modifiers,
        })
      } else if (category === 'mouse') {
        const rect = el.getBoundingClientRect()
        ev = new MouseEvent(input.event_type, {
          bubbles,
          cancelable,
          view: window,
          // Default to the centre of the element.
          clientX: input.clientX ?? rect.left + rect.width / 2,
          clientY: input.clientY ?? rect.top + rect.height / 2,
          button: input.button ?? 0,
          ...modifiers,
        })
      } else if (category === 'custom') {
        ev = new CustomEvent(input.event_type, { bubbles, cancelable, detail: input.detail })
      } else {
        // 'input' and 'generic' both use a plain Event.
        ev = new Event(input.event_type, { bubbles, cancelable })
      }

      // dispatchEvent returns false when a listener called preventDefault().
      const prevented = !el.dispatchEvent(ev)
      return JSON.stringify({
        status: 'success',
        event_type: input.event_type,
        category,
        target: el.tagName.toLowerCase(),
        default_prevented: prevented,
      })
    } catch (err: unknown) {
      return JSON.stringify({ status: 'error', error: errorMessage(err) })
    }
  },
})

export const DOM_TOOLS = [domQueryTool, domMutateTool, domObserveTool, domWaitForTool, domDispatchEventTool]