import { hash } from '@ibgib/helper-gib/dist/helpers/utils-helper.mjs';
// import { SemanticChunkInfo } from './page-analyzer-types.mjs';

const logalot = true;
const lc = `[semantic-chunker.mts]`;

/** The threshold at which an element's score is considered a heading. */
const HEADING_SCORE_THRESHOLD = 30;

/** The maximum character length for a single chunk before it's broken down further. */
const MAX_CHUNK_CHAR_LENGTH = 2000;

/**
 * Calculates a "heading score" for a given element to determine if it should be
 * treated as a semantic heading for chunking purposes.
 *
 * Signals and their weights:
 * - `h1`..`h6` tags: +50 down to +25.
 * - a `<p>` whose only element child is a `<strong>` whose text starts with
 *   a number, a dot and whitespace (e.g. "1. "): +50.
 * - text that is entirely uppercase, contains at least one cased letter and is
 *   between 6 and 99 characters long: +40.
 * - the element contains a `<strong>` descendant: +15.
 * - text longer than 150 characters: -20.
 * - more than 30 characters of text directly inside the element (outside of
 *   its child elements): -15.
 *
 * @param el the element to score.
 * @returns A score where a higher number indicates a higher likelihood of
 * being a heading. Returns 0 for a missing element or one without text.
 *
 * ## intent
 *
 * Some pages do not use conventional headings; rather, they use some sort of
 * manual semantic headings. This is an attempt at providing a likelihood that
 * such an element is a heading.
 *
 * For example, if the following paragraphs are found (shown as plain text, one
 * paragraph element per line; inline emphasis markup is not shown):
 *
 *     1. TOPIC YO: here is some things some paragraph
 *     Sub-topic here and here is some more text as a sub-topic...
 *     Sub-topic here as well and here is some more text as a sub-topic...
 *     maybe just another paragraph
 *     2. TOPIC TWO YO: here is some things some paragraph
 *     Another sub-topic here and here is some more text as a sub-topic...
 *
 * Some of these are actually headings, or parts of them include headings, and
 * this is a specific use case this function should be handling.
 */
function getHeadingScore(el: HTMLElement): number {
    if (!el?.tagName) { return 0; }

    const text = el.innerText?.trim() ?? '';
    if (!text) { return 0; }

    let score = 0;
    const tagName = el.tagName.toLowerCase();

    // Assign base scores for traditional heading tags.
    switch (tagName) {
        case 'h1': score += 50; break;
        case 'h2': score += 45; break;
        case 'h3': score += 40; break;
        case 'h4': score += 35; break;
        case 'h5': score += 30; break;
        case 'h6': score += 25; break;
    }

    // Strong signal: a <p> tag whose only element child is a <strong> tag with
    // a numbered format. This is common in legal documents or rules.
    const onlyChild = el.children.length === 1 ? el.children[0] : undefined;
    if (tagName === 'p' && onlyChild?.tagName.toLowerCase() === 'strong') {
        const strongText = (onlyChild as HTMLElement).innerText?.trim() ?? '';
        if (/^\d+\.\s/.test(strongText)) {
            score += 50; // Very strong signal
        }
    }

    // Another strong signal: text is all uppercase (and of reasonable length).
    // The toLowerCase() comparison requires at least one cased letter, so that
    // purely numeric/punctuation text (e.g. "2023-01-05") is not mistaken for
    // an uppercase heading.
    const isAllUppercase = text === text.toUpperCase() && text !== text.toLowerCase();
    if (text.length > 5 && text.length < 100 && isAllUppercase) {
        score += 40;
    }

    // Moderate signal: element contains a <strong> tag.
    if (el.querySelector('strong')) {
        score += 15;
    }

    // Negative signal: text is too long to be a heading.
    if (text.length > 150) {
        score -= 20;
    }

    // Negative signal: contains significant text OUTSIDE of its children,
    // indicating it's a paragraph with some bolding, not a heading.
    const directText = Array.from(el.childNodes)
        .filter(n => n.nodeType === Node.TEXT_NODE)
        .map(n => n.textContent?.trim() ?? '')
        .filter(t => t.length > 0)
        .join(' ');
    if (directText.length > 30) {
        score -= 15;
    }

    return score;
}

/**
 * Recursively builds a tree of chunk nodes from a list of sibling DOM nodes.
 *
 * This function works by iterating through the list of sibling nodes.
 * - Non-element nodes (text, comments) in the list are skipped.
 * - If it finds a "heading" (based on getHeadingScore), it creates a new chunk.
 * - It then groups all subsequent siblings under that heading until it finds
 *   another element whose heading score is the same or higher.
 * - It calls itself recursively on the collected group of child nodes.
 * - If it encounters simple content elements, it groups them into a single
 *   "content" chunk.
 * - If a content chunk exceeds MAX_CHUNK_CHAR_LENGTH, it recursively breaks it
 *   down via the child nodes of each element in the group. An element that
 *   cannot be broken down any further (the recursion finds nothing, e.g. a
 *   paragraph holding only text) is emitted as its own chunk so that its text
 *   is not lost; such a chunk may itself be longer than MAX_CHUNK_CHAR_LENGTH.
 *
 * Known limitation: when an element IS broken down, text sitting directly in it
 * next to child elements is not captured, because only elements are visited.
 *
 * @param nodes sibling nodes to process, e.g. `someElement.childNodes` or a
 * plain array of nodes.
 * @returns the chunks found at this level. Content chunks have
 * `gibId`/`text`/`tags`; heading chunks additionally have `title` and `children`.
 */
// TODO: replace `any` with SemanticChunkInfo once the commented-out type import
// at the top of this file is restored.
async function buildSemanticTree(nodes: ArrayLike<Node>): Promise<any[]> {
    const chunks: any[] = [];
    let currentContentNodes: HTMLElement[] = [];

    // Helper to process and flush the collected content nodes into a chunk.
    const flushContentChunk = async (): Promise<void> => {
        if (currentContentNodes.length === 0) { return; }

        // Take ownership of the pending group; the next group starts empty.
        const pending = currentContentNodes;
        currentContentNodes = [];

        const text = pending.map(n => n.innerText?.trim()).join('\n\n');
        if (!text.trim()) { return; }

        if (text.length <= MAX_CHUNK_CHAR_LENGTH) {
            // The chunk is a reasonable size, create it as a single unit.
            chunks.push({
                gibId: await hash({ s: text }),
                text: text,
                tags: pending.map(n => n.tagName.toLowerCase()),
            });
            return;
        }

        // The combined text is too long, so don't create one big chunk.
        // Instead, "zoom in" and apply the same logic to the children of each
        // node in the group.
        if (logalot) { console.log(`[semantic-chunker] Content chunk is too large (${text.length} > ${MAX_CHUNK_CHAR_LENGTH}). Recursively breaking it down.`); }
        for (const node of pending) {
            const subChunks = await buildSemanticTree(node.childNodes);
            if (subChunks.length > 0) {
                chunks.push(...subChunks);
                continue;
            }

            // Nothing below this node produced a chunk (typically because it
            // only contains text nodes, which the recursion skips). Emit the
            // node's own text rather than silently dropping it.
            const nodeText = node.innerText?.trim() ?? '';
            if (nodeText) {
                chunks.push({
                    gibId: await hash({ s: nodeText }),
                    text: nodeText,
                    tags: [node.tagName.toLowerCase()],
                });
            }
        }
    };

    for (let i = 0; i < nodes.length; i++) {
        const node = nodes[i];
        if (!node || node.nodeType !== Node.ELEMENT_NODE) { continue; } // Skip text/comment nodes

        const el = node as HTMLElement;
        const score = getHeadingScore(el);

        if (score >= HEADING_SCORE_THRESHOLD) {
            // Found a heading. First, flush any preceding content.
            await flushContentChunk();

            // This is the new heading chunk.
            const title = el.innerText.trim();
            const headingChunk: any = {
                gibId: await hash({ s: title }),
                title: title,
                text: '', // Will be populated by children
                tags: [el.tagName.toLowerCase()],
                children: [],
            };

            // Collect all subsequent nodes that belong to this new heading section.
            const childNodesForHeading: Node[] = [];
            let j = i + 1;
            for (; j < nodes.length; j++) {
                const nextNode = nodes[j];
                if (!nextNode) { continue; }
                if (nextNode.nodeType === Node.ELEMENT_NODE) {
                    const nextScore = getHeadingScore(nextNode as HTMLElement);
                    if (nextScore >= score) {
                        break; // Found another heading of same or greater importance.
                    }
                }
                childNodesForHeading.push(nextNode);
            }

            // Recursively build the tree for the collected child nodes.
            headingChunk.children = await buildSemanticTree(childNodesForHeading);

            // The parent's text is a concatenation of its children's text.
            headingChunk.text = headingChunk.children
                .map((c: any) => c.title ? `${c.title}\n${c.text}` : c.text)
                .join('\n\n');

            chunks.push(headingChunk);

            // Advance the outer loop past the nodes we just processed.
            i = j - 1;
        } else {
            // This node is not a heading.
            // If this element itself is very large, flush what we have and then
            // process it on its own. This prevents a small paragraph from being
            // grouped with a massive div that follows it.
            const elText = el.innerText?.trim();
            if (elText && elText.length > MAX_CHUNK_CHAR_LENGTH) {
                await flushContentChunk(); // Flush anything before this large element
                currentContentNodes.push(el);
                await flushContentChunk(); // Immediately process the large element (which will trigger recursion)
            } else {
                // Add it to the list of simple content nodes to be grouped.
                currentContentNodes.push(el);
            }
        }
    }

    // Flush any remaining content at the very end.
    await flushContentChunk();

    return chunks;
}

/**
 * Creates a semantic tree structure from the provided HTML element by analyzing
 * its content for "semantic headings" based on styling and structure cues,
 * not just h1-h6 tags.
 *
 * This builds a full, granular tree of the document's structure from the
 * element's child nodes.
 *
 * @param bestCandidateEl the element whose children are chunked.
 * @returns A promise that resolves to the root-level array of chunk nodes, or
 * an empty array if `bestCandidateEl` is falsy.
 */
export async function semanticChunker(bestCandidateEl: HTMLElement): Promise<any[]> {
    const lcChunker = `${lc}[semanticChunker]`;
    if (logalot) { console.log(`${lcChunker} starting...`); }

    if (!bestCandidateEl) {
        if (logalot) { console.warn(`${lcChunker} bestCandidateEl is falsy, returning empty array.`); }
        return [];
    }

    const tree = await buildSemanticTree(bestCandidateEl.childNodes);

    if (logalot) {
        console.log(`${lcChunker} complete. Built tree:`);
        console.dir(tree);
    }

    return tree;
}