import { hash } from '@ibgib/helper-gib/dist/helpers/utils-helper.mjs';
// import { SemanticChunkInfo } from './page-analyzer-types.mjs';
// Verbose-logging switch: the console.log/console.warn/console.dir calls in
// this file are all guarded by this flag.
const logalot = true;
// Log-context prefix identifying this file; prepended to log messages.
const lc = `[semantic-chunker.mts]`;
// The threshold at which an element's score is considered a heading.
// An element is a heading when its score is greater than or equal to this value.
const HEADING_SCORE_THRESHOLD = 30;
// The maximum character length for a single chunk before it's broken down further.
// Text strictly longer than this is broken down; text of exactly this length is kept whole.
const MAX_CHUNK_CHAR_LENGTH = 2000;
/**
 * Calculates a "heading score" for a given element to determine if it should be
 * treated as a semantic heading for chunking purposes.
 *
 * ## intent
 *
 * Some pages do not use conventional headings; rather, they use some sort of
 * manual semantic headings. This is an attempt at providing a likelihood that
 * such an element is a heading.
 *
 * For example, if the following is found:
 *
 *     1. TOPIC YO: here is some things some paragraph
 *        Sub-topic here and here is some more text as a sub-topic...
 *        Sub-topic here as well and here is some more text as a sub-topic...
 *        maybe just another paragraph
 *     2. TOPIC TWO YO: here is some things some paragraph
 *        Another sub-topic here and here is some more text as a sub-topic...
 *
 * Some of these are actually headings, or parts of them include headings, and
 * this is a specific use case this function should be handling.
 *
 * ## scoring
 *
 * - `h1`..`h6`: +50, +45, +40, +35, +30, +25 respectively.
 * - `<p>` whose only element child is a `<strong>` starting with `N. `: +50.
 * - Text of 6..99 characters that is entirely uppercase AND contains at least
 *   one cased letter: +40.
 * - Element contains a `<strong>` anywhere inside it: +15.
 * - Text longer than 150 characters: -20.
 * - More than 30 characters of text directly inside the element (outside of
 *   its child elements): -15.
 *
 * @param el the element to score. A falsy element, or one without a tag name
 * or without any (trimmed) text, scores 0.
 * @returns A score where a higher number indicates a higher likelihood of being a heading.
 */
function getHeadingScore(el: HTMLElement): number {
  if (!el?.tagName) { return 0; }

  const text = el.innerText?.trim() ?? '';
  if (!text) { return 0; }

  let score = 0;
  const tagName = el.tagName.toLowerCase();

  // Assign base scores for traditional heading tags.
  switch (tagName) {
    case 'h1': score += 50; break;
    case 'h2': score += 45; break;
    case 'h3': score += 40; break;
    case 'h4': score += 35; break;
    case 'h5': score += 30; break;
    case 'h6': score += 25; break;
  }

  // Strong signal: A <p> tag that contains ONLY a <strong> tag with a numbered format.
  // This is common in legal documents or rules.
  const hasSingleStrongChild =
    el.children.length === 1 && el.children[0].tagName.toLowerCase() === 'strong';
  if (tagName === 'p' && hasSingleStrongChild) {
    const strongText = (el.children[0] as HTMLElement).innerText?.trim() ?? '';
    if (/^\d+\.\s/.test(strongText)) {
      score += 50; // Very strong signal
    }
  }

  // Another strong signal: Text is all uppercase (and of reasonable length).
  // `text === text.toUpperCase()` alone is also true for text that has no
  // cased letters at all (e.g. "12/31/2024", "------", CJK text), so we
  // additionally require that lowercasing changes the text, i.e. that at
  // least one uppercase letter is actually present.
  const isAllUppercase = text === text.toUpperCase() && text !== text.toLowerCase();
  if (text.length > 5 && text.length < 100 && isAllUppercase) {
    score += 40;
  }

  // Moderate signal: Element contains a <strong> tag.
  if (el.querySelector('strong')) {
    score += 15;
  }

  // Negative signal: Text is too long to be a heading.
  if (text.length > 150) {
    score -= 20;
  }

  // Negative signal: Contains significant text OUTSIDE of its children,
  // indicating it's a paragraph with some bolding, not a heading.
  const directText = Array.from(el.childNodes)
    .filter(n => n.nodeType === Node.TEXT_NODE && n.textContent?.trim())
    .map(n => n.textContent!.trim())
    .join(' ');
  if (directText.length > 30) {
    score -= 15;
  }

  return score;
}
/**
 * Recursively builds a tree of SemanticChunkInfo nodes from a list of DOM nodes.
 *
 * This function works by iterating through a list of sibling nodes.
 * - If it finds a "heading" (based on getHeadingScore), it creates a new chunk.
 * - It then groups all subsequent siblings under that heading until it finds another
 *   heading of the same or higher importance.
 * - It calls itself recursively on the collected group of child nodes.
 * - If it encounters simple content nodes, it groups them into a single "content" chunk.
 * - If a content chunk exceeds MAX_CHUNK_CHAR_LENGTH, it recursively breaks it down
 *   by descending into the child nodes of each element in the group.
 * - If descending into an element yields nothing (e.g. a leaf element whose
 *   children are only text nodes), that element's own text is emitted as a
 *   chunk so that content is never silently dropped. Such a chunk may itself
 *   be longer than MAX_CHUNK_CHAR_LENGTH.
 *
 * Only element nodes are considered; text and comment nodes in `nodes` are
 * skipped. NOTE(review): text that sits directly inside an oversized element
 * alongside child elements (mixed content) is therefore still not captured
 * when that element is broken down.
 *
 * @param nodes sibling nodes to process (a NodeList or any array-like of nodes).
 * @returns the chunks built from `nodes`, in document order. Heading chunks
 * have `gibId`, `title`, `text`, `tags` and `children`; content chunks have
 * `gibId`, `text` and `tags`.
 */
// async function buildSemanticTree(nodes: NodeList): Promise<SemanticChunkInfo[]> {
// const chunks: SemanticChunkInfo[] = [];
async function buildSemanticTree(nodes: ArrayLike<Node>): Promise<any[]> {
  const chunks: any[] = [];
  let currentContentNodes: HTMLElement[] = [];

  // Helper to process and flush the collected content nodes into a chunk
  const flushContentChunk = async (): Promise<void> => {
    if (currentContentNodes.length === 0) { return; }

    const text = currentContentNodes.map(n => n.innerText?.trim()).join('\n\n');
    if (text.trim()) {
      if (text.length > MAX_CHUNK_CHAR_LENGTH) {
        // If the combined text is too long, don't create one big chunk.
        // Instead, recursively call buildSemanticTree on the children of each node in the group.
        // This will "zoom in" and apply the same logic to the smaller pieces.
        if (logalot) { console.log(`[semantic-chunker] Content chunk is too large (${text.length} > ${MAX_CHUNK_CHAR_LENGTH}). Recursively breaking it down.`); }
        for (const node of currentContentNodes) {
          const subChunks = await buildSemanticTree(node.childNodes);
          if (subChunks.length > 0) {
            chunks.push(...subChunks);
            continue;
          }
          // Nothing came back from zooming in: the node has no element
          // children that carry text (typically a <p> holding only text
          // nodes). Keep its text as its own chunk instead of losing it.
          const nodeText = node.innerText?.trim() ?? '';
          if (nodeText) {
            chunks.push({
              gibId: await hash({ s: nodeText }),
              text: nodeText,
              tags: [node.tagName.toLowerCase()],
            });
          }
        }
      } else {
        // The chunk is a reasonable size, create it as a single unit.
        chunks.push({
          gibId: await hash({ s: text }),
          text: text,
          tags: currentContentNodes.map(n => n.tagName.toLowerCase()),
        });
      }
    }
    currentContentNodes = [];
  };

  for (let i = 0; i < nodes.length; i++) {
    const node = nodes[i];
    if (node.nodeType !== Node.ELEMENT_NODE) { continue; } // Skip text/comment nodes

    const el = node as HTMLElement;
    const score = getHeadingScore(el);

    if (score >= HEADING_SCORE_THRESHOLD) {
      // Found a heading. First, flush any preceding content.
      await flushContentChunk();

      // This is the new heading chunk.
      const title = el.innerText.trim();
      // const headingChunk: SemanticChunkInfo = {
      const headingChunk: any = {
        gibId: await hash({ s: title }),
        title: title,
        text: '', // Will be populated by children
        tags: [el.tagName.toLowerCase()],
        children: [],
      };

      // Collect all subsequent nodes that belong to this new heading section.
      const childNodesForHeading: Node[] = [];
      let j = i + 1;
      for (; j < nodes.length; j++) {
        const nextNode = nodes[j];
        if (nextNode.nodeType === Node.ELEMENT_NODE) {
          const nextScore = getHeadingScore(nextNode as HTMLElement);
          if (nextScore >= score) {
            break; // Found another heading of same or greater importance.
          }
        }
        childNodesForHeading.push(nextNode);
      }

      // Recursively build the tree for the collected child nodes.
      headingChunk.children = await buildSemanticTree(childNodesForHeading);

      // The parent's text is a concatenation of its children's text.
      headingChunk.text = headingChunk.children
        .map((c: any) => c.title ? `${c.title}\n${c.text}` : c.text)
        .join('\n\n');

      chunks.push(headingChunk);

      // Advance the outer loop past the nodes we just processed.
      i = j - 1;
    } else {
      // This node is not a heading.
      // If this element itself is very large, flush what we have and then process it.
      // This prevents a small paragraph from being grouped with a massive div that follows it.
      const elText = el.innerText?.trim();
      if (elText && elText.length > MAX_CHUNK_CHAR_LENGTH) {
        await flushContentChunk(); // Flush anything before this large element
        currentContentNodes.push(el);
        await flushContentChunk(); // Immediately process the large element (which will trigger recursion)
      } else {
        // Add it to the list of simple content nodes to be grouped.
        currentContentNodes.push(el);
      }
    }
  }

  // Flush any remaining content at the very end.
  await flushContentChunk();
  return chunks;
}
/**
 * Creates a semantic tree structure from the provided HTML element by analyzing
 * its content for "semantic headings" based on styling and structure cues,
 * not just h1-h6 tags.
 *
 * This builds a full, granular tree of the document's structure by running
 * buildSemanticTree over the element's direct child nodes.
 *
 * @param bestCandidateEl the element whose child nodes are chunked. If falsy,
 * an empty array is returned.
 * @returns A promise that resolves to the root-level array of SemanticChunkInfo nodes.
 */
// export async function semanticChunker(bestCandidateEl: HTMLElement): Promise<SemanticChunkInfo[]> {
export async function semanticChunker(bestCandidateEl: HTMLElement): Promise<any[]> {
  const lcChunker = `${lc}[semanticChunker]`;
  if (logalot) { console.log(`${lcChunker} starting...`); }

  // Nothing to analyze: resolve to an empty tree rather than throwing.
  if (!bestCandidateEl) {
    if (logalot) { console.warn(`${lcChunker} bestCandidateEl is falsy, returning empty array.`); }
    return [];
  }

  const tree = await buildSemanticTree(bestCandidateEl.childNodes);

  if (logalot) {
    console.log(`${lcChunker} complete. Built tree:`);
    console.dir(tree);
  }
  return tree;
}