import { extractErrorMsg, pretty } from '@ibgib/helper-gib/dist/helpers/utils-helper.mjs';

import { GLOBAL_LOG_A_LOT } from "../../constants.mjs";
const logalot = GLOBAL_LOG_A_LOT;

import type { AnalysisResult, ConstructRule, TermFrequencyMap } from './types.mjs';
import type { CorpusAnalyzer } from './corpus-analyzer.mjs';
import { DEFAULT_TOKEN_CONSTRUCT_RULE } from './analysis-engine-constants.mjs';

/**
 * Internal representation of a compiled rule, separating the pattern
 * from the flags it requires.
 */
export interface CompiledInfo {
    /** Regex source text produced by compiling a rule (no delimiters, no outer capture group). */
    pattern: string;
    /** Regex flag characters collected from the rule and every rule it references. */
    flags: string;
}

/**
 * for this early dev implementation, this is hard-coded. But we absolutely have
 * to get this hooked up with ibgibs in code.
 */
const DEFAULT_ENGINE_CONSTRUCT_RULES: ConstructRule[] = [
    DEFAULT_TOKEN_CONSTRUCT_RULE,
];

/**
 * heart of the text analysis process that creates a slim composable DSL on top
 * of regex in order to extract information, e.g., TF-IDF, out of text.
 *
 * note that this started as a normal TF-IDF engine, but then the "term" was
 * abstracted to "construct" so that not only simple words but, e.g.,
 * grammatical constructs can be extracted and analyzed.
 *
 * ## usage
 *
 * This is consumed in {@link CorpusAnalyzer}
 *
 * ## rule pattern DSL (as implemented by the compiler in this class)
 *
 * * `REGEX(/body/flags)` is replaced by `body`; each flag character is
 *   collected for the final regex.
 * * `[CONSTRUCT:name]` is replaced by the compiled pattern of the rule called
 *   `name`, wrapped in a non-capturing group. The child's flags are merged in.
 * * `<< ... >>` marks the part of the pattern that is kept. It is required.
 * * any remaining run of whitespace in the kept part is turned into `\s+`.
 *
 * ## broad idea with ibgib integration
 *
 * we want to build a text analyzing engine, but not one that has to be extended
 * via code. IOW, we want to avoid requiring code writing and recompilation.
 * Ibgib itself is excellent for "version control", and eventually it will be
 * used to version src code as a shim to how software is created nowadays (it
 * basically is a more generalized, more powerful version of git). But right
 * _now_ we can easily "version control" semantic text entities, and as it
 * happens, the {@link ConstructRule} that underpins this engine is entirely
 * text-based. This is not a coincidence.
 *
 * So we should be able to not only create these patterns on the fly at runtime,
 * but we should also be able to expose this pattern creation and management to
 * agents, who can then manage them at runtime and create "tests" that they
 * expect to see, depending on how much you want the agent to process things.
 *
 * The purpose of this is to add analyses to a learner's text responses and
 * speech, to clarify the learner's progression in a language - or any
 * text-encodable skill really, since all skills require tactics, strategies,
 * etc. (domain jargon), which can be codified.
 *
 * So an agent should ultimately be able to:
 *
 * * analyze the learner's...
 *   * minigames AND
 *   * entire chat history that the agent has access to, i.e., the agent's
 *     context window
 * * create new ConstructRule entries to broaden the analysis as needed
 * * edit ConstructRule entries as needed, depending on how well they perform in
 *   reports according to the agent's opinion
 *
 * Using ibgib's protocol for the storage of these ConstructRule entries means
 * that we should be able to:
 *
 * * share the ibgibs reusing the ibgib protocol plumbing
 * * rel8 rules to create ibgib-stored rulesets
 * * fork/mut8 rules and rulesets
 * * basically do all the cool stuff that we can do with other ibgibs
 */
export class AnalysisEngine {
    private lc: string = `[${AnalysisEngine.name}]`;

    /** Rules keyed by name for direct lookup. Populated once in the constructor. */
    private readonly rules = new Map<string, ConstructRule>();

    /** A cache for compiled info to avoid re-compiling the same rule repeatedly. */
    private readonly compiledInfoCache = new Map<string, CompiledInfo>();

    /** Name of the rule whose matches drive `tokenCount` / `uniqueTokenCount`. */
    public static readonly PRIMORDIAL_TOKEN_CONSTRUCT_NAME = 'token';

    /**
     * Engine built from the hard-coded default rules.
     *
     * Must stay declared after {@link PRIMORDIAL_TOKEN_CONSTRUCT_NAME}: static
     * fields initialize in order and the constructor reads that constant.
     */
    public static readonly DEFAULT = new AnalysisEngine(DEFAULT_ENGINE_CONSTRUCT_RULES);

    /**
     * @param rules construct rules to register. If two rules share a name, the
     * later one wins.
     * @throws Error if no rule named
     * {@link AnalysisEngine.PRIMORDIAL_TOKEN_CONSTRUCT_NAME} is provided.
     */
    constructor(rules: ConstructRule[]) {
        const lc = `${this.lc}[ctor]`;
        try {
            if (logalot) { console.log(`${lc} starting... (I: 08cbaa69ce78baea98ac5f68b2cc6825)`); }

            for (const rule of rules) {
                this.rules.set(rule.name, rule);
            }

            if (!this.rules.has(AnalysisEngine.PRIMORDIAL_TOKEN_CONSTRUCT_NAME)) {
                throw new Error(`A construct rule named "${AnalysisEngine.PRIMORDIAL_TOKEN_CONSTRUCT_NAME}" must be provided.`);
            }
        } catch (error) {
            console.error(`${lc} ${extractErrorMsg(error)}`);
            throw error;
        } finally {
            if (logalot) { console.log(`${lc} complete.`); }
        }
    }

    /**
     * The core recursive compiler. It takes a rule name and returns its
     * compiled pattern and the set of flags it requires.
     *
     * @param name name of the rule to compile.
     * @param visited names already on the current resolution path; used only
     * to detect circular `[CONSTRUCT:...]` references.
     * @returns the compiled pattern and merged flags. Successful results are
     * cached by rule name; failures are not cached.
     * @throws Error if the rule (or a referenced rule) does not exist, if the
     * references form a cycle, or if a rule lacks a `<< ... >>` section.
     */
    private getCompiledInfo({
        name,
        visited = new Set<string>(),
    }: {
        name: string,
        visited?: Set<string> | undefined,
    }): CompiledInfo {
        const lc = `${this.lc}[${this.getCompiledInfo.name}]`;
        try {
            if (logalot) { console.log(`${lc} starting... (I: 2ecd741a862ed095ed01a0f874168625)`); }

            // Return from cache if this rule has already been compiled.
            const cached = this.compiledInfoCache.get(name);
            if (cached) { return cached; }

            if (visited.has(name)) {
                throw new Error(`Circular dependency detected involving: ${name}`);
            }
            visited.add(name);

            const rule = this.rules.get(name);
            if (!rule) {
                throw new Error(`Rule not found: ${name}`);
            }

            let pattern = rule.pattern;
            const flagsSet = new Set<string>();

            // STEP 1: Process REGEX(...) definitions first.
            // This extracts the raw regex and collects any specified flags (like 'i').
            pattern = pattern.replace(
                /REGEX\(\/(.*?)\/([a-z]*)\)/g,
                (_match: string, regexBody: string, regexFlags: string) => {
                    for (const flag of regexFlags) {
                        flagsSet.add(flag);
                    }
                    return regexBody; // Return just the content of the regex
                }
            );

            // STEP 2: Process [CONSTRUCT:...] definitions.
            // This recursively compiles child rules and merges their flags.
            pattern = pattern.replace(
                /\[CONSTRUCT:([\w-]+)\]/g,
                (_match: string, subName: string) => {
                    // Each child gets its own copy of the path so that sibling
                    // references to the same rule are not reported as cycles.
                    const childInfo = this.getCompiledInfo({
                        name: subName,
                        visited: new Set(visited),
                    });
                    for (const flag of childInfo.flags) {
                        flagsSet.add(flag);
                    }
                    // Use the child's pattern, wrapped in a NON-CAPTURING group `(?:...)`
                    return `(?:${childInfo.pattern})`;
                }
            );

            // STEP 3: Extract the content from the '<< ... >>' capture group.
            // This is done *after* all replacements to get the final combined pattern.
            const captured = /<<\s*(.*)\s*>>/.exec(pattern)?.[1];
            if (captured === undefined) {
                throw new Error(`Rule "${name}" must have a '<< ... >>' capture group.`);
            }

            // STEP 4: Clean up whitespace that acts as a delimiter between constructs.
            const result: CompiledInfo = {
                pattern: captured.trim().replace(/\s+/g, `\\s+`),
                flags: Array.from(flagsSet).join(''),
            };

            // Cache the result before returning.
            this.compiledInfoCache.set(name, result);
            return result;
        } catch (error) {
            console.error(`${lc} ${extractErrorMsg(error)}`);
            throw error;
        } finally {
            if (logalot) { console.log(`${lc} complete.`); }
        }
    }

    /**
     * Runs every registered rule against `text` and counts the matches.
     *
     * Each match is trimmed and lower-cased before it is counted. A rule that
     * fails to compile or run does not abort the analysis: the failure is
     * logged and that rule's entry is replaced by a single error marker.
     *
     * @param text the text to analyze.
     * @returns character count, token totals taken from the rule named
     * {@link AnalysisEngine.PRIMORDIAL_TOKEN_CONSTRUCT_NAME}, and one
     * frequency map per rule name.
     */
    public analyze({ text }: { text: string }): AnalysisResult {
        const lc = `${this.lc}[${this.analyze.name}]`;
        try {
            if (logalot) { console.log(`${lc} starting... (I: 9d9158bc8e18fa22d9ffa8c8be78f825)`); }

            // Tally in Maps rather than plain objects: a lookup such as
            // `{}["constructor"]` returns an inherited member, which would
            // turn the count for that term into a garbage string.
            const mapsByRuleName = new Map<string, TermFrequencyMap>();

            for (const name of this.rules.keys()) {
                try {
                    const compiledInfo = this.getCompiledInfo({ name });

                    // Combine collected flags with the required defaults ('gdu').
                    const finalFlags = Array.from(new Set('gdu' + compiledInfo.flags)).join('');

                    // The final regex is the compiled pattern wrapped in its own CAPTURING group.
                    const finalRegex = new RegExp(`(${compiledInfo.pattern})`, finalFlags);

                    const tally = new Map<string, number>();
                    for (const match of text.matchAll(finalRegex)) {
                        // match[1] will be the content of our single, outer capture group.
                        const term = match[1]?.trim();
                        if (!term) { continue; }
                        const normalizedTerm = term.toLowerCase();
                        tally.set(normalizedTerm, (tally.get(normalizedTerm) ?? 0) + 1);
                    }

                    // Object.fromEntries defines own data properties, so even
                    // keys like "__proto__" are stored as ordinary entries.
                    mapsByRuleName.set(name, Object.fromEntries(tally));
                } catch (error: unknown) {
                    // If a rule fails to compile or run, log the error and mark it in the results.
                    console.error(`${lc} Error processing rule "${name}": ${extractErrorMsg(error)}`);
                    mapsByRuleName.set(name, { 'ERROR: See console for details': 1 });
                }
            }

            const constructs: { [key: string]: TermFrequencyMap } = Object.fromEntries(mapsByRuleName);

            const tokenFrequencyMap: TermFrequencyMap =
                mapsByRuleName.get(AnalysisEngine.PRIMORDIAL_TOKEN_CONSTRUCT_NAME) ?? {};
            const tokenCount = Object.values(tokenFrequencyMap).reduce((sum, count) => sum + count, 0);
            const uniqueTokenCount = Object.keys(tokenFrequencyMap).length;

            return {
                characterCount: text.length,
                tokenCount,
                uniqueTokenCount,
                constructs,
            };
        } catch (error) {
            console.error(`${lc} ${extractErrorMsg(error)}`);
            throw error;
        } finally {
            if (logalot) { console.log(`${lc} complete.`); }
        }
    }
}