/**
 * Author: Charuka Rathnayaka
 * Email: CharukaR@99x.io
 **/
import { AbstractDataExtractor } from './AbstractDataExtractor.js';
import { GemmaExtractorPromptTemplate } from '../promptTemplate/GemmaExtractorPromptTemplate.js';
import { AbstractPromptTemplate } from '../promptTemplate/AbstractPromptTemplate.js';
import { Utility } from '../utils/Utility.js';
import { BasicLLMInferenceEngine } from '@optimaxer/web-core';

/**
 * Data extractor that prompts a language model (through a
 * `BasicLLMInferenceEngine`) with a `GemmaExtractorPromptTemplate` and parses
 * the model's answer with `Utility.extractJsonFromLlmResponse`.
 */
export class SLMDataExtractor extends AbstractDataExtractor {
    /**
     * extractData
     * Extracts data from the given text in a single inference call. Every
     * key of `schema` is passed to the prompt template as a required field.
     * @param text The text to extract data from.
     * @param schema The schema to extract data based on.
     * @param llmInstance The inference engine used to run the formatted prompt.
     * @returns Promise<{ [key: string]: any }> - Whatever
     * `Utility.extractJsonFromLlmResponse` produces from the model response.
     */
    async extractData(text: string, schema: { [key: string]: any } = {}, llmInstance: BasicLLMInferenceEngine): Promise<{ [key: string]: any }> {
        const rawResponse = await this.generateRawExtraction(text, schema, Object.keys(schema), llmInstance);
        return Utility.extractJsonFromLlmResponse(rawResponse);
    }

    /**
     * extractDataForComplexObject
     * Extracts data for a nested schema. The schema is split with
     * `Utility.convertNestedJsonToArrayOfSimpleJson`, each non-empty part is
     * extracted with its own inference call (one after another, in order),
     * and the partial results are recombined with `Utility.rebuildNestedJson`.
     * @param text The text to extract data from.
     * @param schema The schema to extract data based on.
     * @param llmInstance The inference engine used to run each formatted prompt.
     * @returns Promise<{ [key: string]: any }> - The extracted data
     **/
    async extractDataForComplexObject(text: string, schema: { [key: string]: any } = {}, llmInstance: BasicLLMInferenceEngine): Promise<{ [key: string]: any }> {
        console.log("schema:", schema);
        const arrayOfSchema = Utility.convertNestedJsonToArrayOfSimpleJson(schema);
        console.log("arrayOfSchema:", arrayOfSchema);

        const arrayExtractions: Array<{ [key: string]: any }> = [];
        for (const schemaPart of arrayOfSchema) {
            console.log("arrayOfSchema[i]:", schemaPart);

            // An empty part yields an empty result without calling the model.
            if (Object.keys(schemaPart).length === 0) {
                arrayExtractions.push({});
                continue;
            }

            // "does_keys_aggregated" is a marker key, not a field to extract:
            // it is dropped from the required fields and its presence is
            // recorded on the result instead.
            // NOTE(review): the marker is still part of the schema object
            // handed to the prompt template - confirm that is intended.
            const partKeys = Object.keys(schemaPart);
            const doesKeysAggregated = partKeys.includes("does_keys_aggregated");
            const requiredFields = partKeys.filter((key) => key !== "does_keys_aggregated");

            const rawResponse = await this.generateRawExtraction(text, schemaPart, requiredFields, llmInstance);
            console.log("Inference", rawResponse);

            const parsed = Utility.extractJsonFromLlmResponse(rawResponse);
            const replacedJson = this.replaceDataValues(parsed, schemaPart);
            console.log("replacedJson:", replacedJson);

            arrayExtractions.push({ ...replacedJson, does_keys_aggregated: doesKeysAggregated });
        }
        console.log("arrayExtractions:", arrayExtractions);

        const compositeJson = Utility.rebuildNestedJson(arrayExtractions);
        console.log("compositeJson:", compositeJson);
        return compositeJson;
    }

    /**
     * replaceDataValues
     * For every schema key whose value is an array, replaces the extracted
     * value in `data` with the result of `Utility.getClosestMatch` against
     * that array. `data` is modified in place and the same object is returned.
     * Keys whose extracted value is falsy are left untouched; a schema key
     * with an empty array only logs a message.
     * @param data The extracted data to adjust.
     * @param schema The schema that was used for the extraction.
     * @returns The same `data` object, after replacement.
     */
    replaceDataValues(data: { [key: string]: any }, schema: { [key: string]: any }): { [key: string]: any } {
        const keysWithGivenValues = Object.keys(schema).filter((key) => Array.isArray(schema[key]));

        for (const key of keysWithGivenValues) {
            // NOTE(review): truthiness check - extracted values such as 0, ''
            // or false skip the replacement. Confirm this is the intent.
            if (!data[key]) {
                continue;
            }
            if (schema[key].length > 0) {
                data[key] = Utility.getClosestMatch(data[key], schema[key]);
            } else {
                console.log("Error: No values found for the key", key);
            }
        }
        return data;
    }

    /**
     * Formats the extraction prompt for `text` and `schema` with a new
     * `GemmaExtractorPromptTemplate` and returns the model's raw response.
     * @param text The text to extract data from.
     * @param schema The schema passed to the prompt template.
     * @param requiredFields The field names passed to the prompt template.
     * @param llmInstance The inference engine used to run the prompt.
     * @returns The unparsed string returned by `llmInstance.generateInference`.
     */
    private async generateRawExtraction(text: string, schema: { [key: string]: any }, requiredFields: string[], llmInstance: BasicLLMInferenceEngine): Promise<string> {
        const promptTemplate: AbstractPromptTemplate = new GemmaExtractorPromptTemplate();
        const formattedPrompt = promptTemplate.formatPrompt({ userText: text, schema: schema, requiredFields: requiredFields });
        return llmInstance.generateInference(formattedPrompt);
    }
}