/**
 * @license
 * Copyright 2016 Telefónica I+D
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import * as fs from 'fs';
import * as yaml from 'js-yaml';
import * as _ from 'lodash';
import { EventEmitter } from 'events';
import { Luis } from './luis-model';

/** A single token of a sentence together with its inclusive character offsets. */
interface Token {
    token: string;
    startChar: number;
    endChar: number;
}

/** An entity tag found in a sentence, written as `[entityValue:entityType]`. */
interface ExtractedEntity {
    entityValue: string;
    entityType: string;
}

/** Intents ending up with fewer utterances than this are reported as errors. */
const MIN_EXAMPLES_PER_INTENT = 3;

// The following character ranges (http://www.utf8-chartable.de/unicode-utf8-table.pl)
// are the ones understood by LUIS as part of a word. Chars not included here
// are considered as separated words by LUIS and so as independent tokens.
const WORD_CHARS =
    '0-9A-Za-z' + // Numbers and English letters
    'ªº' + // Ordinal indicators
    '\u00B5' + // Micro sign
    '\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02AF' + // Non-english latin letters (accents and others)
    '\u02B0-\u02C1' + // Modifier letters
    '\u0370-\u0374\u0376-\u0377\u037A-\u037D\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03FF' + // Greek and Coptic alphabets
    '\u0400-\u0481\u048A-\u0523' // Cyrillic alphabet
    // Leaving the remaining alphabets for another brave person
    ;

// A word is any number > 0 of WORD_CHARS
const WORD = new RegExp(`^[${WORD_CHARS}]+`);
// A non-word is any single character not in WORD_CHARS and not a space.
// The backslash is doubled on purpose: inside a template literal `\s` would
// collapse to a plain `s` before reaching the RegExp constructor.
const NON_WORD = new RegExp(`^[^\\s${WORD_CHARS}]`);

export type culture = 'en-us' | 'es-es';

/**
 * Parses YAML language-model files and builds a LUIS model out of them.
 *
 * Problems are reported through events rather than return values:
 *  - `'warning'`: emitted with a message string for non-fatal findings.
 *  - `'error'`: emitted with a message string for invalid input. Note that a
 *    Node.js `EventEmitter` throws when `'error'` is emitted without a listener.
 *
 * NOTE(review): the merged document (`doc`) is never reset, so files parsed by
 * a previous `parse()` call on the same instance remain merged in later calls.
 */
export class LanguageModelParser extends EventEmitter {
    private doc: any = {};
    public culture: culture;

    /**
     * Reads and merges the given YAML files and converts the result to a LUIS model.
     *
     * @param files Paths of the YAML files to read (as UTF-8) and merge.
     * @param culture Culture stored in the model and used to normalize utterances.
     * @param ner When truthy, schema version '2.1.0' is used, entity positions are
     *   character offsets and the entity tags are kept in the utterance text;
     *   otherwise schema '1.3.0' with token-based positions and normalized text.
     * @returns The generated LUIS model.
     */
    parse(files: string[], culture: culture, ner?: boolean): Luis.Model {
        files.forEach(file => {
            // XXX Conflicting keys not supported. Multiple files could be merged together.
            try {
                const yamlFileContents = yaml.safeLoad(fs.readFileSync(file, 'utf8'));
                // Look for repeated elements in lists for each file and warn about it.
                // Note that when merging, new duplicates could appear but those will be
                // silently removed. The goal is to only warn at a file level.
                this.warnAboutDuplicates(yamlFileContents);
                mergeDeep(this.doc, yamlFileContents);
            } catch (err) {
                const reason = err instanceof Error ? err.message : String(err);
                this.emitError(`File "${file}": ${reason}`);
            }
        });

        this.culture = culture;

        const luisModel: Luis.Model = {
            luis_schema_version: ner ? '2.1.0' : '1.3.0',
            name: 'tef',
            desc: 'Bot Model ' + new Date(),
            culture: culture,
            intents: [],
            entities: [],
            composites: [],
            bing_entities: [],
            actions: [],
            model_features: [],
            regex_features: [],
            utterances: []
        };

        // Lists are declared under keys shaped as "list.${name}"; index them by bare name.
        const replacements = new Map<string, string[]>();
        Object.keys(this.doc)
            .filter(listKey => listKey.startsWith('list.'))
            .forEach(listKey => {
                replacements.set(listKey.slice('list.${'.length, -1), this.doc[listKey]);
            });

        const usedReplacements = new Set<string>();
        const missedReplacements = new Set<string>();
        const entitiesMap = new Map<string, any>();
        const utterancesMap = new Map<string, Luis.Utterance>();

        const intentNames: string[] = this.doc.intents ? Object.keys(this.doc.intents) : [];
        intentNames.forEach(intentName => {
            if (intentName.length > 50) {
                let err = `Intent "${intentName}" should be less than 50 characters. was ${intentName.length}`;
                this.emitError(err);
            }
        });

        intentNames.forEach(intent => {
            const sentences: string[] = this.doc.intents[intent];

            sentences
                .map((sentence: string) => {
                    const match = sentence.match(/\w+\[.*\]\w+/);
                    if (match) {
                        let err = `White space missing before or after entity declaration in entry "${sentence}" -> "${match[0]}"`;
                        this.emitError(err);
                    }
                    return sentence;
                })
                .map((sentence: string) => this.searchMissedVariables(sentence, replacements, missedReplacements))
                .map((sentence: string) => this.expandVariables(sentence, replacements, usedReplacements))
                // Flatten; the explicit seed keeps an intent with an empty list from throwing.
                .reduce((all: string[], expanded: string[]) => all.concat(expanded), [])
                .forEach((sentence: string) => {
                    const utterance = this.buildUtterance(sentence, intent, ner);
                    utterance.entities.forEach(entity => this.registerEntity(entity, entitiesMap));
                    const previous = utterancesMap.get(utterance.text);
                    if (previous) {
                        let err = `Utterance "${utterance.text}" is assigned to ` +
                            `both "${previous.intent}" and "${utterance.intent}" intents`;
                        this.emitError(err);
                    }
                    // The last intent declaring a given text wins.
                    utterancesMap.set(utterance.text, utterance);
                });
        });

        const utterances = Array.from(utterancesMap.values());

        // Look for intents with too few examples
        const examplesPerIntent = _.countBy(utterances, 'intent');
        const intentsWithTooFewExamples = _.keys(
            _.pickBy(examplesPerIntent, (counter: number) => counter < MIN_EXAMPLES_PER_INTENT)
        );
        // Intents left without any utterance never show up in the counters above.
        intentNames.forEach(intent => {
            if (!_.has(examplesPerIntent, intent)) {
                intentsWithTooFewExamples.push(intent);
            }
        });
        if (intentsWithTooFewExamples.length > 0) {
            let err = `The following intents have less than ${MIN_EXAMPLES_PER_INTENT} examples:\n` +
                intentsWithTooFewExamples.map(intent => `  - ${intent}`).join('\n');
            this.emitError(err);
        }

        // Print warnings about unused lists
        if (usedReplacements.size < replacements.size) {
            replacements.forEach((values, key) => {
                if (!usedReplacements.has(key)) {
                    this.emitWarning(`The list "list.$\{${key}\}" has not been used in any sentence.`);
                }
            });
        }

        // Print warnings about sentences with list placeholders that point to a non-existent list
        if (missedReplacements.size > 0) {
            missedReplacements.forEach(value => {
                this.emitWarning(`The list "list.$\{${value}\}" is being used from some sentences but it has not been declared.`);
            });
        }

        const features = _.toPairs(this.doc.phraselist)
            .map((pair: any) => {
                const name = String(pair[0]);
                const settings = pair[1];
                const activated = settings.activated == null ? true : settings.activated;
                const mode = settings.mode == null ? true : settings.mode;
                const words = (settings.words || [])
                    .map((word: any) => {
                        const strword = ner ? '[' + String(word) + ':' + name + ']' : String(word);
                        if (strword.indexOf(',') !== -1) {
                            let err = `Phrase list "${name}" can not contain commas ('${strword}')`;
                            this.emitError(err);
                        }
                        return LanguageModelParser.tokenize(strword).join(' ');
                    })
                    .join(',');
                return { activated, mode, name, words } as Luis.ModelFeature;
            });

        const bingEntities = this.doc.builtin || [];

        luisModel.utterances = utterances;
        luisModel.entities = Array.from(entitiesMap.values());
        // Parentheses are required: a bare `{ name: intent }` would be parsed as a
        // block with a label and every intent would become `undefined`.
        luisModel.intents = intentNames.map(intent => ({ name: intent }));
        luisModel.model_features = features;
        luisModel.bing_entities = bingEntities;

        return luisModel;
    }

    /**
     * Records in `missedVariables` every `${name}` placeholder of the sentence
     * that has no entry in `variables`.
     *
     * @returns The sentence, untouched, so the call can be used inside a `map` chain.
     */
    private searchMissedVariables(sentence: string, variables: Map<string, string[]>, missedVariables: Set<string>): string {
        // Global flag + exec loop: a sentence may contain more than one placeholder.
        const placeholder = /\$\{(.+?)\}/g;
        let match: RegExpExecArray | null;
        while ((match = placeholder.exec(sentence)) !== null) {
            if (!variables.has(match[1])) {
                missedVariables.add(match[1]);
            }
        }
        return sentence;
    }

    /**
     * Expands the `${name}` placeholders of a sentence with the values of the
     * matching lists, producing one sentence per combination of values.
     * Every list that gets used is added to `usedVariables`.
     *
     * @returns The expanded sentences (just the input when nothing is replaced).
     */
    private expandVariables(sentence: string, variables: Map<string, string[]>, usedVariables: Set<string>): string[] {
        // Entries added to a Set while it is being iterated are visited too, which
        // is what makes sentences with several placeholders get fully expanded.
        const expandedSentences = new Set<string>([sentence]);
        expandedSentences.forEach(candidate => {
            variables.forEach((values, key) => {
                const search = '${' + key + '}';
                if (candidate.indexOf(search) === -1) {
                    return;
                }
                values.forEach(value => {
                    usedVariables.add(key);
                    // A replacer function keeps `$&`, `$1`, `$$`... in list values literal.
                    const newSentence = candidate.replace(search, () => String(value));
                    if (newSentence !== candidate) {
                        expandedSentences.add(newSentence);
                        expandedSentences.delete(candidate);
                    } else {
                        expandedSentences.add(candidate);
                    }
                });
            });
        });
        return Array.from(expandedSentences);
    }

    /**
     * Finds the entities tagged in a sentence.
     * Entities are tagged as [entityValue:entityType], ex. [Burgos:city]
     */
    private extractEntities(sentence: string): ExtractedEntity[] {
        const regexEntity = /\[(.+?):(.+?)\]/g;
        const entities: ExtractedEntity[] = [];
        let match: RegExpExecArray | null;
        while ((match = regexEntity.exec(sentence)) !== null) {
            entities.push({ entityValue: match[1], entityType: match[2] });
        }
        return entities;
    }

    /**
     * Adds the entity to `entitiesMap`, keyed by its type. A type written as
     * `parent::child` is stored under `parent` with `child` listed, once, in
     * its `children`.
     */
    private registerEntity(entity: any, entitiesMap: Map<string, any>): void {
        let entityType: string = entity.entity;
        let entitySubtype: string | undefined;

        const separatorPosition = entityType.indexOf('::');
        if (separatorPosition !== -1) {
            entitySubtype = entityType.substring(separatorPosition + '::'.length);
            entityType = entityType.substring(0, separatorPosition);
        }

        const luisEntity = entitiesMap.get(entityType) || { name: entityType };
        if (entitySubtype) {
            luisEntity.children = luisEntity.children || [];
            if (luisEntity.children.indexOf(entitySubtype) === -1) {
                luisEntity.children.push(entitySubtype);
            }
        }
        entitiesMap.set(entityType, luisEntity);
    }

    /**
     * Collapses runs of whitespace, trims and lowercases the sentence according
     * to the current culture.
     */
    private normalizeSentence(sentence: string): string {
        const collapsed = sentence
            .replace(/\s\s+/g, ' ') // replace multiple spaces with a single one
            .trim();

        switch (this.culture) {
            case 'en-us':
                // in an en-us luis app, nothing but ASCII 7 chars are lowercased
                // see test/fixtures/{en-es}-cornercases.yaml
                return collapsed.replace(/[A-Z]/g, capture => capture.toLowerCase());
            default:
                return collapsed.toLocaleLowerCase();
        }
    }

    /** Number of tokens of the sentence, as produced by `tokenize`. */
    private static wordCount(sentence: string): number {
        return LanguageModelParser.tokenize(sentence).length;
    }

    /**
     * Tokenize a sentence following the LUIS rules returning the tokens and delimiters.
     * A token is either a run of word characters or a single non-word,
     * non-space character. Offsets are relative to the original sentence.
     * TODO: Memoize this function.
     *
     * @throws Error when a chunk is neither a word nor a non-word (not expected to happen).
     */
    private static splitSentenceByTokens(sentence: string): Token[] {
        if (!sentence || sentence.trim().length === 0) {
            return [];
        }
        sentence = sentence.replace(/[\s\uFEFF\xA0]+$/g, ''); // Right trim

        const tokens: Token[] = [];

        // Walk through the sentence consuming chunks that match WORD or NON_WORD
        let sentenceIndex = 0;
        while (sentence.length) {
            // Ignore (and consume) spaces at the beginning of the remaining sentence
            const withoutLeadingSpaces = sentence.replace(/^\s+/, '');
            sentenceIndex += sentence.length - withoutLeadingSpaces.length;
            sentence = withoutLeadingSpaces;

            // Try a word and, if it is not a word, try a non-word
            const tokenRegExpRes = sentence.match(WORD) || sentence.match(NON_WORD);
            if (!tokenRegExpRes) {
                // If not word nor non-word... It should be impossible
                throw new Error(`The sentence ${sentence} cannot be classified as word or non-word`);
            }

            const token = tokenRegExpRes[0];
            tokens.push({
                token: token,
                startChar: sentenceIndex,
                endChar: sentenceIndex + token.length - 1
            });

            // Consume the recognized token
            sentenceIndex += token.length;
            sentence = sentence.slice(token.length);
        }

        return tokens;
    }

    /**
     * Tokenize a sentence following the LUIS rules and return an array of strings
     */
    private static tokenize(sentence: string): string[] {
        return LanguageModelParser.splitSentenceByTokens(sentence).map(token => token.token);
    }

    /**
     * Builds the utterance for a sentence of an intent, locating its tagged entities.
     *
     * Without `ner`, tags are replaced by their value, the text is normalized and
     * positions are token indexes. With `ner`, tags are kept in the text and
     * positions are inclusive character offsets of the whole tag.
     */
    private buildUtterance(sentence: string, intent: string, ner?: boolean): Luis.Utterance {
        const entities: any[] = [];
        let parts = '';

        const trimmedSentence = sentence
            .replace(/\s\s+/g, ' ') // replace multiple spaces with a single one
            .trim();

        trimmedSentence
            // split by entities:
            // "Santiago Bernabeu went to [Santiago Bernabeu:place]." will split in
            // [ "Santiago Bernabeu went to ", "[Santiago Bernabeu:place]", "." ]
            .split(/(\[.+?:.+?\])/g)
            .forEach(part => {
                const extractedEntities = this.extractEntities(part);
                if (!extractedEntities.length) {
                    parts += part;
                    return;
                }
                extractedEntities.forEach(entity => {
                    // In ner mode `parts` holds the raw text consumed so far, so its
                    // length is the offset of this tag even when the very same tag
                    // already appeared earlier in the sentence.
                    const startPos = ner ? parts.length : LanguageModelParser.wordCount(parts);
                    parts += ner ? part : entity.entityValue;
                    const endPos = ner ? startPos + part.length - 1 : LanguageModelParser.wordCount(parts) - 1;
                    entities.push({
                        entity: entity.entityType,
                        startPos,
                        endPos
                    });
                });
            });

        const utterance: Luis.Utterance = {
            text: ner ? parts : this.normalizeSentence(parts),
            intent,
            entities
        };

        return utterance;
    }

    /**
     * Walks the object recursively and emits a warning for every array that
     * contains repeated values, naming the path of keys leading to it.
     */
    private warnAboutDuplicates(obj: any, prefix: string[] = []) {
        const duplicates = (array: any[]) => _.uniq(_.filter(array, (v, i, col) => _.includes(col, v, i + 1)));

        _.forEach(obj, (value, key) => {
            if (value && Array.isArray(value)) {
                const duplicatedValues = duplicates(value);
                if (duplicatedValues.length) {
                    const breadcrumb = prefix.concat(key).join(' -> ');
                    this.emitWarning(`The key "${breadcrumb}" has the following duplicated values: ${duplicatedValues.join(', ')}`);
                }
            } else if (isObject(value)) {
                this.warnAboutDuplicates(value, prefix.concat(key));
            }
        });
    }

    /** Emits a `'warning'` event carrying the message. */
    private emitWarning(msg: string) {
        this.emit('warning', msg);
    }

    /** Emits an `'error'` event carrying the message. */
    private emitError(msg: string) {
        this.emit('error', msg);
    }
}

/**
 * http://stackoverflow.com/a/34749873/12388
 * Simple is object check.
 * @param item Value to check.
 * @returns `true` for non-null, non-array objects.
 */
function isObject(item: any): boolean {
    return Boolean(item) && typeof item === 'object' && !Array.isArray(item);
}

/**
 * http://stackoverflow.com/a/34749873/12388
 * Deep merge two objects. Nested objects are merged recursively, arrays are
 * concatenated dropping repeated values and any other value overwrites the
 * one in the target. Falsy source values are ignored.
 * @param target Object that receives the values (mutated in place).
 * @param source Object to read the values from.
 * @returns The target.
 * @throws Error when a key holds an object/array in the source but a value of
 *   a different kind in the target.
 */
function mergeDeep(target: any, source: any) {
    if (!isObject(target) || !isObject(source)) {
        return target;
    }

    // Only own properties count as "already in target"; inherited ones such as
    // `constructor` or `toString` must not be mistaken for previously merged data.
    const isMissing = (key: string) =>
        !Object.prototype.hasOwnProperty.call(target, key) || !target[key];

    for (const key of Object.keys(source)) {
        const incoming = source[key];
        if (key === '__proto__') {
            // Merging into `__proto__` would alter Object.prototype
            continue;
        }
        if (!incoming) {
            // Ignore empty values such as an intent key without examples
            continue;
        }

        if (isObject(incoming)) {
            if (isMissing(key)) {
                target[key] = {};
            }
            if (!isObject(target[key])) {
                throw new Error(`Property ${key} already exist in target but it is not an object`);
            }
            mergeDeep(target[key], incoming);
        } else if (Array.isArray(incoming)) {
            if (isMissing(key)) {
                target[key] = [];
            }
            if (!Array.isArray(target[key])) {
                throw new Error(`Property ${key} already exist in target but it is not an array`);
            }
            target[key] = _.uniq(target[key].concat(incoming));
        } else {
            target[key] = incoming;
        }
    }
    return target;
}