/*
 * Copyright (c) Minh Loi.
 *
 * This file is part of Ulangi which is released under GPL v3.0.
 * See LICENSE or go to https://www.gnu.org/licenses/gpl-3.0.txt
 */

import { assertExists } from '@ulangi/assert';
import * as _ from 'lodash';

import { WordClassConverter } from '../converters/WordClassConverter';
import { DictionaryDefinition } from '../interfaces/DictionaryDefinition';
import { DictionaryEntry } from '../interfaces/DictionaryEntry';
import { WiktionaryDefinition } from '../interfaces/WiktionaryDefinition';
import { WiktionaryDefinitions } from '../interfaces/WiktionaryDefinitions';
import { WiktionaryEtymology } from '../interfaces/WiktionaryEtymology';
import { WiktionaryExample } from '../interfaces/WiktionaryExample';
import { WiktionaryGlyphOrigin } from '../interfaces/WiktionaryGlyphOrigin';
import { WiktionaryLanguage } from '../interfaces/WiktionaryLanguage';
import { WiktionaryPronunciation } from '../interfaces/WiktionaryPronunciation';
import { WiktionaryQuotation } from '../interfaces/WiktionaryQuotation';
import { WiktionaryWordClass } from '../interfaces/WiktionaryWordClass';

/** Any node that can appear below a language section of a parsed page. */
type WiktionarySection =
  | WiktionaryEtymology
  | WiktionaryGlyphOrigin
  | WiktionaryPronunciation
  | WiktionaryDefinitions
  | WiktionaryDefinition
  | WiktionaryWordClass
  | WiktionaryQuotation
  | WiktionaryExample;

/**
 * Converts one language section of a parsed Wiktionary page into a
 * `DictionaryEntry` by scanning the section tree with regular expressions.
 *
 * Every `extract*` method that returns `undefined | string[]` returns the
 * de-duplicated values it found, or `undefined` when it found none.
 */
export class WiktionaryPageConverter {
  private readonly wordClassConverter = new WordClassConverter();

  /**
   * Builds the dictionary entry for `pageTitle` from `pageLanguage`.
   *
   * @param pageTitle - Used verbatim as the entry's `vocabularyTerm`.
   * @param pageLanguage - The language section to extract every field from.
   * @returns The entry; `tags` is always empty and `sources` is always
   *   `['wiktionary']`.
   */
  public convertToDictionaryEntry(
    pageTitle: string,
    pageLanguage: WiktionaryLanguage
  ): DictionaryEntry {
    return {
      vocabularyTerm: pageTitle,
      definitions: this.extractDefinitions(pageLanguage.children),
      categories: this.extractCategories(pageLanguage),
      tags: [],
      ipa: this.extractIPA(pageLanguage),
      gender: this.extractGender(pageLanguage),
      feminine: this.extractFeminine(pageLanguage),
      masculine: this.extractMasculine(pageLanguage),
      plural: this.extractPlural(pageLanguage),
      pinyin: this.extractPinyin(pageLanguage),
      zhuyin: this.extractZhuyin(pageLanguage),
      simplified: this.extractSimplied(pageLanguage),
      traditional: this.extractTraditional(pageLanguage),
      hiragana: this.extractHiragana(pageLanguage),
      reading: this.extractReading(pageLanguage),
      romaji: this.extractRomaji(pageLanguage),
      romanization: this.extractRomanization(pageLanguage),
      sources: ['wiktionary'],
    };
  }

  /**
   * Collects IPA transcriptions from the lines of pronunciation sections
   * that contain `IPA(key): `. The `IPA(key): ` label is stripped; an
   * optional parenthesised prefix directly before it is kept.
   */
  public extractIPA(pageLanguage: WiktionaryLanguage): undefined | string[] {
    const ipa: string[] = [];
    const reg = /((?:\([^\)]*\))?\s?IPA\(key\): .*)/;

    for (const section of this.collectSections(pageLanguage)) {
      if (section.kind !== 'pronunciation') {
        continue;
      }

      for (const line of section.pronunciation.split('\n')) {
        const matches = reg.exec(line);
        if (matches !== null) {
          ipa.push(
            matches
              // Drop the full-match entry; only captured groups are used.
              .slice(1)
              .filter((match): boolean => !_.isEmpty(match))
              .map((match): string => match.replace(/IPA\(key\):\s/g, ''))
              .map(_.trim)
              .join(' ')
          );
        }
      }
    }

    return this.uniqOrUndefined(ipa);
  }

  /**
   * Collects plural forms from word-class headword lines. A leading
   * `masculine plural` / `feminine plural` label is rewritten to
   * `(masculine)` / `(feminine)`; a bare `plural` label is removed.
   */
  public extractPlural(pageLanguage: WiktionaryLanguage): undefined | string[] {
    const plural: string[] = [];
    const reg = /((?:feminine|masculine)?\s?plural [^\),]*)/g;

    for (const section of this.collectSections(pageLanguage)) {
      if (
        section.kind !== 'wordClass' ||
        typeof section.headword === 'undefined'
      ) {
        continue;
      }

      for (const headword of section.headword.split('\n')) {
        // Do not pre-filter with `reg.test()`: on a global regex `test()`
        // carries `lastIndex` over to the next string, which made later
        // headword lines be skipped. `String#match` starts from index 0 for
        // global regexes and simply returns null when nothing matches.
        const matches = headword.match(reg);
        if (matches !== null) {
          plural.push(
            ...matches
              .filter((match): boolean => !_.isEmpty(match))
              .map(
                (match): string =>
                  match
                    .replace('masculine plural', '(masculine)')
                    .replace('feminine plural', '(feminine)')
                    .replace('plural', '')
              )
              .map(_.trim)
          );
        }
      }
    }

    return this.uniqOrUndefined(plural);
  }

  /**
   * Collects feminine forms from word-class headword lines, i.e. the text
   * following `female`, `feminine`, `feminine singular` or `Feminine:`.
   */
  public extractFeminine(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    const feminine: string[] = [];
    // Only extract feminine singular for now (not feminine plural)
    const reg = /(?:female|feminine(?! plural)(?: singular)?|Feminine:) ([^\),]*)/;

    for (const section of this.collectSections(pageLanguage)) {
      if (
        section.kind !== 'wordClass' ||
        typeof section.headword === 'undefined'
      ) {
        continue;
      }

      for (const headword of section.headword.split('\n')) {
        const matches = reg.exec(headword);
        if (matches !== null) {
          feminine.push(matches[1]);
        }
      }
    }

    return this.uniqOrUndefined(feminine);
  }

  /**
   * Collects masculine forms from two places: word-class headword lines
   * (text following `male`, `masculine`, `masculine singular` or
   * `Masculine:`), and definition meanings of the shape
   * `Feminine singular of adjective X` / `Female equivalent of X`, where `X`
   * is taken as the masculine form.
   */
  public extractMasculine(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    const masculine: string[] = [];
    // only extract masculine singular for now
    const headwordReg = /(?:male|masculine(?! plural)(?: singular)?|Masculine:) ([^\),]*)/;
    const feminineOfReg = /Feminine singular of adjective ([^\.]*)/;
    const femaleEquivalentReg = /(?:Female|female) equivalent of ([^\.]*)/;

    for (const section of this.collectSections(pageLanguage)) {
      if (section.kind === 'wordClass') {
        if (typeof section.headword !== 'undefined') {
          for (const headword of section.headword.split('\n')) {
            const matches = headwordReg.exec(headword);
            if (matches !== null) {
              masculine.push(matches[1]);
            }
          }
        }
      } else if (section.kind === 'definition') {
        const matches =
          feminineOfReg.exec(section.meaning) ||
          femaleEquivalentReg.exec(section.meaning);
        if (matches !== null) {
          masculine.push(matches[1]);
        }
      }
    }

    return this.uniqOrUndefined(masculine);
  }

  /**
   * Derives grammatical gender from word-class headword lines. A standalone
   * `f`, `m` or `n` token maps to `feminine`, `masculine` or `neutral`.
   * While no gender has been found yet, a headword line mentioning
   * `feminine singular ...` is taken to mean the term itself is masculine.
   */
  public extractGender(pageLanguage: WiktionaryLanguage): undefined | string[] {
    const gender: string[] = [];
    const markerReg = /\s(f|m|n)(?:\s|$)/;
    const feminineSingularReg = /feminine singular ([^\),]*)/;

    for (const section of this.collectSections(pageLanguage)) {
      if (
        section.kind !== 'wordClass' ||
        typeof section.headword === 'undefined'
      ) {
        continue;
      }

      const headwords = section.headword.split('\n');

      for (const headword of headwords) {
        const matches = markerReg.exec(headword);
        if (matches !== null) {
          if (matches[1] === 'f') {
            gender.push('feminine');
          } else if (matches[1] === 'm') {
            gender.push('masculine');
          } else {
            gender.push('neutral');
          }
        }
      }

      if (gender.length === 0) {
        // If contains feminine singular, then the term is masculine
        // See: https://en.wiktionary.org/wiki/nervioso#Spanish
        for (const headword of headwords) {
          if (feminineSingularReg.test(headword)) {
            gender.push('masculine');
          }
        }
      }
    }

    return this.uniqOrUndefined(gender);
  }

  /** Collects the comma-separated values of `(Pinyin): ` pronunciation lines. */
  public extractPinyin(pageLanguage: WiktionaryLanguage): undefined | string[] {
    return this.extractLabeledPronunciations(
      pageLanguage,
      /\(Pinyin\): ([^\r\n]*)/
    );
  }

  /** Collects the comma-separated values of `(Zhuyin): ` pronunciation lines. */
  public extractZhuyin(pageLanguage: WiktionaryLanguage): undefined | string[] {
    return this.extractLabeledPronunciations(
      pageLanguage,
      /\(Zhuyin\): ([^\r\n]*)/
    );
  }

  /**
   * Collects the simplified form announced by a `simp.` marker. When
   * `pageLanguage.form` is defined only that text is searched; otherwise the
   * glyph-origin and etymology sections are searched.
   */
  public extractSimplied(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    const simplified: string[] = [];
    const reg1 = /simp\.\s\(([^\)]*)\)/;
    const reg2 = /simp\.\n\n(.*)(?:\n|$)/;

    const collectFrom = (text: string): void => {
      // reg1 takes precedence; reg2 is only tried when reg1 does not match.
      const matches = reg1.exec(text) || reg2.exec(text);
      if (matches !== null && typeof matches[1] !== 'undefined') {
        simplified.push(matches[1].trim());
      }
    };

    if (typeof pageLanguage.form !== 'undefined') {
      collectFrom(pageLanguage.form);
    } else {
      for (const section of this.collectSections(pageLanguage)) {
        if (section.kind === 'glyphOrigin') {
          collectFrom(section.glyphOrigin);
        } else if (
          section.kind === 'etymology' &&
          typeof section.etymology !== 'undefined'
        ) {
          collectFrom(section.etymology);
        }
      }
    }

    return this.uniqOrUndefined(simplified);
  }

  /**
   * Collects the traditional form named by `the simplified form of X.` or
   * `Simplified from X`. Searches `pageLanguage.form` (when defined) and, in
   * addition, the glyph-origin sections, etymology sections and the headword
   * of `definitions` sections.
   */
  public extractTraditional(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    const traditional: string[] = [];
    const reg1 = /the simplified (?:and variant )?form of ([^\.]*)\./;
    const reg2 = /Simplified from ([^\.,]*)/;

    const collectFrom = (text: string): void => {
      // reg1 takes precedence; reg2 is only tried when reg1 does not match.
      const matches = reg1.exec(text) || reg2.exec(text);
      if (matches !== null && typeof matches[1] !== 'undefined') {
        traditional.push(matches[1].trim());
      }
    };

    if (typeof pageLanguage.form !== 'undefined') {
      collectFrom(pageLanguage.form);
    }

    for (const section of this.collectSections(pageLanguage)) {
      if (section.kind === 'glyphOrigin') {
        collectFrom(section.glyphOrigin);
      } else if (
        section.kind === 'etymology' &&
        typeof section.etymology !== 'undefined'
      ) {
        collectFrom(section.etymology);
      } else if (
        section.kind === 'definitions' &&
        typeof section.headword !== 'undefined'
      ) {
        collectFrom(section.headword);
      }
    }

    return this.uniqOrUndefined(traditional);
  }

  /**
   * Collects readings from `(Tokyo)` lines of pronunciation sections. When
   * the first line of a section starts with `Kun’yomi` or `On’yomi`, that
   * line is consumed and each reading of the section is prefixed with it in
   * parentheses.
   */
  public extractReading(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    const reading: string[] = [];
    const typeReg = /(^Kun’yomi|^On’yomi)/;
    const tokyoReg = /\(Tokyo\)([^\n\)]*\))/;

    for (const section of this.collectSections(pageLanguage)) {
      if (section.kind !== 'pronunciation') {
        continue;
      }

      const pronunciations = section.pronunciation.split('\n');

      let readingType: string | undefined;
      const typeMatches = typeReg.exec(pronunciations[0]);
      if (typeMatches !== null) {
        readingType = typeMatches[0];
        // The type line itself carries no reading; drop it.
        pronunciations.shift();
      }

      for (const pronunciation of pronunciations) {
        const matches = tokyoReg.exec(pronunciation);
        if (matches !== null) {
          reading.push(
            typeof readingType !== 'undefined'
              ? `(${readingType}) ` + matches[1].trim()
              : matches[1].trim()
          );
        }
      }
    }

    return this.uniqOrUndefined(reading);
  }

  /** Collects the text following `hiragana ` in word-class headwords. */
  public extractHiragana(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    return this.extractFromHeadwords(
      pageLanguage,
      /hiragana ([^\},]*)(?:\)|,|\.)/
    );
  }

  /** Collects the text following `rōmaji ` in word-class headwords. */
  public extractRomaji(pageLanguage: WiktionaryLanguage): undefined | string[] {
    return this.extractFromHeadwords(pageLanguage, /rōmaji ([^\},]*)[\)|,]/);
  }

  /**
   * Collects the line that follows a `Revised Romanization?` line inside
   * pronunciation sections (at most one per section).
   */
  public extractRomanization(
    pageLanguage: WiktionaryLanguage
  ): undefined | string[] {
    const romanization: string[] = [];
    const reg = /Revised\sRomanization\?\n(.*)\n/;

    for (const section of this.collectSections(pageLanguage)) {
      if (section.kind === 'pronunciation') {
        const matches = reg.exec(section.pronunciation);
        if (matches !== null && typeof matches[1] !== 'undefined') {
          romanization.push(matches[1].trim());
        }
      }
    }

    return this.uniqOrUndefined(romanization);
  }

  /**
   * Recursively flattens `sections` into dictionary definitions.
   *
   * - A word-class section tags every definition found below it with the
   *   converted word class, when the converter can convert it; this
   *   replaces any `wordClasses` set deeper in the tree.
   * - A definition with nested definitions yields one entry per nested
   *   definition, with the meanings joined as `parent: child` and the
   *   parent's `source`.
   * - Example, quotation and glyph-origin sections yield nothing.
   */
  public extractDefinitions(
    sections: readonly WiktionarySection[]
  ): DictionaryDefinition[] {
    return _.flatMap(sections, (section): DictionaryDefinition[] => {
      if (section.kind === 'wordClass') {
        const nested = this.extractDefinitions(section.children);
        if (!this.wordClassConverter.canConvert(section.wordClass)) {
          return nested;
        }
        return nested.map(
          (definition): DictionaryDefinition => {
            return {
              ...definition,
              wordClasses: [this.wordClassConverter.convert(section.wordClass)],
            };
          }
        );
      }

      if (section.kind === 'definition') {
        const subDefinitions = this.extractDefinitions(section.children);
        if (subDefinitions.length === 0) {
          return [
            {
              meaning: section.meaning,
              wordClasses: [],
              source: section.source,
            },
          ];
        }
        return subDefinitions.map(
          (subDefinition): DictionaryDefinition => {
            return {
              meaning: [section.meaning, subDefinition.meaning].join(': '),
              source: section.source,
              wordClasses: [],
            };
          }
        );
      }

      if (
        section.kind !== 'example' &&
        section.kind !== 'quotation' &&
        section.kind !== 'glyphOrigin'
      ) {
        return this.extractDefinitions(section.children);
      }

      return [];
    });
  }

  /**
   * Returns the language-level categories followed by the categories of
   * every word-class section, de-duplicated.
   *
   * The result is built on a copy, so `pageLanguage.categories` is left
   * untouched (previously the caller's array was appended to in place).
   */
  public extractCategories(pageLanguage: WiktionaryLanguage): string[] {
    const categories = pageLanguage.categories.slice();

    // Stack-based walk (last pushed, first visited); this determines the
    // order in which word-class categories are appended.
    const pending: WiktionarySection[] = pageLanguage.children.slice();
    while (pending.length > 0) {
      const section = assertExists(pending.pop());

      if (section.kind === 'wordClass') {
        categories.push(...section.categories);
      }

      if (
        section.kind !== 'example' &&
        section.kind !== 'quotation' &&
        section.kind !== 'glyphOrigin'
      ) {
        pending.push(...section.children);
      }
    }

    return _.uniq(categories);
  }

  /**
   * Lists every section below `pageLanguage` in breadth-first order.
   * Example, quotation and glyph-origin sections are included themselves,
   * but their children are not visited.
   */
  private collectSections(
    pageLanguage: WiktionaryLanguage
  ): WiktionarySection[] {
    // The array doubles as the work queue: children are appended while the
    // index walks forward, which yields first-in-first-out order.
    const sections: WiktionarySection[] = pageLanguage.children.slice();

    for (let index = 0; index < sections.length; index++) {
      const section = assertExists(sections[index]);
      if (
        section.kind !== 'example' &&
        section.kind !== 'quotation' &&
        section.kind !== 'glyphOrigin'
      ) {
        sections.push(...section.children);
      }
    }

    return sections;
  }

  /**
   * Shared implementation of the Pinyin/Zhuyin extractors: for each
   * pronunciation line matching `reg`, takes capture group 1, removes its
   * first parenthesised note and splits the remainder on commas.
   *
   * @param reg - Non-global pattern whose first capture group is the value
   *   list.
   */
  private extractLabeledPronunciations(
    pageLanguage: WiktionaryLanguage,
    reg: RegExp
  ): undefined | string[] {
    const values: string[] = [];

    for (const section of this.collectSections(pageLanguage)) {
      if (section.kind !== 'pronunciation') {
        continue;
      }

      for (const line of section.pronunciation.split('\n')) {
        const matches = reg.exec(line);
        if (matches !== null && typeof matches[1] !== 'undefined') {
          values.push(
            ...matches[1]
              .replace(/\s*\([^\)]*\)*/, '')
              .split(',')
              .map(_.trim)
              .filter((value): boolean => !_.isEmpty(value))
          );
        }
      }
    }

    return this.uniqOrUndefined(values);
  }

  /**
   * Shared implementation of the Hiragana/Rōmaji extractors: matches `reg`
   * once against the whole headword of each word-class section and collects
   * the trimmed capture group 1.
   *
   * @param reg - Non-global pattern whose first capture group is the value.
   */
  private extractFromHeadwords(
    pageLanguage: WiktionaryLanguage,
    reg: RegExp
  ): undefined | string[] {
    const values: string[] = [];

    for (const section of this.collectSections(pageLanguage)) {
      if (
        section.kind === 'wordClass' &&
        typeof section.headword !== 'undefined'
      ) {
        const matches = reg.exec(section.headword);
        if (matches !== null && typeof matches[1] !== 'undefined') {
          values.push(matches[1].trim());
        }
      }
    }

    return this.uniqOrUndefined(values);
  }

  /** Returns the de-duplicated `values`, or `undefined` when it is empty. */
  private uniqOrUndefined(values: string[]): undefined | string[] {
    return values.length > 0 ? _.uniq(values) : undefined;
  }
}