import { v4 as uuidv4 } from 'uuid'
import { Sender } from '../sender'
import { Page } from 'puppeteer'
import {
  DocsSearchDocument,
  HTag,
  HierarchyLevel,
  RadioHierarchyLevel,
} from '../types'

// Heading tag -> "radio" hierarchy field. Radio fields are all reset to null
// before a heading is recorded, so only the most recent heading's radio field
// carries a value.
const RADIO_HIERARCHY_LEVELS: Record<HTag, RadioHierarchyLevel> = {
  H1: 'hierarchy_radio_lvl1',
  H2: 'hierarchy_radio_lvl2',
  H3: 'hierarchy_radio_lvl3',
  H4: 'hierarchy_radio_lvl4',
  H5: 'hierarchy_radio_lvl5',
}

// Heading tag -> cumulative hierarchy field. These fields persist until a
// heading of the same or a shallower depth replaces them.
const HIERARCHY_LEVELS: Record<HTag, HierarchyLevel> = {
  H1: 'hierarchy_lvl1',
  H2: 'hierarchy_lvl2',
  H3: 'hierarchy_lvl3',
  H4: 'hierarchy_lvl4',
  H5: 'hierarchy_lvl5',
}

// Heading tag -> value stored in the document's `level` field, which the
// `level:desc` ranking rule sorts on (shallower headings score higher).
const TAG_LEVELS: Record<HTag, number> = {
  H1: 100,
  H2: 90,
  H3: 80,
  H4: 70,
  H5: 60,
}

// Upper-case tag names whose text is accumulated into a document's `content`.
const CONTENT_TAGS: ReadonlySet<string> = new Set(['P', 'TD', 'LI', 'SPAN'])

// Selectors used by `get`: first restricted to <main>, then page-wide.
const MAIN_SELECTOR =
  'main h1, main h2, main h3, main h4, main h5, main p, main td, main li, main span'
const FALLBACK_SELECTOR = 'h1, h2, h3, h4, h5, p, td, li, span'

/**
 * Returns the first run of digits in a field name as a number
 * (e.g. `hierarchy_lvl3` -> 3), or `undefined` when the name has no digits.
 */
function hierarchyDepth(field: string): number | undefined {
  const digits = /\d+/.exec(field)?.[0]
  return digits === undefined ? undefined : parseInt(digits, 10)
}

/**
 * Walks the headings and text elements of a page and turns them into
 * `DocsSearchDocument` records that are handed to a `Sender`.
 *
 * One record is emitted per heading section: the heading chain seen so far
 * fills the `hierarchy_lvl*` fields and the text that follows the heading is
 * collected into `content`.
 */
export default class DocsearchScaper {
  sender: Sender

  /**
   * Stores the sender and pushes the index settings (distinct attribute,
   * ranking rules, searchable attributes) to it. The settings call is
   * fire-and-forget: it is not awaited.
   */
  constructor(sender: Sender) {
    console.info('DocsearchScaper::constructor')
    this.sender = sender
    void this.sender.updateSettings({
      distinctAttribute: 'url',
      rankingRules: [
        'words',
        'typo',
        'attribute',
        'proximity',
        'exactness',
        'page_rank:desc',
        'level:desc',
        'position:asc',
      ],
      searchableAttributes: [
        'hierarchy_radio_lvl1',
        'hierarchy_radio_lvl2',
        'hierarchy_radio_lvl3',
        'hierarchy_radio_lvl4',
        'hierarchy_radio_lvl5',
        'hierarchy_lvl1',
        'hierarchy_lvl2',
        'hierarchy_lvl3',
        'hierarchy_lvl4',
        'hierarchy_lvl5',
        'hierarchy_radio_lvl0',
        'hierarchy_lvl0',
        'content',
      ],
    })
  }

  /**
   * Counts the `hierarchy_lvl*` keys present on the document. Radio fields
   * (`hierarchy_radio_lvl*`) are not counted.
   */
  _amount_of_hierarchies(pageMap: DocsSearchDocument) {
    return Object.keys(pageMap).filter((key) => key.startsWith('hierarchy_lvl'))
      .length
  }

  /**
   * Type guard: true only for the heading tags this scraper indexes
   * (`H1` to `H5`), so other tags that merely start with "H" are rejected.
   */
  _is_h_tag(tag: string): tag is HTag {
    return /^H[1-5]$/.test(tag)
  }

  /**
   * Removes every key whose numeric suffix is greater than the one in
   * `currentLevel` (hierarchy_lvl5 is considered lower than hierarchy_lvl4).
   * Any key containing digits is compared, so deeper radio fields are removed
   * as well.
   *
   * The document is modified in place and returned. When `currentLevel`
   * contains no digits nothing is removed.
   */
  _remove_lower_lvl_hierarchies(
    pageMap: DocsSearchDocument,
    currentLevel: string
  ): DocsSearchDocument {
    // Parsed once instead of once per key.
    const currentDepth = hierarchyDepth(currentLevel)
    if (currentDepth === undefined) {
      return pageMap
    }

    for (const key of Object.keys(pageMap)) {
      const keyDepth = hierarchyDepth(key)
      if (keyDepth !== undefined && keyDepth > currentDepth) {
        delete pageMap[key as keyof DocsSearchDocument]
      }
    }
    return pageMap
  }

  /**
   * Returns a copy of the document with every radio hierarchy field
   * (levels 0 to 5) set to null.
   */
  _empty_radio_lvl_hierarchies(
    document: DocsSearchDocument
  ): DocsSearchDocument {
    return {
      ...document,
      hierarchy_radio_lvl0: null,
      hierarchy_radio_lvl1: null,
      hierarchy_radio_lvl2: null,
      hierarchy_radio_lvl3: null,
      hierarchy_radio_lvl4: null,
      hierarchy_radio_lvl5: null,
    }
  }

  /**
   * Returns a copy of the document with the heading text written to both the
   * cumulative and the radio hierarchy field of `tag`.
   */
  _fill_lvl_fields(
    document: DocsSearchDocument,
    tag: HTag,
    text: string
  ): DocsSearchDocument {
    return {
      ...document,
      [HIERARCHY_LEVELS[tag]]: text,
      [RADIO_HIERARCHY_LEVELS[tag]]: text,
    }
  }

  /**
   * Builds the document state that follows a heading: sets `level`, clears
   * the radio fields, drops hierarchy fields deeper than the heading, writes
   * the heading text and records the anchor (`#id`, or '' when the heading
   * has no id).
   *
   * @returns a new document; the argument is not modified.
   */
  _update_document(
    document: DocsSearchDocument,
    tag: HTag,
    text: string,
    anchor?: string
  ): DocsSearchDocument {
    // Each step below works on a fresh copy, so the in-place removal never
    // touches the caller's object.
    let updated: DocsSearchDocument = { ...document, level: TAG_LEVELS[tag] }
    updated = this._empty_radio_lvl_hierarchies(updated)
    updated = this._remove_lower_lvl_hierarchies(updated, HIERARCHY_LEVELS[tag])
    updated = this._fill_lvl_fields(updated, tag, text)
    updated['anchor'] = anchor ? `#${anchor}` : ''
    return updated
  }

  /**
   * Scrapes one page and sends one record per heading section.
   *
   * For each page, builds a dataset from consecutive h1-h5 headings and the
   * text elements (p, td, li, span) that follow them; every heading reached
   * after some content flushes the pending record and starts a new one.
   *
   * @param url  page URL; its path segments (first and last excluded) become
   *             `hierarchy_lvl0`. Must be an absolute URL (`new URL` throws
   *             otherwise).
   * @param page Puppeteer page that has already been navigated to `url`
   *             (assumed; this method does not navigate).
   */
  async get(url: string, page: Page): Promise<void> {
    // Needs to be able to provide the `main` or `article` tag.
    // TODO: create a configuration to provide the main tag in which the content is
    let elems = await page.$$(MAIN_SELECTOR)
    if (elems.length === 0) {
      elems = await page.$$(FALLBACK_SELECTOR)
    }
    if (elems.length === 0) {
      return
    }

    // URL-derived fields are the same for every element, so compute them once.
    // They survive heading updates: level 0 is never "lower" than a heading.
    const pathSegments = new URL(url).pathname.split('/')
    const topLevel = pathSegments.slice(1, pathSegments.length - 1).join(' > ')

    let document = this._empty_radio_lvl_hierarchies({} as DocsSearchDocument)
    document['hierarchy_lvl0'] = topLevel
    document['url'] = url

    for (const elem of elems) {
      // A single round trip to the browser per element.
      const { tag, rawText, anchor } = await elem.evaluate((el) => ({
        tag: el.tagName,
        rawText: el.textContent,
        anchor: el.id,
      }))
      const text = this._clean_text(rawText ?? '')

      if (this._is_h_tag(tag)) {
        // Every time a heading is found, the previous content is indexed and
        // then emptied.
        if (
          this._amount_of_hierarchies(document) > 1 &&
          document['content'] &&
          document['content'].length > 0
        ) {
          await this._send_data({ ...document, type: 'content' })
          document['content'] = []
        }
        document = this._update_document(document, tag, text, anchor)
      } else if (
        CONTENT_TAGS.has(tag) &&
        this._amount_of_hierarchies(document) > 1
      ) {
        // Text is only collected once at least one heading has been seen
        // (hierarchy_lvl0 alone does not count).
        if (!document['content']) {
          document['content'] = []
        }
        // Skip blank fragments and text already captured through a nested or
        // enclosing element matched by the selector.
        if (
          text !== '' &&
          Array.isArray(document['content']) &&
          !document['content'].includes(text)
        ) {
          document['content'].push(text)
        }
      }
    }

    // Send remaining data, tagged the same way as the records flushed above.
    if (document.content && document.content.length > 0) {
      await this._send_data({ ...document, type: 'content' })
    }
  }

  /**
   * Finalises a record and passes it to the sender: assigns a fresh `uid`,
   * appends the anchor to `url` and joins the content fragments with
   * newlines (non-array content becomes '').
   *
   * `data` is modified in place, so callers pass a copy. Failures are logged
   * and swallowed so that one bad record does not abort the crawl.
   */
  async _send_data(data: DocsSearchDocument) {
    try {
      data.uid = uuidv4()
      // A missing anchor must not turn the URL into "...undefined".
      data.url = data.url + (data.anchor ?? '')
      data['content'] = Array.isArray(data['content'])
        ? data['content'].join('\n')
        : ''
      await this.sender.add(data)
    } catch (e) {
      console.log('error', e)
    }
  }

  /**
   * Normalises scraped text: collapses every run of whitespace (including
   * new lines) to a single space, removes a leading '# ' marker and trims
   * leading and trailing spaces.
   *
   * Only a marker at the very beginning is removed; '# ' elsewhere in the
   * text (e.g. "C# is great") is left untouched.
   */
  _clean_text(text: string) {
    return text
      .replace(/\s+/g, ' ')
      // After collapsing, at most one space can precede the marker.
      .replace(/^ ?# /, '')
      .trim()
  }
}