import prettier from 'prettier' import { v4 as uuidv4 } from 'uuid' import { Sender } from '../sender' import { Config, Meta, DefaultDocument } from '../types' import { Page } from 'puppeteer' export default class DefaultScraper { sender: Sender settings: Config['meilisearch_settings'] constructor(sender: Sender, config: Config) { console.info('DefaultScraper::constructor') this.sender = sender this.settings = config.meilisearch_settings || { searchableAttributes: [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'title', 'meta.description', ], filterableAttributes: ['urls_tags'], distinctAttribute: 'url', } void this.sender.updateSettings(this.settings) } async get(url: string, page: Page) { const title = await page.title() //get the meta of the page const meta = await this._extract_metadata_from_page(page) //for each page create dataset of consecutive h1, h2, h3, p. at each header after a paragraph, create a new dataset let data: DefaultDocument = {} as DefaultDocument let elems = await page.$$( 'main h1, main h2, main h3, main h4, main h5, main h6, main p, main td, main li, main span' ) if (elems.length === 0) { elems = await page.$$('h1, h2, h3, h4, h5, h6, p, td, li, span') } let page_block = 0 for (let i = 0; i < elems.length; i++) { const elem = elems[i] const tag = await elem.evaluate((el) => el.tagName) let text = (await elem.evaluate((el) => el.textContent)) || '' text = this._clean_text(text) data.uid = uuidv4() data.url = url data.title = title data.meta = meta data.image_url = this._get_image_url_from_meta(meta) data.page_block = page_block const urls_tags = new URL(url).pathname.split('/') data.urls_tags = urls_tags.slice(1, urls_tags.length - 1) const id = await elem.evaluate((el) => el.id) if (tag === 'H1') { if (data['h1']) { await this._add_data(data) page_block++ data = {} as DefaultDocument } data['h1'] = text data.anchor = '#' + id } else if (tag === 'H2') { if (data['h2']) { await this._add_data(data) page_block++ data = { h1: data['h1'] } as DefaultDocument } data.anchor = '#' + id data['h2'] = text } else if (tag === 'H3') { if (data['h3']) { await this._add_data(data) page_block++ data = { h1: data['h1'], h2: data['h2'] } as DefaultDocument } data.anchor = '#' + id data['h3'] = text } else if (tag === 'H4') { if (data['h4']) { await this._add_data(data) page_block++ data = { h1: data['h1'], h2: data['h2'], h3: data['h3'], } as DefaultDocument } data.anchor = '#' + id data['h4'] = text } else if (tag === 'H5') { if (data['h5']) { await this._add_data(data) page_block++ data = { h1: data['h1'], h2: data['h2'], h3: data['h3'], h4: data['h4'], } as DefaultDocument } data.anchor = '#' + id data['h5'] = text } else if (tag === 'H6') { if (data['h6']) { await this._add_data(data) page_block++ data = { h1: data['h1'], h2: data['h2'], h3: data['h3'], h4: data['h4'], h5: data['h5'], } as DefaultDocument } data.anchor = '#' + id data['h6'] = text } else if ( tag === 'P' || tag === 'TD' || tag === 'LI' || tag === 'SPAN' ) { if (!data['p']) { data['p'] = [] } // TODO: should we leave `null` values in the `p` array? if (text && Array.isArray(data['p']) && !data['p'].includes(text)) { data['p'].push(text) } } if (i === elems.length - 1) { await this._add_data(data) } } } async _add_data(data: DefaultDocument) { if (Array.isArray(data['p'])) { data['p'] = data['p'].join('\n') } await this.sender.add(data) } // Remove from a text all multiple spaces, new lines, and leading and trailing spaces, and // remove '# ' from the beginning of the text _clean_text(text: string) { text = text.replace(/[\r\n]+/gm, ' ') ///remove multiple spaces text = text.replace(/\s+/g, ' ') ///remove '# ' text = text.replace('# ', '') /// Trim leading and trailing spaces text = text.replace(/^\s+|\s+$/g, '') return text } // Extract the meta of a page async _extract_metadata_from_page(page: Page) { return await page.evaluate(() => { const metas = document.getElementsByTagName('meta') const meta: Meta = {} as Meta for (let i = 0; i < metas.length; i++) { const name = metas[i].getAttribute('name') const content = metas[i].getAttribute('content') if (name && content) { meta[name] = content } } return meta }) } // Extract the image url from the meta of a page _get_image_url_from_meta(meta: Meta) { if (meta['og:image']) { return meta['og:image'] } else if (meta['twitter:image']) { return meta['twitter:image'] } else if (meta['image']) { return meta['image'] } return } // A function that retro-engineer the hljs generated html to extract the code async _extract_code_from_page(page: Page) { const code = await page.evaluate(() => { let code = '' const pre = document.getElementsByTagName('pre') for (let i = 0; i < pre.length; i++) { const code_elem = pre[i].getElementsByTagName('code') if (code_elem.length > 0) { code += code_elem[0].innerText } } return code }) return this._format_code(code) } // A function that use prettier to format the code that has been extracted in a html page. // Format only if the language is supported by prettier _format_code(code: string) { let formatted_code = '' try { formatted_code = prettier.format(code, { parser: 'babel', }) } catch (e) { console.log('Error while formatting code', e) return code } return formatted_code } }