// loaders.gl // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors import type {LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-utils'; import type {ArrayRowTable, ObjectRowTable, TableBatch} from '@loaders.gl/schema'; import { AsyncQueue, TableBatchBuilder, convertToArrayRow, convertToObjectRow } from '@loaders.gl/schema'; import Papa from './papaparse/papaparse'; import AsyncIteratorStreamer from './papaparse/async-iterator-streamer'; type ObjectField = {name: string; index: number; type: any}; type ObjectSchema = {[key: string]: ObjectField} | ObjectField[]; // __VERSION__ is injected by babel-plugin-version-inline // @ts-ignore TS2304: Cannot find name '__VERSION__'. const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest'; const DEFAULT_CSV_SHAPE = 'object-row-table'; export type CSVLoaderOptions = LoaderOptions & { csv?: { // loaders.gl options shape?: 'array-row-table' | 'object-row-table'; /** optimizes memory usage but increases parsing time. */ optimizeMemoryUsage?: boolean; columnPrefix?: string; header?: 'auto'; // CSV options (papaparse) // delimiter: auto // newline: auto quoteChar?: string; escapeChar?: string; // Convert numbers and boolean values in rows from strings dynamicTyping?: boolean; comments?: boolean; skipEmptyLines?: boolean | 'greedy'; // transform: null? delimitersToGuess?: string[]; // fastMode: auto }; }; export const CSVLoader = { dataType: null as unknown as ObjectRowTable | ArrayRowTable, batchType: null as unknown as TableBatch, id: 'csv', module: 'csv', name: 'CSV', version: VERSION, extensions: ['csv', 'tsv', 'dsv'], mimeTypes: ['text/csv', 'text/tab-separated-values', 'text/dsv'], category: 'table', parse: async (arrayBuffer: ArrayBuffer, options?: CSVLoaderOptions) => parseCSV(new TextDecoder().decode(arrayBuffer), options), parseText: (text: string, options?: CSVLoaderOptions) => parseCSV(text, options), parseInBatches: parseCSVInBatches, // @ts-ignore // testText: null, options: { csv: { shape: DEFAULT_CSV_SHAPE, // 'object-row-table' optimizeMemoryUsage: false, // CSV options header: 'auto', columnPrefix: 'column', // delimiter: auto // newline: auto quoteChar: '"', escapeChar: '"', dynamicTyping: true, comments: false, skipEmptyLines: true, // transform: null? delimitersToGuess: [',', '\t', '|', ';'] // fastMode: auto } } } as const satisfies LoaderWithParser; async function parseCSV( csvText: string, options?: CSVLoaderOptions ): Promise { // Apps can call the parse method directly, we so apply default options here const csvOptions = {...CSVLoader.options.csv, ...options?.csv}; const firstRow = readFirstRow(csvText); const header: boolean = csvOptions.header === 'auto' ? isHeaderRow(firstRow) : Boolean(csvOptions.header); const parseWithHeader = header; const papaparseConfig = { // dynamicTyping: true, ...csvOptions, header: parseWithHeader, download: false, // We handle loading, no need for papaparse to do it for us transformHeader: parseWithHeader ? duplicateColumnTransformer() : undefined, error: (e) => { throw new Error(e); } }; const result = Papa.parse(csvText, papaparseConfig); const rows = result.data as any[]; const headerRow = result.meta.fields || generateHeader(csvOptions.columnPrefix, firstRow.length); const shape = csvOptions.shape || DEFAULT_CSV_SHAPE; switch (shape) { case 'object-row-table': return { shape: 'object-row-table', data: rows.map((row) => (Array.isArray(row) ? convertToObjectRow(row, headerRow) : row)) }; case 'array-row-table': return { shape: 'array-row-table', data: rows.map((row) => (Array.isArray(row) ? row : convertToArrayRow(row, headerRow))) }; default: throw new Error(shape); } } // TODO - support batch size 0 = no batching/single batch? function parseCSVInBatches( asyncIterator: AsyncIterable | Iterable, options?: CSVLoaderOptions ): AsyncIterable { // Papaparse does not support standard batch size handling // TODO - investigate papaparse chunks mode options = {...options}; if (options.batchSize === 'auto') { options.batchSize = 4000; } // Apps can call the parse method directly, we so apply default options here const csvOptions = {...CSVLoader.options.csv, ...options?.csv}; const asyncQueue = new AsyncQueue(); let isFirstRow: boolean = true; let headerRow: string[] | null = null; let tableBatchBuilder: TableBatchBuilder | null = null; let schema: ObjectSchema | null = null; const config = { // dynamicTyping: true, // Convert numbers and boolean values in rows from strings, ...csvOptions, header: false, // Unfortunately, header detection is not automatic and does not infer shapes download: false, // We handle loading, no need for papaparse to do it for us // chunkSize is set to 5MB explicitly (same as Papaparse default) due to a bug where the // streaming parser gets stuck if skipEmptyLines and a step callback are both supplied. // See https://github.com/mholt/PapaParse/issues/465 chunkSize: 1024 * 1024 * 5, // skipEmptyLines is set to a boolean value if supplied. Greedy is set to true // skipEmptyLines is handled manually given two bugs where the streaming parser gets stuck if // both of the skipEmptyLines and step callback options are provided: // - true doesn't work unless chunkSize is set: https://github.com/mholt/PapaParse/issues/465 // - greedy doesn't work: https://github.com/mholt/PapaParse/issues/825 skipEmptyLines: false, // step is called on every row // eslint-disable-next-line complexity, max-statements step(results) { let row = results.data; if (csvOptions.skipEmptyLines) { // Manually reject lines that are empty const collapsedRow = row.flat().join('').trim(); if (collapsedRow === '') { return; } } const bytesUsed = results.meta.cursor; // Check if we need to save a header row if (isFirstRow && !headerRow) { // Auto detects or can be forced with csvOptions.header const header = csvOptions.header === 'auto' ? isHeaderRow(row) : Boolean(csvOptions.header); if (header) { headerRow = row.map(duplicateColumnTransformer()); return; } } // If first data row, we can deduce the schema if (isFirstRow) { isFirstRow = false; if (!headerRow) { headerRow = generateHeader(csvOptions.columnPrefix, row.length); } schema = deduceSchema(row, headerRow); } if (csvOptions.optimizeMemoryUsage) { // A workaround to allocate new strings and don't retain pointers to original strings. // https://bugs.chromium.org/p/v8/issues/detail?id=2869 row = JSON.parse(JSON.stringify(row)); } const shape = csvOptions.shape || DEFAULT_CSV_SHAPE; // Add the row tableBatchBuilder = tableBatchBuilder || new TableBatchBuilder( // @ts-expect-error TODO this is not a proper schema schema, { shape, ...options } ); try { tableBatchBuilder.addRow(row); // If a batch has been completed, emit it const batch = tableBatchBuilder && tableBatchBuilder.getFullBatch({bytesUsed}); if (batch) { asyncQueue.enqueue(batch); } } catch (error) { asyncQueue.enqueue(error as Error); } }, // complete is called when all rows have been read complete(results) { try { const bytesUsed = results.meta.cursor; // Ensure any final (partial) batch gets emitted const batch = tableBatchBuilder && tableBatchBuilder.getFinalBatch({bytesUsed}); if (batch) { asyncQueue.enqueue(batch); } } catch (error) { asyncQueue.enqueue(error as Error); } asyncQueue.close(); } }; Papa.parse(asyncIterator, config, AsyncIteratorStreamer); // TODO - Does it matter if we return asyncIterable or asyncIterator // return asyncQueue[Symbol.asyncIterator](); return asyncQueue; } /** * Checks if a certain row is a header row * @param row the row to check * @returns true if the row looks like a header */ function isHeaderRow(row: string[]): boolean { return row && row.every((value) => typeof value === 'string'); } /** * Reads, parses, and returns the first row of a CSV text * @param csvText the csv text to parse * @returns the first row */ function readFirstRow(csvText: string): any[] { const result = Papa.parse(csvText, { dynamicTyping: true, preview: 1 }); return result.data[0]; } /** * Creates a transformer that renames duplicate columns. This is needed as Papaparse doesn't handle * duplicate header columns and would use the latest occurrence by default. * See the header option in https://www.papaparse.com/docs#config * @returns a transform function that returns sanitized names for duplicate fields */ function duplicateColumnTransformer(): (column: string) => string { const observedColumns = new Set(); return (col) => { let colName = col; let counter = 1; while (observedColumns.has(colName)) { colName = `${col}.${counter}`; counter++; } observedColumns.add(colName); return colName; }; } /** * Generates the header of a CSV given a prefix and a column count * @param columnPrefix the columnPrefix to use * @param count the count of column names to generate * @returns an array of column names */ function generateHeader(columnPrefix: string, count: number = 0): string[] { const headers: string[] = []; for (let i = 0; i < count; i++) { headers.push(`${columnPrefix}${i + 1}`); } return headers; } function deduceSchema(row, headerRow): ObjectSchema { const schema: ObjectSchema = headerRow ? {} : []; for (let i = 0; i < row.length; i++) { const columnName = (headerRow && headerRow[i]) || i; const value = row[i]; switch (typeof value) { case 'number': case 'boolean': // TODO - booleans could be handled differently... schema[columnName] = {name: String(columnName), index: i, type: Float32Array}; break; case 'string': default: schema[columnName] = {name: String(columnName), index: i, type: Array}; // We currently only handle numeric rows // TODO we could offer a function to map strings to numbers? } } return schema; }