import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import LRU from '@jbrowse/quick-lru'
import { LocalFile, RemoteFile } from 'generic-filehandle2'

import CSI from './csi.ts'
import TBI from './tbi.ts'

import type Chunk from './chunk.ts'
import type IndexFile from './indexFile.ts'
import type { Options } from './indexFile.ts'
import type { GenericFilehandle } from 'generic-filehandle2'

// ASCII byte values used when scanning decompressed chunk buffers
const TAB = 9
const NEWLINE = 10
const SEMICOLON = 59

const decoder = new TextDecoder('utf-8')
const encoder = new TextEncoder()

/**
 * Callback invoked by `getLines` once per matching line, with the decoded
 * line text, the value computed by `calculateFileOffset`, and the parsed
 * start and end coordinates of the line.
 */
type GetLinesCallback = (
  line: string,
  fileOffset: number,
  start: number,
  end: number,
) => void

/** Options-object form of the last argument to `getLines`. */
interface GetLinesOpts {
  signal?: AbortSignal
  lineCallback: GetLinesCallback
}

interface ReadChunk {
  buffer: Uint8Array
  cpositions: number[]
  dpositions: number[]
}

/**
 * Picks the data filehandle from the constructor arguments. An explicit
 * filehandle wins over a local path, which wins over a URL.
 *
 * @throws {TypeError} if none of the three is provided
 */
function resolveFilehandle(
  filehandle?: GenericFilehandle,
  path?: string,
  url?: string,
) {
  if (filehandle) {
    return filehandle
  }
  if (path) {
    return new LocalFile(path)
  }
  if (url) {
    return new RemoteFile(url)
  }
  throw new TypeError('must provide either filehandle, path, or url')
}

/**
 * Computes the `fileOffset` number handed to the line callback for the line
 * that begins at `blockStart` in the decompressed chunk buffer.
 *
 * The value combines `cpositions[pos]` (scaled by 256), the distance of the
 * line start from `dpositions[pos]`, and the chunk's starting data position.
 * NOTE(review): presumably `cpositions`/`dpositions` are the compressed and
 * decompressed block offsets produced by `unzipChunkSlice` -- confirm.
 */
function calculateFileOffset(
  cpositions: number[],
  dpositions: number[],
  pos: number,
  blockStart: number,
  minvDataPosition: number,
) {
  return (
    cpositions[pos]! * (1 << 8) +
    (blockStart - dpositions[pos]!) +
    minvDataPosition +
    1
  )
}

/**
 * Determines the end coordinate of a VCF record directly from its bytes.
 *
 * The default is `startCoordinate` plus the byte length of the REF column
 * (`refStart`..`refEnd`). The INFO column (`infoStart`..`infoEnd`) can
 * override it:
 * - a field starting with `SVTYPE=TRA` makes the result `startCoordinate + 1`
 * - a field starting with `END=` replaces the end with the leading decimal
 *   digits of its value (0 if the value does not start with a digit)
 *
 * @param buffer decompressed chunk bytes
 * @param startCoordinate already-parsed start coordinate of the record
 * @param refStart offset of the first byte of the REF column
 * @param refEnd offset one past the last byte of the REF column
 * @param infoStart offset of the first byte of the INFO column
 * @param infoEnd offset one past the last byte of the INFO column
 * @returns the end coordinate of the record
 */
function getVcfEnd(
  buffer: Uint8Array,
  startCoordinate: number,
  refStart: number,
  refEnd: number,
  infoStart: number,
  infoEnd: number,
) {
  const refLen = refEnd - refStart
  let endCoordinate = startCoordinate + refLen

  // INFO is '.', no fields to check
  if (buffer[infoStart] === 46) {
    return endCoordinate
  }

  // Single pass over semicolon-delimited fields checking prefixes.
  // Avoids repeated indexOf scans for common bytes like 'S' and 'E'
  // that produce many false positives in typical INFO fields.
  let fieldStart = infoStart
  for (let i = infoStart; i <= infoEnd; i++) {
    // a field ends at a semicolon or at the end of the INFO column
    if (i === infoEnd || buffer[i] === SEMICOLON) {
      const fieldLen = i - fieldStart
      if (
        fieldLen >= 10 &&
        buffer[fieldStart] === 83 && // S
        buffer[fieldStart + 1] === 86 && // V
        buffer[fieldStart + 2] === 84 && // T
        buffer[fieldStart + 3] === 89 && // Y
        buffer[fieldStart + 4] === 80 && // P
        buffer[fieldStart + 5] === 69 && // E
        buffer[fieldStart + 6] === 61 && // =
        buffer[fieldStart + 7] === 84 && // T
        buffer[fieldStart + 8] === 82 && // R
        buffer[fieldStart + 9] === 65 // A
      ) {
        return startCoordinate + 1
      }
      if (
        fieldLen >= 4 &&
        buffer[fieldStart] === 69 && // E
        buffer[fieldStart + 1] === 78 && // N
        buffer[fieldStart + 2] === 68 && // D
        buffer[fieldStart + 3] === 61 // =
      ) {
        // parse the leading run of ASCII digits after 'END='
        endCoordinate = 0
        for (let k = fieldStart + 4; k < i; k++) {
          const c = buffer[k]!
          if (c >= 48 && c <= 57) {
            endCoordinate = endCoordinate * 10 + (c - 48)
          } else {
            break
          }
        }
      }
      fieldStart = i + 1
    }
  }
  return endCoordinate
}

/**
 * Parses a non-negative decimal integer from `buffer[start..end)`.
 *
 * Bytes that are not ASCII digits are skipped rather than treated as
 * terminators, and an empty range yields 0.
 */
function parseIntFromBytes(buffer: Uint8Array, start: number, end: number) {
  let val = 0
  for (let i = start; i < end; i++) {
    const c = buffer[i]!
    if (c >= 48 && c <= 57) {
      val = val * 10 + (c - 48)
    }
  }
  return val
}

/**
 * Reader for a bgzip-compressed, tabix-indexed (TBI or CSI) text file.
 */
export default class TabixIndexedFile {
  private filehandle: GenericFilehandle
  private index: IndexFile
  private chunkCache: AbortablePromiseCache
  // cache handed to unzipChunkSlice when reading chunks
  public cache = new LRU<
    string,
    { bytesRead: number; buffer: Uint8Array; nextIn: number }
  >({
    maxSize: 1000,
  })

  /**
   * The data file is taken from `filehandle`, `path` or `url` (in that order
   * of precedence). The index is taken from the first of: `tbiFilehandle`,
   * `csiFilehandle`, `tbiPath`, `csiPath`, `${path}.tbi`, `csiUrl`, `tbiUrl`,
   * `${url}.tbi`.
   *
   * @param {object} args
   * @param {string} [args.path]
   * @param {object} [args.filehandle]
   * @param {string} [args.url]
   * @param {string} [args.tbiPath]
   * @param {string} [args.tbiUrl]
   * @param {object} [args.tbiFilehandle]
   * @param {string} [args.csiPath]
   * @param {string} [args.csiUrl]
   * @param {object} [args.csiFilehandle]
   * @param {number} [args.chunkCacheSize] chunk cache budget; divided by 64KiB
   * to obtain the maximum number of cached chunks (default 5 * 2 ** 20)
   * @throws {TypeError} if no data source or no index source can be resolved
   */
  constructor({
    path,
    filehandle,
    url,
    tbiPath,
    tbiUrl,
    tbiFilehandle,
    csiPath,
    csiUrl,
    csiFilehandle,
    chunkCacheSize = 5 * 2 ** 20,
  }: {
    path?: string
    filehandle?: GenericFilehandle
    url?: string
    tbiPath?: string
    tbiUrl?: string
    tbiFilehandle?: GenericFilehandle
    csiPath?: string
    csiUrl?: string
    csiFilehandle?: GenericFilehandle
    chunkCacheSize?: number
  }) {
    this.filehandle = resolveFilehandle(filehandle, path, url)

    if (tbiFilehandle) {
      this.index = new TBI({ filehandle: tbiFilehandle })
    } else if (csiFilehandle) {
      this.index = new CSI({ filehandle: csiFilehandle })
    } else if (tbiPath) {
      this.index = new TBI({ filehandle: new LocalFile(tbiPath) })
    } else if (csiPath) {
      this.index = new CSI({ filehandle: new LocalFile(csiPath) })
    } else if (path) {
      this.index = new TBI({ filehandle: new LocalFile(`${path}.tbi`) })
    } else if (csiUrl) {
      this.index = new CSI({ filehandle: new RemoteFile(csiUrl) })
    } else if (tbiUrl) {
      this.index = new TBI({ filehandle: new RemoteFile(tbiUrl) })
    } else if (url) {
      this.index = new TBI({ filehandle: new RemoteFile(`${url}.tbi`) })
    } else {
      throw new TypeError(
        'must provide one of tbiFilehandle, tbiPath, csiFilehandle, csiPath, tbiUrl, csiUrl',
      )
    }

    this.chunkCache = new AbortablePromiseCache({
      cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
      fill: (args: Chunk, signal?: AbortSignal) =>
        this.readChunk(args, { signal }),
    })
  }

  /**
   * @param {string} refName name of the reference sequence
   * @param {number|undefined} start start of the region (0-based half-open)
   * @param {number|undefined} end end of the region (0-based half-open)
   * @param {GetLinesOpts|GetLinesCallback} opts callback invoked for each line, or an options object with `lineCallback` and optional `signal`
   * @returns {Promise} promise that is resolved when the whole read is finished, rejected on error
   */
  async getLines(
    refName: string,
    s: number | undefined,
    e: number | undefined,
    opts: GetLinesOpts | GetLinesCallback,
  ) {
    let signal: AbortSignal | undefined
    let options: Options = {}
    let callback: GetLinesCallback

    if (typeof opts === 'function') {
      callback = opts
    } else {
      options = opts
      callback = opts.lineCallback
      signal = opts.signal
    }

    const metadata = await this.index.getMetadata(options)
    const start = s ?? 0
    const end = e ?? metadata.maxRefLength
    if (!(start <= end)) {
      throw new TypeError(
        'invalid start and end coordinates. start must be less than or equal to end',
      )
    }
    if (start === end) {
      return
    }

    const chunks = await this.index.blocksForRange(refName, start, end, options)

    const isVCF = metadata.format === 'VCF'
    const refCol = metadata.columnNumbers.ref || 0
    const startCol = metadata.columnNumbers.start || 0
    // for VCF the "end" column is INFO (column 8), which getVcfEnd inspects
    const endCol = isVCF ? 8 : metadata.columnNumbers.end || 0
    const maxColumn = Math.max(refCol, startCol, endCol)
    const metaCharCode = metadata.metaChar?.charCodeAt(0)
    const coordinateOffset =
      metadata.coordinateType === '1-based-closed' ? -1 : 0
    const regionRefNameBytes = encoder.encode(refName)

    // tabs[k] is the buffer offset of the delimiter that ends 1-based column
    // k of the current line; tabs[0] is the offset just before the line.
    // Explicitly typed as number[] so the arithmetic below type-checks.
    const tabs = new Array<number>(maxColumn + 1).fill(0)

    for (const c of chunks) {
      const { buffer, cpositions, dpositions } = await this.chunkCache.get(
        c.toString(),
        c,
        signal,
      )

      let blockStart = 0
      let pos = 0

      while (blockStart < buffer.length) {
        const n = buffer.indexOf(NEWLINE, blockStart)
        if (n === -1) {
          // trailing bytes without a newline are not processed
          break
        }

        // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
        if (dpositions) {
          const target = blockStart + c.minv.dataPosition
          while (pos < dpositions.length && target >= dpositions[pos]!) {
            pos++
          }
        }

        // skip meta lines
        if (metaCharCode !== undefined && buffer[blockStart] === metaCharCode) {
          blockStart = n + 1
          continue
        }

        // find tab positions
        tabs[0] = blockStart - 1
        let prev = blockStart - 1
        for (let i = 0; i < maxColumn; i++) {
          const tabPos = buffer.indexOf(TAB, prev + 1)
          if (tabPos === -1 || tabPos >= n) {
            // The line ran out of tabs: this column ends at the newline and
            // any later columns are absent. Point all of them at the newline
            // so that offsets left over from a previously scanned line are
            // never reused for this line.
            tabs.fill(n, i + 1)
            break
          }
          tabs[i + 1] = tabPos
          prev = tabPos
        }

        // compare ref name bytes directly
        const refStart = tabs[refCol - 1]! + 1
        const refEnd = tabs[refCol]!
        const refLen = refEnd - refStart
        if (refLen !== regionRefNameBytes.length) {
          blockStart = n + 1
          continue
        }
        let refMatch = true
        for (let i = 0; i < refLen; i++) {
          if (buffer[refStart + i] !== regionRefNameBytes[i]) {
            refMatch = false
            break
          }
        }
        if (!refMatch) {
          blockStart = n + 1
          continue
        }

        // parse start coordinate
        const startCoordinate =
          parseIntFromBytes(buffer, tabs[startCol - 1]! + 1, tabs[startCol]!) +
          coordinateOffset

        // a matching-ref line at or past the region end stops the whole read
        // NOTE(review): this presumably relies on lines being sorted by start
        // within a reference sequence -- confirm
        if (startCoordinate >= end) {
          return
        }

        // parse end coordinate
        let endCoordinate: number
        if (endCol === 0 || endCol === startCol) {
          endCoordinate = startCoordinate + 1
        } else if (isVCF) {
          endCoordinate = getVcfEnd(
            buffer,
            startCoordinate,
            tabs[3]! + 1,
            tabs[4]!,
            tabs[endCol - 1]! + 1,
            tabs[endCol]!,
          )
        } else {
          endCoordinate = parseIntFromBytes(
            buffer,
            tabs[endCol - 1]! + 1,
            tabs[endCol]!,
          )
        }

        if (endCoordinate > start) {
          const line = decoder.decode(buffer.subarray(blockStart, n))
          callback(
            line,
            calculateFileOffset(
              cpositions,
              dpositions,
              pos,
              blockStart,
              c.minv.dataPosition,
            ),
            startCoordinate,
            endCoordinate,
          )
        }

        blockStart = n + 1
      }
    }
  }

  /** Returns the metadata object produced by the index. */
  async getMetadata(opts: Options = {}) {
    return this.index.getMetadata(opts)
  }

  /**
   * Reads and decompresses the start of the data file and returns the header
   * bytes. When the index defines a meta character, the result is the leading
   * run of lines that begin with it; otherwise all decompressed bytes that
   * were fetched are returned.
   */
  async getHeaderBuffer(opts: Options = {}) {
    const { firstDataLine, metaChar, maxBlockSize } =
      await this.getMetadata(opts)

    const maxFetch = (firstDataLine?.blockPosition ?? 0) + maxBlockSize
    // TODO: what if we don't have a firstDataLine, and the header actually
    // takes up more than one block? this case is not covered here
    const buf = await this.filehandle.read(maxFetch, 0, opts)
    const bytes = (await unzip(buf)) as Uint8Array

    // trim off lines after the last meta line
    if (metaChar) {
      let lastNewline = -1
      const metaByte = metaChar.charCodeAt(0)
      for (let i = 0, l = bytes.length; i < l; i++) {
        const byte = bytes[i]
        // first byte of a line that is not the meta character: header is over
        if (i === lastNewline + 1 && byte !== metaByte) {
          break
        }
        if (byte === NEWLINE) {
          lastNewline = i
        }
      }
      return bytes.subarray(0, lastNewline + 1)
    }
    return bytes
  }

  /** Returns the header (see `getHeaderBuffer`) decoded as UTF-8 text. */
  async getHeader(opts: Options = {}) {
    const bytes = await this.getHeaderBuffer(opts)
    return decoder.decode(bytes)
  }

  /** Returns the `refIdToName` entry of the index metadata. */
  async getReferenceSequenceNames(opts: Options = {}) {
    const metadata = await this.getMetadata(opts)
    return metadata.refIdToName
  }

  /**
   * return the number of data lines in the given reference sequence
   * @param {string} refName reference sequence name
   * @returns {number} number of data lines present on that reference sequence
   */
  async lineCount(refName: string, opts: Options = {}) {
    return this.index.lineCount(refName, opts)
  }

  /**
   * Reads the compressed bytes covered by chunk `c` from the data file and
   * decompresses them with `unzipChunkSlice`, sharing `this.cache`.
   */
  async readChunk(c: Chunk, opts: Options = {}) {
    const ret = await this.filehandle.read(
      c.fetchedSize(),
      c.minv.blockPosition,
      opts,
    )
    return unzipChunkSlice(ret, c, this.cache)
  }
}