import { LocalFile, RemoteFile } from 'generic-filehandle2' import { BlockView } from './block-view.ts' import { decoder, getDataView, parseKey } from './util.ts' import type { BigWigFeatureArrays, BigWigFeatureArraysMulti, BigWigHeader, BigWigHeaderWithRefNames, Feature, RefInfo, RequestOptions2, RequestOptions, Statistics, SummaryFeatureArrays, SummaryFeatureArraysMulti, ZoomLevel, } from './types.ts' import type { GenericFilehandle } from 'generic-filehandle2' const BIG_WIG_MAGIC = -2003829722 const BIG_BED_MAGIC = -2021002517 export abstract class BBI { protected bbi: GenericFilehandle private headerP?: Promise protected renameRefSeqs: (a: string) => string /** * Returns file header metadata including chromosome list, zoom levels, autoSql * definition, and summary statistics. * * @param opts - optional `RequestOptions` (e.g. `opts.signal` for abort) * @returns `Promise` */ public getHeader(opts?: RequestOptions) { if (!this.headerP) { this.headerP = this._getHeader(opts).catch((e: unknown) => { this.headerP = undefined throw e }) } return this.headerP } /** * @param args.filehandle - a filehandle from generic-filehandle2 * @param args.path - path to a local file * @param args.url - URL of a remote file * @param args.renameRefSeqs - optional mapping function to rename internal * reference sequence names before querying */ public constructor(args: { filehandle?: GenericFilehandle path?: string url?: string renameRefSeqs?: (a: string) => string }) { const { filehandle, renameRefSeqs = s => s, path, url } = args this.renameRefSeqs = renameRefSeqs if (filehandle) { this.bbi = filehandle } else if (url) { this.bbi = new RemoteFile(url) } else if (path) { this.bbi = new LocalFile(path) } else { throw new Error('no file given') } } private async _getHeader(opts?: RequestOptions) { const header = await this._getMainHeader(opts) const chroms = await this._readChromosomeTree(header, opts) return { ...header, ...chroms, } } private async _getMainHeader( opts?: RequestOptions, requestSize = 2000, ): Promise { const b = await this.bbi.read(requestSize, 0, opts) const dataView = getDataView(b) let offset = 0 const magic = dataView.getInt32(offset, true) offset += 4 if (magic !== BIG_WIG_MAGIC && magic !== BIG_BED_MAGIC) { throw new Error('not a BigWig/BigBed file') } const version = dataView.getUint16(offset, true) offset += 2 const numZoomLevels = dataView.getUint16(offset, true) offset += 2 // Offset to the B+ tree that maps chromosome names to integer IDs const chromosomeTreeOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const unzoomedDataOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const unzoomedIndexOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const fieldCount = dataView.getUint16(offset, true) offset += 2 const definedFieldCount = dataView.getUint16(offset, true) offset += 2 const asOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const totalSummaryOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const uncompressBufSize = dataView.getUint32(offset, true) offset += 4 const extHeaderOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const zoomLevels = [] as ZoomLevel[] for (let i = 0; i < numZoomLevels; i++) { const reductionLevel = dataView.getUint32(offset, true) offset += 4 offset += 4 // reserved const dataOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 const indexOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 zoomLevels.push({ reductionLevel, dataOffset, indexOffset, }) } const fileType = magic === BIG_BED_MAGIC ? 'bigbed' : 'bigwig' // A short read means we hit EOF, so a larger request can't return more // bytes - stop growing to avoid looping on a truncated/corrupt file. const reachedEof = b.length < requestSize // autoSql is a null-terminated string at asOffset; if the terminator isn't // in the buffer yet the string is truncated and we need a larger fetch const autoSqlTruncated = asOffset !== 0 && !b.includes(0, asOffset) // refetch header if it is too large on first pass, // 8*5 is the sizeof the totalSummary struct if ( !reachedEof && (asOffset > requestSize || totalSummaryOffset > requestSize - 8 * 5 || autoSqlTruncated) ) { return this._getMainHeader(opts, requestSize * 2) } let totalSummary: Statistics if (totalSummaryOffset) { const summaryView = getDataView(b, totalSummaryOffset) totalSummary = { basesCovered: Number(summaryView.getBigUint64(0, true)), scoreMin: summaryView.getFloat64(8, true), scoreMax: summaryView.getFloat64(16, true), scoreSum: summaryView.getFloat64(24, true), scoreSumSquares: summaryView.getFloat64(32, true), } } else { throw new Error('no stats') } let autoSql = '' if (asOffset) { const nullPos = b.indexOf(0, asOffset) const end = nullPos === -1 ? b.length : nullPos autoSql = decoder.decode(b.subarray(asOffset, end)) } return { zoomLevels, magic, extHeaderOffset, numZoomLevels, fieldCount, totalSummary, definedFieldCount, uncompressBufSize, asOffset, chromosomeTreeOffset, totalSummaryOffset, unzoomedDataOffset, unzoomedIndexOffset, fileType, version, autoSql, } } // Reads the B+ tree that maps chromosome names to integer IDs // This is part of the "cirTree" (combined ID R-tree) structure, which uses // integer chromosome IDs instead of strings for more efficient spatial indexing private async _readChromosomeTree( header: BigWigHeader, opts?: { signal?: AbortSignal }, ) { const refsByNumber: RefInfo[] = [] const refsByName = {} as Record const chromosomeTreeOffset = header.chromosomeTreeOffset const dataView = getDataView( await this.bbi.read(32, chromosomeTreeOffset, opts), ) const keySize = dataView.getUint32(8, true) const valSize = dataView.getUint32(12, true) // Recursively traverses the B+ tree to populate chromosome name-to-ID mappings const readBPlusTreeNode = async (currentOffset: number) => { const header = getDataView(await this.bbi.read(4, currentOffset, opts)) const isLeafNode = header.getUint8(0) const count = header.getUint16(2, true) // Leaf nodes contain the actual chromosome name-to-ID mappings if (isLeafNode) { const b = await this.bbi.read( count * (keySize + valSize), currentOffset + 4, opts, ) const dataView = getDataView(b) let offset = 0 for (let n = 0; n < count; n++) { const key = parseKey(b, offset, keySize) offset += keySize const refId = dataView.getUint32(offset, true) offset += 4 const refSize = dataView.getUint32(offset, true) offset += 4 refsByName[this.renameRefSeqs(key)] = refId refsByNumber[refId] = { name: key, id: refId, length: refSize } } } else { // Non-leaf nodes contain pointers to child nodes const dataView = getDataView( await this.bbi.read(count * (keySize + 8), currentOffset + 4, opts), ) const nextNodes = [] let offset = 0 for (let n = 0; n < count; n++) { offset += keySize const childOffset = Number(dataView.getBigUint64(offset, true)) offset += 8 nextNodes.push(readBPlusTreeNode(childOffset)) } await Promise.all(nextNodes) } } await readBPlusTreeNode(chromosomeTreeOffset + 32) return { refsByName, refsByNumber, } } private viewCache = new Map() protected getOrCreateBlockView( refsByName: Record, rTreeOffset: number, uncompressBufSize: number, blockType: string, ) { const key = `${rTreeOffset}_${blockType}` let view = this.viewCache.get(key) if (!view) { view = new BlockView( this.bbi, refsByName, rTreeOffset, uncompressBufSize, blockType, ) this.viewCache.set(key, view) } return view } /* * fetches the "unzoomed" view of the bigwig data. this is the default for bigbed * @param abortSignal - a signal to optionally abort this operation */ protected async getUnzoomedView(opts?: RequestOptions) { const { unzoomedIndexOffset, refsByName, uncompressBufSize, fileType } = await this.getHeader(opts) return this.getOrCreateBlockView( refsByName, unzoomedIndexOffset, uncompressBufSize, fileType, ) } /* * abstract method - get the view for a given scale */ protected abstract getView( scale: number, opts?: RequestOptions, ): Promise private async _getView(opts?: RequestOptions2) { const { basesPerSpan, scale } = opts || {} const viewScale = basesPerSpan ? 1 / basesPerSpan : (scale ?? 1) return this.getView(viewScale, opts) } /** * Fetches features for a single region. * * @param refName - chromosome name as it appears in the file * @param start - 0-based half-open start coordinate * @param end - 0-based half-open end coordinate * @param opts - optional scale/basesPerSpan for zoom level selection and AbortSignal * @returns `Promise` — empty array if refName not found or no features overlap the range */ public async getFeatures( refName: string, start: number, end: number, opts?: RequestOptions2, ) { const view = await this._getView(opts) return view.readWigData(this.renameRefSeqs(refName), start, end, opts) } /** * Fetches features for many regions in a single pass. All regions share one * zoom level, and adjacent on-disk blocks are coalesced across region * boundaries, reducing range requests for whole-genome overviews. * * @param regions - array of `{ refName, start, end }` query regions * @param opts - same options as `getFeatures` * @returns `Promise` — one `Feature[]` per input region in the * same order (`result[i]` corresponds to `regions[i]`) */ public async getFeaturesMulti( regions: { refName: string; start: number; end: number }[], opts?: RequestOptions2, ): Promise { const view = await this._getView(opts) return view.readWigDataMulti( regions.map(r => ({ refName: this.renameRefSeqs(r.refName), start: r.start, end: r.end, })), opts, ) } /** * Same query as `getFeatures` but returns typed arrays instead of an array * of objects, reducing GC pressure for large datasets. * * @param refName - chromosome name as it appears in the file * @param start - 0-based half-open start coordinate * @param end - 0-based half-open end coordinate * @param opts - optional scale/basesPerSpan for zoom level selection and AbortSignal * @returns `Promise` — use the * `isSummary` discriminant to distinguish the two shapes */ public async getFeaturesAsArrays( refName: string, start: number, end: number, opts?: RequestOptions2, ): Promise { const view = await this._getView(opts) return view.readWigDataAsArrays( this.renameRefSeqs(refName), start, end, opts, ) } /** * Multi-region counterpart of `getFeaturesAsArrays`. All regions share one * zoom level and adjacent on-disk blocks coalesce across region boundaries * (like `getFeaturesMulti`), but results pack into one backing set of typed * arrays instead of one object per region, minimizing allocations. * * @param regions - array of `{ refName, start, end }` query regions * @param opts - same options as `getFeatures` * @returns `Promise` — * use the `isSummary` discriminant to distinguish the two shapes; slice * region `i` with `regionOffsets[i]..regionOffsets[i + 1]` */ public async getFeaturesAsArraysMulti( regions: { refName: string; start: number; end: number }[], opts?: RequestOptions2, ): Promise { const view = await this._getView(opts) return view.readWigDataAsArraysMulti( regions.map(r => ({ refName: this.renameRefSeqs(r.refName), start: r.start, end: r.end, })), opts, ) } }