/* Phaneron - Clustered, accelerated and cloud-fit video server, pre-assembled and in kit form. Copyright (C) 2020 Streampunk Media Ltd. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . https://www.streampunk.media/ mailto:furnace@streampunk.media 14 Ormiscaig, Aultbea, Achnasheen, IV22 2JJ U.K. */ import { PackImpl, Interlace } from './packer' import { KernelParams, OpenCLBuffer } from 'nodencl' const v210Kernel = ` __kernel void read(__global uint4* restrict input, __global float4* restrict output, __private unsigned int width, __constant float4* restrict colMatrix, __global float* restrict gammaLut, __constant float4* restrict gamutMatrix) { uint item = get_global_id(0); bool lastItemOnLine = get_local_id(0) == get_local_size(0) - 1; // 48 pixels per workItem = 8 input uint4s per work item uint numPixels = lastItemOnLine && (0 != width % 48) ? 
width % 48 : 48; uint numLoops = numPixels / 6; uint remain = numPixels % 6; uint inOff = 8 * item; uint outOff = width * get_group_id(0) + get_local_id(0) * 48; float4 colMatR = colMatrix[0]; float4 colMatG = colMatrix[1]; float4 colMatB = colMatrix[2]; // optimise loading of the 3x3 gamut matrix float4 gamutMat0 = gamutMatrix[0]; float4 gamutMat1 = gamutMatrix[1]; float4 gamutMat2 = gamutMatrix[2]; float3 gamutMatR = (float3)(gamutMat0.s0, gamutMat0.s1, gamutMat0.s2); float3 gamutMatG = (float3)(gamutMat0.s3, gamutMat1.s0, gamutMat1.s1); float3 gamutMatB = (float3)(gamutMat1.s2, gamutMat1.s3, gamutMat2.s0); for (uint i=0; i> 10) & 0x3ff, w.s0 & 0x3ff, (w.s0 >> 20) & 0x3ff, 1); yuva[1] = (ushort4)(w.s1 & 0x3ff, yuva[0].s1, yuva[0].s2, 1); yuva[2] = (ushort4)((w.s1 >> 20) & 0x3ff, (w.s1 >> 10) & 0x3ff, w.s2 & 0x3ff, 1); yuva[3] = (ushort4)((w.s2 >> 10) & 0x3ff, yuva[2].s1, yuva[2].s2, 1); yuva[4] = (ushort4)(w.s3 & 0x3ff, (w.s2 >> 20) & 0x3ff, (w.s3 >> 10) & 0x3ff, 1); yuva[5] = (ushort4)((w.s3 >> 20) & 0x3ff, yuva[4].s1, yuva[4].s2, 1); for (uint p=0; p<6; ++p) { float4 yuva_f = convert_float4(yuva[p]); float3 rgb; rgb.s0 = gammaLut[convert_ushort_sat_rte(dot(yuva_f, colMatR) * 65535.0f)]; rgb.s1 = gammaLut[convert_ushort_sat_rte(dot(yuva_f, colMatG) * 65535.0f)]; rgb.s2 = gammaLut[convert_ushort_sat_rte(dot(yuva_f, colMatB) * 65535.0f)]; float4 rgba; rgba.s0 = dot(rgb, gamutMatR); rgba.s1 = dot(rgb, gamutMatG); rgba.s2 = dot(rgb, gamutMatB); rgba.s3 = 1.0f; output[outOff+p] = rgba; } inOff++; outOff+=6; } if (remain > 0) { uint4 w = input[inOff]; ushort4 yuva[4]; yuva[0] = (ushort4)((w.s0 >> 10) & 0x3ff, w.s0 & 0x3ff, (w.s0 >> 20) & 0x3ff, 0); yuva[1] = (ushort4)(w.s1 & 0x3ff, yuva[0].s1, yuva[0].s2, 0); if (4 == remain) { yuva[2] = (ushort4)((w.s1 >> 20) & 0x3ff, (w.s1 >> 10) & 0x3ff, w.s2 & 0x3ff, 0); yuva[3] = (ushort4)((w.s2 >> 10) & 0x3ff, yuva[2].s1, yuva[2].s2, 0); } for (uint p=0; p 0) { uint4 w = (uint4)(0, 0, 0, 0); ushort3 yuv[4]; for (uint p=0; p> 10) 
& 0x3ff} ${(w0 >> 20) & 0x3ff} ${w1 & 0x3ff}, `
      s += `${(w1 >> 10) & 0x3ff} ${(w1 >> 20) & 0x3ff} ${w2 & 0x3ff} ${(w2 >> 10) & 0x3ff}, `
      s += `${(w2 >> 20) & 0x3ff} ${w3 & 0x3ff} ${(w3 >> 10) & 0x3ff} ${(w3 >> 20) & 0x3ff}`
      xOff += 16
    }
    console.log(s)
    yOff += pitchBytes
  }
}

/**
 * Debug helper: for each of the first `numLines` lines of `buf`, logs the four
 * 32-bit little-endian words at the start of the line and the four words that
 * start 128 bytes into the line, all in hexadecimal.
 *
 * @param buf - Buffer holding the packed image data.
 * @param width - Image width in pixels; the line stride comes from `getPitchBytes(width)`.
 * @param numLines - Number of lines to print, starting at line 0.
 * @throws RangeError raised by `Buffer.readUInt32LE` when a read falls outside `buf`.
 */
export function dumpBuf(buf: Buffer, width: number, numLines: number): void {
  const pitch = getPitchBytes(width)
  const hexAt = (off: number): string => buf.readUInt32LE(off).toString(16)
  // Four consecutive 32-bit words (16 bytes), comma separated.
  const hexWords = (off: number): string =>
    [0, 4, 8, 12].map((wordOff) => hexAt(off + wordOff)).join(', ')

  for (let l = 0; l < numLines; ++l) {
    const lineOff = l * pitch
    // Byte 128 is where the second 48-pixel group of the line begins
    // (the kernel consumes 8 uint4s, i.e. 128 bytes, per 48 pixels).
    console.log(`Line ${l}: ${hexWords(lineOff)} ... ${hexWords(lineOff + 128)}`)
  }
}

// process one image line per work group
const pixelsPerWorkItem = 48

/**
 * Packer implementation that runs the `read` kernel of the v210 program to
 * unpack a single v210 source buffer.
 */
export class Reader extends PackImpl {
  /**
   * @param width - Image width in pixels.
   * @param height - Image height in lines.
   */
  constructor(width: number, height: number) {
    super('v210', width, height, v210Kernel, 'read')
    this.numBits = 10
    this.lumaBlack = 64
    this.lumaWhite = 940
    this.chromaRange = 896
    this.isRGB = false
    // Single buffer: padded line size in bytes times the number of lines.
    this.numBytes = [getPitchBytes(this.width) * this.height]
    // Each work item handles pixelsPerWorkItem pixels, so one group spans one line.
    this.workItemsPerGroup = getPitch(this.width) / pixelsPerWorkItem
    this.globalWorkItems = this.workItemsPerGroup * this.height
  }

  /**
   * Maps the generic packer parameters onto the arguments of the `read` kernel.
   *
   * @param params - Must carry `sources` (an array holding exactly one OpenCL
   *   buffer), `dest`, `colMatrix`, `gammaLut` and `gamutMatrix`.
   * @returns The kernel arguments keyed by kernel parameter name.
   * @throws Error when `sources` is not an array of exactly one buffer.
   */
  getKernelParams(params: KernelParams): KernelParams {
    const srcArray = params.sources as Array<OpenCLBuffer>
    // Array.isArray guard: a missing `sources` gets the descriptive error
    // below rather than a TypeError from reading `.length` of undefined.
    if (!Array.isArray(srcArray) || srcArray.length !== 1)
      throw new Error(`Reader for ${this.name} requires sources parameter with 1 OpenCL buffer`)

    return {
      input: srcArray[0],
      output: params.dest,
      width: this.width,
      colMatrix: params.colMatrix,
      gammaLut: params.gammaLut,
      gamutMatrix: params.gamutMatrix
    }
  }
}

/**
 * Packer implementation that runs the `write` kernel of the v210 program to
 * pack into a single v210 destination buffer.
 */
export class Writer extends PackImpl {
  /**
   * @param width - Image width in pixels.
   * @param height - Image height in lines.
   * @param interlaced - Whether output is interlaced; halves the number of
   *   global work items scheduled per kernel run.
   */
  constructor(width: number, height: number, interlaced: boolean) {
    super('v210', width, height, v210Kernel, 'write')
    this.interlaced = interlaced
    this.numBits = 10
    this.lumaBlack = 64
    this.lumaWhite = 940
    this.chromaRange = 896
    this.isRGB = false
    // Single buffer: padded line size in bytes times the number of lines.
    this.numBytes = [getPitchBytes(this.width) * this.height]
    // Each work item handles pixelsPerWorkItem pixels, so one group spans one line.
    this.workItemsPerGroup = getPitch(this.width) / pixelsPerWorkItem
    // NOTE(review): presumably one kernel run covers a single field when
    // interlaced, hence half the lines - confirm against the write kernel.
    this.globalWorkItems = (this.workItemsPerGroup * this.height) / (this.interlaced ? 2 : 1)
  }

  /**
   * Maps the generic packer parameters onto the arguments of the `write` kernel.
   *
   * @param params - Must carry `source`, `dests` (an array holding exactly one
   *   OpenCL buffer), `colMatrix` and `gammaLut`; `interlace` is only read
   *   when this writer was constructed as interlaced.
   * @returns The kernel arguments keyed by kernel parameter name.
   * @throws Error when `dests` is not an array of exactly one buffer.
   */
  getKernelParams(params: KernelParams): KernelParams {
    const dstArray = params.dests as Array<OpenCLBuffer>
    // Array.isArray guard: a missing `dests` gets the descriptive error
    // below rather than a TypeError from reading `.length` of undefined.
    if (!Array.isArray(dstArray) || dstArray.length !== 1)
      throw new Error(`Writer for ${this.name} requires dests parameter with 1 OpenCL buffer`)

    return {
      input: params.source,
      output: dstArray[0],
      width: this.width,
      // A progressive writer ignores whatever interlace value the caller passed.
      interlace: this.interlaced ? (params.interlace as Interlace) : Interlace.Progressive,
      colMatrix: params.colMatrix,
      gammaLut: params.gammaLut
    }
  }
}