/*
Phaneron - Clustered, accelerated and cloud-fit video server, pre-assembled and in kit form.
Copyright (C) 2020 Streampunk Media Ltd.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
https://www.streampunk.media/ mailto:furnace@streampunk.media
14 Ormiscaig, Aultbea, Achnasheen, IV22 2JJ U.K.
*/
import { PackImpl, Interlace } from './packer'
import { KernelParams, OpenCLBuffer } from 'nodencl'
const v210Kernel = `
__kernel void read(__global uint4* restrict input,
__global float4* restrict output,
__private unsigned int width,
__constant float4* restrict colMatrix,
__global float* restrict gammaLut,
__constant float4* restrict gamutMatrix) {
uint item = get_global_id(0);
bool lastItemOnLine = get_local_id(0) == get_local_size(0) - 1;
// 48 pixels per workItem = 8 input uint4s per work item
uint numPixels = lastItemOnLine && (0 != width % 48) ? width % 48 : 48;
uint numLoops = numPixels / 6;
uint remain = numPixels % 6;
uint inOff = 8 * item;
uint outOff = width * get_group_id(0) + get_local_id(0) * 48;
float4 colMatR = colMatrix[0];
float4 colMatG = colMatrix[1];
float4 colMatB = colMatrix[2];
// optimise loading of the 3x3 gamut matrix
float4 gamutMat0 = gamutMatrix[0];
float4 gamutMat1 = gamutMatrix[1];
float4 gamutMat2 = gamutMatrix[2];
float3 gamutMatR = (float3)(gamutMat0.s0, gamutMat0.s1, gamutMat0.s2);
float3 gamutMatG = (float3)(gamutMat0.s3, gamutMat1.s0, gamutMat1.s1);
float3 gamutMatB = (float3)(gamutMat1.s2, gamutMat1.s3, gamutMat2.s0);
for (uint i=0; i> 10) & 0x3ff, w.s0 & 0x3ff, (w.s0 >> 20) & 0x3ff, 1);
yuva[1] = (ushort4)(w.s1 & 0x3ff, yuva[0].s1, yuva[0].s2, 1);
yuva[2] = (ushort4)((w.s1 >> 20) & 0x3ff, (w.s1 >> 10) & 0x3ff, w.s2 & 0x3ff, 1);
yuva[3] = (ushort4)((w.s2 >> 10) & 0x3ff, yuva[2].s1, yuva[2].s2, 1);
yuva[4] = (ushort4)(w.s3 & 0x3ff, (w.s2 >> 20) & 0x3ff, (w.s3 >> 10) & 0x3ff, 1);
yuva[5] = (ushort4)((w.s3 >> 20) & 0x3ff, yuva[4].s1, yuva[4].s2, 1);
for (uint p=0; p<6; ++p) {
float4 yuva_f = convert_float4(yuva[p]);
float3 rgb;
rgb.s0 = gammaLut[convert_ushort_sat_rte(dot(yuva_f, colMatR) * 65535.0f)];
rgb.s1 = gammaLut[convert_ushort_sat_rte(dot(yuva_f, colMatG) * 65535.0f)];
rgb.s2 = gammaLut[convert_ushort_sat_rte(dot(yuva_f, colMatB) * 65535.0f)];
float4 rgba;
rgba.s0 = dot(rgb, gamutMatR);
rgba.s1 = dot(rgb, gamutMatG);
rgba.s2 = dot(rgb, gamutMatB);
rgba.s3 = 1.0f;
output[outOff+p] = rgba;
}
inOff++;
outOff+=6;
}
if (remain > 0) {
uint4 w = input[inOff];
ushort4 yuva[4];
yuva[0] = (ushort4)((w.s0 >> 10) & 0x3ff, w.s0 & 0x3ff, (w.s0 >> 20) & 0x3ff, 0);
yuva[1] = (ushort4)(w.s1 & 0x3ff, yuva[0].s1, yuva[0].s2, 0);
if (4 == remain) {
yuva[2] = (ushort4)((w.s1 >> 20) & 0x3ff, (w.s1 >> 10) & 0x3ff, w.s2 & 0x3ff, 0);
yuva[3] = (ushort4)((w.s2 >> 10) & 0x3ff, yuva[2].s1, yuva[2].s2, 0);
}
for (uint p=0; p 0) {
uint4 w = (uint4)(0, 0, 0, 0);
ushort3 yuv[4];
for (uint p=0; p> 10) & 0x3ff} ${(w0 >> 20) & 0x3ff} ${w1 & 0x3ff}, `
s += `${(w1 >> 10) & 0x3ff} ${(w1 >> 20) & 0x3ff} ${w2 & 0x3ff} ${(w2 >> 10) & 0x3ff}, `
s += `${(w2 >> 20) & 0x3ff} ${w3 & 0x3ff} ${(w3 >> 10) & 0x3ff} ${(w3 >> 20) & 0x3ff}`
xOff += 16
}
console.log(s)
yOff += pitchBytes
}
}
/**
 * Debug helper: logs a short hex summary of the first `numLines` lines of a buffer.
 *
 * For each line, eight little-endian 32-bit words are printed in hexadecimal:
 * the four words at byte offsets 0, 4, 8 and 12 from the start of the line, then
 * the four words at byte offsets 128, 132, 136 and 140. The start of line `n` is
 * taken to be `n * getPitchBytes(width)`.
 *
 * @param buf - buffer to inspect
 * @param width - image width, used only to derive the line pitch in bytes
 * @param numLines - number of lines to log, starting from line 0
 */
export function dumpBuf(buf: Buffer, width: number, numLines: number): void {
  const bytesPerLine = getPitchBytes(width)
  const headOffsets = [0, 4, 8, 12]
  // second group of words sits 128 bytes further into the line
  const tailOffsets = headOffsets.map((off) => 128 + off)

  const hexWords = (base: number, offsets: number[]): string =>
    offsets.map((off) => buf.readUInt32LE(base + off).toString(16)).join(', ')

  for (let row = 0; row < numLines; ++row) {
    const base = row * bytesPerLine
    const head = hexWords(base, headOffsets)
    const tail = hexWords(base, tailOffsets)
    console.log(`Line ${row}: ${head} ... ${tail}`)
  }
}
// Number of pixels handled by each kernel work item. The Reader and Writer size
// their work groups from this value so that one work group processes one image line.
const pixelsPerWorkItem = 48
/**
 * Packer implementation bound to the `read` kernel of the v210 OpenCL program.
 *
 * The constructor configures the 10-bit level constants and the buffer / work
 * sizes; `getKernelParams` maps generic parameters onto the kernel's arguments.
 */
export class Reader extends PackImpl {
  /**
   * @param width - image width in pixels
   * @param height - image height in lines
   */
  constructor(width: number, height: number) {
    super('v210', width, height, v210Kernel, 'read')
    // 10-bit sample levels
    this.numBits = 10
    this.lumaBlack = 64
    this.lumaWhite = 940
    this.chromaRange = 896
    this.isRGB = false
    // a single buffer of `height` lines, each `getPitchBytes(width)` bytes long
    this.numBytes = [getPitchBytes(this.width) * this.height]
    // one work group per image line, each work item covering pixelsPerWorkItem pixels
    this.workItemsPerGroup = getPitch(this.width) / pixelsPerWorkItem
    this.globalWorkItems = this.workItemsPerGroup * this.height
  }

  /**
   * Builds the argument set for the `read` kernel.
   *
   * @param params - must carry `sources` (an array holding exactly one OpenCL
   *   buffer), plus `dest`, `colMatrix`, `gammaLut` and `gamutMatrix`
   * @returns the kernel arguments `input`, `output`, `width`, `colMatrix`,
   *   `gammaLut` and `gamutMatrix`
   * @throws Error when `sources` is not an array containing exactly one buffer
   */
  getKernelParams(params: KernelParams): KernelParams {
    const sources: unknown = params.sources
    // validate at runtime before asserting the element type, so that a missing
    // `sources` produces the descriptive error rather than a TypeError
    if (!Array.isArray(sources) || sources.length !== 1)
      throw new Error(`Reader for ${this.name} requires sources parameter with 1 OpenCL buffer`)
    const srcArray = sources as Array<OpenCLBuffer>

    return {
      input: srcArray[0],
      output: params.dest,
      width: this.width,
      colMatrix: params.colMatrix,
      gammaLut: params.gammaLut,
      gamutMatrix: params.gamutMatrix
    }
  }
}
/**
 * Packer implementation bound to the `write` kernel of the v210 OpenCL program.
 *
 * The constructor configures the 10-bit level constants and the buffer / work
 * sizes; `getKernelParams` maps generic parameters onto the kernel's arguments.
 */
export class Writer extends PackImpl {
  /**
   * @param width - image width in pixels
   * @param height - image height in lines
   * @param interlaced - whether the output is interlaced; when true only half
   *   as many work items are launched per kernel run
   */
  constructor(width: number, height: number, interlaced: boolean) {
    super('v210', width, height, v210Kernel, 'write')
    this.interlaced = interlaced
    // 10-bit sample levels
    this.numBits = 10
    this.lumaBlack = 64
    this.lumaWhite = 940
    this.chromaRange = 896
    this.isRGB = false
    // a single buffer of `height` lines, each `getPitchBytes(width)` bytes long
    this.numBytes = [getPitchBytes(this.width) * this.height]
    // one work group per image line, each work item covering pixelsPerWorkItem pixels
    this.workItemsPerGroup = getPitch(this.width) / pixelsPerWorkItem
    // NOTE(review): halved for interlaced output, presumably because each run
    // writes a single field - confirm against the write kernel
    this.globalWorkItems = (this.workItemsPerGroup * this.height) / (this.interlaced ? 2 : 1)
  }

  /**
   * Builds the argument set for the `write` kernel.
   *
   * @param params - must carry `dests` (an array holding exactly one OpenCL
   *   buffer), plus `source`, `colMatrix` and `gammaLut`; `interlace` is read
   *   only when this writer was constructed as interlaced
   * @returns the kernel arguments `input`, `output`, `width`, `interlace`,
   *   `colMatrix` and `gammaLut`
   * @throws Error when `dests` is not an array containing exactly one buffer
   */
  getKernelParams(params: KernelParams): KernelParams {
    const dests: unknown = params.dests
    // validate at runtime before asserting the element type, so that a missing
    // `dests` produces the descriptive error rather than a TypeError
    if (!Array.isArray(dests) || dests.length !== 1)
      throw new Error(`Writer for ${this.name} requires dests parameter with 1 OpenCL buffer`)
    const dstArray = dests as Array<OpenCLBuffer>

    return {
      input: params.source,
      output: dstArray[0],
      width: this.width,
      // a progressive writer ignores any interlace value supplied by the caller
      interlace: this.interlaced ? (params.interlace as Interlace) : Interlace.Progressive,
      colMatrix: params.colMatrix,
      gammaLut: params.gammaLut
    }
  }
}