import { type DocumentPrediction, type Point } from './documentDetection';

/** Axis-aligned rectangle described by the coordinates of its four edges. */
export type Rect = {
  top: number;
  left: number;
  bottom: number;
  right: number;
};

/** Axis-aligned rectangle described by its origin and its size. */
export type Crop = {
  x: number;
  y: number;
  width: number;
  height: number;
};

export type Size = {
  width: number;
  height: number;
};

/** Geometry derived from a single prediction by `DocumentDetector`. */
export type PredictionAnalysis = {
  // The corners of the document in the original frame in pixels
  corners: DocumentCorners;
  // The corners of the document in the original frame relative to the frame size
  cornersRelative: DocumentCorners;
  // Length in pixels of the longest side of the document quadrangle
  widestSide: number;
  // Mean of the four corners, in pixels
  documentCenter: Point;
  // Angle in radians of the line going from the middle of the left side
  // to the middle of the right side
  rotation: number;
  // Area of the document quadrangle in square pixels
  area: number;
  // Whether the out-of-frame probability of the prediction exceeds the
  // configured threshold
  outOfCameraFrame: boolean;
};

// The order of the corners is top-left, top-right, bottom-right, bottom-left
export type DocumentCorners = [Point, Point, Point, Point];

/** A prediction together with the frame it came from and its analysis. */
export type TimedPrediction = {
  timestamp: number;
  originalFrameSize: Size;
  prediction: DocumentPrediction;
  analysis: PredictionAnalysis;
};

export type DocumentDetectorResult = {
  // The predictions passed to `processFrame`, unchanged
  prediction: DocumentPrediction[];
  // The documents that were matched or created while processing the frame
  trackedDocuments: TrackedDocument[];
};

export enum TrackedDocumentStatus {
  OK = 'ok',
  TOO_FAR = 'too-far',
  TOO_CLOSE = 'too-close',
  TOO_MUCH_ROTATION = 'too-much-rotation',
  TOO_SKEWED = 'too-skewed',
  TOO_FAST_MOVEMENT = 'too-fast-movement',
  OUT_OF_CAMERA_FRAME = 'out-of-camera-frame',
  NOT_ENOUGH_FRAMES = 'not-enough-frames',
  TOO_FAR_FROM_CENTER = 'too-far-from-center',
}

export type RotationDegree = '0deg' | '90deg' | '180deg' | '270deg';

// Maximum number of predictions kept in the timeline of a tracked document
const MAX_TIMELINE_LENGTH = 100;

/**
 * Edge coordinates of a document quadrangle.
 *
 * This relies on the corner order (top-left, top-right, bottom-right,
 * bottom-left): each edge is computed from the two corners that are expected
 * to lie on it, not from all four corners, so it is only a true bounding box
 * while the document is not strongly rotated.
 */
function edgeBounds(corners: DocumentCorners): Rect {
  const [topLeft, topRight, bottomRight, bottomLeft] = corners;
  return {
    top: Math.min(topLeft.y, topRight.y),
    bottom: Math.max(bottomLeft.y, bottomRight.y),
    left: Math.min(topLeft.x, bottomLeft.x),
    right: Math.max(topRight.x, bottomRight.x),
  };
}

/**
 * Smallest absolute difference between two angles in radians, in [0, PI].
 *
 * A plain subtraction of two `Math.atan2` results is wrong around the +/-PI
 * discontinuity: two almost identical directions, one slightly below PI and
 * one slightly above -PI, would appear to differ by almost 2 * PI.
 */
function angleDifference(a: number, b: number): number {
  const fullTurn = 2 * Math.PI;
  const diff = Math.abs(a - b) % fullTurn;
  return diff > Math.PI ? fullTurn - diff : diff;
}

/**
 * A document followed over several frames.
 *
 * Holds the recent predictions of the document and the quality metrics
 * computed from the most recent one. The metrics are refreshed by `update`;
 * the status is set separately through `updateStatus`.
 */
export class TrackedDocument {
  // NB: private fields cannot be shared between worklets
  timeline: TimedPrediction[];
  status: TrackedDocumentStatus = TrackedDocumentStatus.NOT_ENOUGH_FRAMES;
  // Number of times `update` was called (the prediction passed to the
  // constructor is not counted)
  framesNumber: number = 0;
  // Absolute value of the rotation of the last prediction, in radians
  rotationAbsolute: number = 0;
  // The minimum gap between the document and the frame border,
  // relative to the frame size
  frameGap: number = 0;
  // How close the document is to the camera from 0 to 1
  closeToCamera: number = 0;
  // How close the document is to the center of the frame from 0 to 1
  closeToCenter: number = 0;
  // The skew angle of the document in radians
  skewAngle: number = 0;
  // The speed of the document relative to the frame size
  speed: number = 0;
  outOfCameraFrame: boolean = false;

  constructor(timedPrediction: TimedPrediction) {
    this.timeline = [timedPrediction];
  }

  /** The most recent prediction of this document. */
  get lastPrediction(): TimedPrediction {
    // Timeline is never empty
    return this.timeline[this.timeline.length - 1]!;
  }

  /** Timestamp of the most recent prediction. */
  get timestamp() {
    return this.lastPrediction.timestamp;
  }

  updateStatus(status: TrackedDocumentStatus) {
    this.status = status;
  }

  /**
   * Appends a prediction to the timeline and recomputes every metric from it.
   *
   * The timeline is capped to the most recent `MAX_TIMELINE_LENGTH` entries.
   * The status is not touched here.
   */
  update(timedPrediction: TimedPrediction) {
    this.timeline.push(timedPrediction);
    if (this.timeline.length > MAX_TIMELINE_LENGTH) {
      this.timeline = this.timeline.slice(-MAX_TIMELINE_LENGTH);
    }

    this.framesNumber += 1;
    this.rotationAbsolute = Math.abs(timedPrediction.analysis.rotation);
    this.frameGap = this.calcFrameGap(timedPrediction);
    this.closeToCamera = this.calcCloseToCamera(timedPrediction);
    this.closeToCenter = this.calcCloseToCenter(timedPrediction);
    this.skewAngle = this.calcSkewAngle(timedPrediction);
    this.outOfCameraFrame = timedPrediction.analysis.outOfCameraFrame;

    // The entry just before the one appended above
    const prev = this.timeline[this.timeline.length - 2];
    if (prev) {
      this.speed = this.calcSpeed(prev, timedPrediction);
    }
  }

  // Average corner displacement per unit of timestamp between two
  // predictions, divided by the widest side of the frame.
  // The result is not finite when both predictions share the same timestamp.
  private calcSpeed(prev: TimedPrediction, last: TimedPrediction) {
    const timeDiff = last.timestamp - prev.timestamp;
    const distance = quadrangleMovement(
      prev.analysis.corners,
      last.analysis.corners
    );
    const speed = distance / timeDiff;
    const widestFrameSide = Math.max(
      last.originalFrameSize.width,
      last.originalFrameSize.height
    );
    return speed / widestFrameSide;
  }

  // Calculate the minimum gap between the document and the frame
  private calcFrameGap(prediction: TimedPrediction): number {
    const { top, bottom, left, right } = edgeBounds(
      prediction.analysis.corners
    );
    const { width, height } = prediction.originalFrameSize;

    // Each gap is expressed relative to the frame dimension it is measured along
    return Math.min(
      top / height,
      (height - bottom) / height,
      left / width,
      (width - right) / width
    );
  }

  // The larger of the fractions of the frame height and of the frame width
  // covered by the document
  private calcCloseToCamera(prediction: TimedPrediction): number {
    const { top, bottom, left, right } = edgeBounds(
      prediction.analysis.corners
    );
    const { width, height } = prediction.originalFrameSize;

    return Math.max((bottom - top) / height, (right - left) / width);
  }

  // 1 when the document center is the frame center, 0 when it is as far
  // from it as a frame corner (half of the frame diagonal)
  private calcCloseToCenter(prediction: TimedPrediction): number {
    const { width, height } = prediction.originalFrameSize;
    const frameCenter: Point = { x: width / 2, y: height / 2 };
    const maxDistance = Math.sqrt(width ** 2 + height ** 2) / 2;
    const distanceToCenter = calcDistance(
      frameCenter,
      prediction.analysis.documentCenter
    );
    return 1 - distanceToCenter / maxDistance;
  }

  // The larger of the angle between the top and bottom sides and the angle
  // between the left and right sides, in radians
  private calcSkewAngle(prediction: TimedPrediction): number {
    const [topLeft, topRight, bottomRight, bottomLeft] =
      prediction.analysis.corners;

    // Horizontal sides are measured against the x axis
    const topRotation = Math.atan2(
      topRight.y - topLeft.y,
      topRight.x - topLeft.x
    );
    const bottomRotation = Math.atan2(
      bottomRight.y - bottomLeft.y,
      bottomRight.x - bottomLeft.x
    );
    // Vertical sides are measured against the y axis (arguments are swapped)
    const leftRotation = Math.atan2(
      bottomLeft.x - topLeft.x,
      bottomLeft.y - topLeft.y
    );
    const rightRotation = Math.atan2(
      bottomRight.x - topRight.x,
      bottomRight.y - topRight.y
    );

    // Wrap-aware differences, so that a document rotated by about 180 degrees
    // is not reported as skewed by almost a full turn
    const horizontalSkew = angleDifference(topRotation, bottomRotation);
    const verticalSkew = angleDifference(leftRotation, rightRotation);
    return Math.max(horizontalSkew, verticalSkew);
  }
}

export type DocumentDetectorConfig = {
  // The size of the frame used for the document detection
  frameSize: Size;

  // The threshold in probability for a document to be considered out of the frame
  outOfCameraFrameThreshold: number;

  // The time in milliseconds after which a document is no longer tracked
  // if it is not detected again or moved too fast
  documentTrackTimeout: number;

  // The maximum speed of the document relative to its size per millisecond
  // The document is not tracked if it moves faster than this
  // (e.g. 0.0001 means the document can move 10% of its size per second)
  // NOTE(review): the matching code divides the distance by the square root
  // of the elapsed time, not by the elapsed time itself, so the example above
  // is only approximate - confirm the intended unit
  documentTrackSpeedThreshold: number;

  // The minimum number of frames a document must be tracked
  // to get quality checks
  documentMinFrames: number;

  // The maximum movement speed of the document relative to the frame size per millisecond
  // (e.g. 0.0001 means the document can move 10% of the frame size per second)
  documentMaxMovementSpeed: number;

  // The maximum rotation of the document in radians
  documentMaxRotation: number;

  // The minimum gap between the document and the frame
  // in percentage of the frame size (e.g. 0.05)
  documentMinFrameGap: number;

  // The minimum share of the frame the document must cover, from 0 to 1
  // (e.g. 0.5); a document covering less is reported as too far
  documentMinCloseToCamera: number;

  // The maximum share of the frame the document may cover, from 0 to 1
  // (e.g. 0.8); a document covering more is reported as too close
  documentMaxCloseToCamera: number;

  // The minimum closeness of the document to the center of the frame from 0 to 1
  // 1 means the document is in the center, 0 means it is at the corner of the frame
  // e.g. 0.95 means the document is at most 5% away from the center
  documentMinCloseToCenter: number;

  // The maximum skew angle of the document in radians
  documentMaxSkewAngle: number;
};

/**
 * Tracks documents across frames and rates each tracked document.
 *
 * Every call to `processFrame` matches the predictions of the frame with the
 * documents tracked so far, updates the matched documents and starts tracking
 * the predictions that could not be matched.
 */
export class DocumentDetector {
  private trackedDocuments: TrackedDocument[] = [];

  /**
   * @throws Error when one of the validated configuration values is outside
   * its allowed range
   */
  constructor(private readonly config: DocumentDetectorConfig) {
    if (config.documentTrackTimeout <= 0) {
      throw new Error('documentTrackTimeout must be greater than 0');
    }
    if (config.documentTrackSpeedThreshold <= 0) {
      throw new Error('documentTrackSpeedThreshold must be greater than 0');
    }
    if (config.documentMinFrames <= 1) {
      throw new Error('documentMinFrames must be greater than 1');
    }
    if (config.documentMaxMovementSpeed <= 0) {
      throw new Error('documentMaxMovementSpeed must be greater than 0');
    }
    if (config.documentMaxRotation < 0) {
      throw new Error('documentMaxRotation cannot be negative');
    }
    if (config.documentMinFrameGap <= 0) {
      throw new Error('documentMinFrameGap must be greater than 0');
    }
    if (config.documentMinCloseToCamera >= 1) {
      throw new Error('documentMinCloseToCamera must be less than 1');
    }
    if (config.documentMaxCloseToCamera <= 0) {
      throw new Error('documentMaxCloseToCamera must be greater than 0');
    }
    if (config.documentMinCloseToCenter >= 1) {
      throw new Error('documentMinCloseToCenter must be less than 1');
    }
    if (config.documentMaxSkewAngle < 0) {
      throw new Error('documentMaxSkewAngle cannot be negative');
    }
  }

  /**
   * Processes the predictions of one frame.
   *
   * @param timestamp Timestamp of the frame, in the unit used by
   * `documentTrackTimeout`
   * @param documentPredictions Predictions found in the frame
   * @param originalFrameSize Size in pixels of the frame the relative
   * prediction points are scaled to
   * @returns The given predictions and the documents matched or created
   * during this call
   */
  processFrame(
    timestamp: number,
    documentPredictions: DocumentPrediction[],
    originalFrameSize: Size
  ): DocumentDetectorResult {
    this.removeOldTrackedDocuments(timestamp);

    const timedPredictions: TimedPrediction[] = documentPredictions.map(
      (prediction) => ({
        timestamp,
        originalFrameSize,
        prediction,
        analysis: this.analyzePrediction(prediction, originalFrameSize),
      })
    );

    const timedPredictionsUpdated = new Set<TimedPrediction>();
    const trackedDocumentsUpdated = new Set<TrackedDocument>();

    // First try to match the documents with the least movement
    const trackedDocumentMatches = timedPredictions
      .flatMap((timedPrediction) =>
        this.matchTrackedDocuments(timedPrediction).map(
          ({ tracked, speed }) => ({ tracked, timedPrediction, speed })
        )
      )
      .sort((a, b) => a.speed - b.speed);

    for (const match of trackedDocumentMatches) {
      // A tracked document and a prediction are each used at most once
      if (
        trackedDocumentsUpdated.has(match.tracked) ||
        timedPredictionsUpdated.has(match.timedPrediction)
      ) {
        continue;
      }

      trackedDocumentsUpdated.add(match.tracked);
      timedPredictionsUpdated.add(match.timedPrediction);
      match.tracked.update(match.timedPrediction);
      this.updateStatus(match.tracked);
    }

    // Every prediction left without a match starts a new tracked document
    const notMatched = timedPredictions.filter(
      (timedPrediction) => !timedPredictionsUpdated.has(timedPrediction)
    );
    for (const timedPrediction of notMatched) {
      const trackedDocument = new TrackedDocument(timedPrediction);
      this.trackedDocuments.push(trackedDocument);
      trackedDocumentsUpdated.add(trackedDocument);
    }

    return {
      prediction: documentPredictions,
      trackedDocuments: [...trackedDocumentsUpdated],
    };
  }

  // Sets the status to the first failing check, in the order below,
  // or to OK when every check passes
  private updateStatus(trackedDocument: TrackedDocument) {
    if (trackedDocument.outOfCameraFrame) {
      trackedDocument.updateStatus(TrackedDocumentStatus.OUT_OF_CAMERA_FRAME);
      return;
    }
    if (trackedDocument.framesNumber < this.config.documentMinFrames) {
      trackedDocument.updateStatus(TrackedDocumentStatus.NOT_ENOUGH_FRAMES);
      return;
    }
    if (trackedDocument.speed > this.config.documentMaxMovementSpeed) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_FAST_MOVEMENT);
      return;
    }
    // A document touching the frame border is reported as too close
    if (trackedDocument.frameGap < this.config.documentMinFrameGap) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_CLOSE);
      return;
    }
    if (trackedDocument.closeToCamera < this.config.documentMinCloseToCamera) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_FAR);
      return;
    }
    if (trackedDocument.closeToCamera > this.config.documentMaxCloseToCamera) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_CLOSE);
      return;
    }
    if (trackedDocument.closeToCenter < this.config.documentMinCloseToCenter) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_FAR_FROM_CENTER);
      return;
    }
    if (trackedDocument.rotationAbsolute > this.config.documentMaxRotation) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_MUCH_ROTATION);
      return;
    }
    if (trackedDocument.skewAngle > this.config.documentMaxSkewAngle) {
      trackedDocument.updateStatus(TrackedDocumentStatus.TOO_SKEWED);
      return;
    }
    trackedDocument.updateStatus(TrackedDocumentStatus.OK);
  }

  // Drops the documents whose last prediction is older than the timeout
  private removeOldTrackedDocuments(timestamp: number) {
    this.trackedDocuments = this.trackedDocuments.filter(
      (trackedDocument) =>
        timestamp - trackedDocument.timestamp <
        this.config.documentTrackTimeout
    );
  }

  // Returns the tracked documents the prediction may belong to,
  // slowest (most likely) first
  private matchTrackedDocuments(
    documentPrediction: TimedPrediction
  ): { tracked: TrackedDocument; speed: number }[] {
    return this.trackedDocuments
      .map((tracked) => {
        const lastPrediction = tracked.lastPrediction;
        const timeDiff =
          documentPrediction.timestamp - lastPrediction.timestamp;
        const distance = calcDistance(
          documentPrediction.analysis.documentCenter,
          lastPrediction.analysis.documentCenter
        );
        // Use square root of time difference to give more weight to recent frames
        const absoluteSpeed = distance / Math.sqrt(timeDiff);
        // Speed relative to the size of the document
        const speed = absoluteSpeed / lastPrediction.analysis.widestSide;
        return { tracked, speed };
      })
      // A non-finite speed (e.g. identical timestamps) never passes this test
      .filter((match) => match.speed <= this.config.documentTrackSpeedThreshold)
      .sort((a, b) => a.speed - b.speed);
  }

  // Derives the pixel-space geometry of a prediction
  private analyzePrediction(
    prediction: DocumentPrediction,
    originalFrameSize: Size
  ): PredictionAnalysis {
    const corners = this.getOriginalDocumentCorners(
      prediction,
      originalFrameSize
    );
    const documentCenter = getDocumentCenter(corners);

    const [topLeft, topRight, bottomRight, bottomLeft] = corners;
    const widestSide = Math.max(
      calcDistance(topLeft, topRight),
      calcDistance(topRight, bottomRight),
      calcDistance(bottomRight, bottomLeft),
      calcDistance(bottomLeft, topLeft)
    );

    // Rotation is the direction from the middle of the left side
    // to the middle of the right side
    const meanLeft = meanPoint(topLeft, bottomLeft);
    const meanRight = meanPoint(topRight, bottomRight);
    const rotation = Math.atan2(
      meanRight.y - meanLeft.y,
      meanRight.x - meanLeft.x
    );

    const area = quadrangleArea(corners);
    const outOfCameraFrame =
      prediction.probOutFromFrame > this.config.outOfCameraFrameThreshold;

    return {
      corners,
      cornersRelative: prediction.points as DocumentCorners,
      widestSide,
      documentCenter,
      rotation,
      area,
      outOfCameraFrame,
    };
  }

  // Scales the relative prediction points to pixels of the original frame
  private getOriginalDocumentCorners(
    prediction: DocumentPrediction,
    originalFrameSize: Size
  ): DocumentCorners {
    return prediction.points.map((point) => ({
      x: point.x * originalFrameSize.width,
      y: point.y * originalFrameSize.height,
    })) as DocumentCorners;
  }
}

/** Center of a document, computed as the mean of its four corners. */
export function getDocumentCenter(corners: DocumentCorners): Point {
  // TODO: implement a more precise center calculation
  return meanPoint(...corners);
}

/**
 * Crop rectangle of the most recent prediction of a tracked document.
 *
 * Equivalent to `getPredictionCrop` applied to the relative corners of the
 * last timeline entry, without padding.
 *
 * @param document Tracked document; its timeline must not be empty
 * @param rotation Rotation undone (counter-clockwise) before cropping
 * @param frame Size of the frame the crop is expressed in
 */
export function getDocumentCrop(
  document: TrackedDocument,
  { rotation, frame }: { rotation: RotationDegree; frame: Size }
): Crop {
  // The timeline is indexed directly instead of going through the
  // `lastPrediction` getter.
  // NOTE(review): presumably so that this also works on a copy of the
  // document that lost its class getters - confirm before changing
  const lastPrediction = document.timeline[document.timeline.length - 1]!;
  return getPredictionCrop(lastPrediction.analysis.cornersRelative, {
    rotation,
    frame,
  });
}

// Point whose coordinates are the means of the coordinates of the given points
function meanPoint(...points: Point[]): Point {
  return {
    x: mean(points.map((point) => point.x)),
    y: mean(points.map((point) => point.y)),
  };
}

/** Arithmetic mean of the values; `NaN` for an empty array. */
export function mean(values: number[]): number {
  return values.reduce((acc, value) => acc + value, 0) / values.length;
}

// Euclidean distance between two points
function calcDistance(pointA: Point, pointB: Point): number {
  return Math.sqrt((pointA.x - pointB.x) ** 2 + (pointA.y - pointB.y) ** 2);
}

/** Area of the triangle `a`, `b`, `c`, whatever the order of its vertices. */
export function triangleArea(a: Point, b: Point, c: Point): number {
  return Math.abs(
    (a.x * (b.y - c.y) + b.x * (c.y - a.y) + c.x * (a.y - b.y)) / 2
  );
}

/**
 * Area of a quadrangle, computed as the sum of the two triangles obtained by
 * splitting it along the diagonal from the first to the third corner.
 */
export function quadrangleArea(points: DocumentCorners): number {
  const [a, b, c, d] = points;
  return triangleArea(a, b, c) + triangleArea(a, c, d);
}

/** Average distance between the corresponding corners of two quadrangles. */
export function quadrangleMovement(
  quadA: DocumentCorners,
  quadB: DocumentCorners
): number {
  let totalDistance = 0;
  for (let i = 0; i < 4; i++) {
    totalDistance += calcDistance(quadA[i]!, quadB[i]!);
  }

  // Average distance between corners
  return totalDistance / 4;
}

/**
 * Rotates points given in relative coordinates (a unit square) clockwise.
 *
 * Any rotation other than 90, 180 or 270 degrees returns the input array
 * itself. Marked as a worklet.
 */
export function rotatePointsClockwise(
  rotation: RotationDegree,
  points: Point[]
): Point[] {
  'worklet';
  switch (rotation) {
    case '90deg':
      return points.map(({ x, y }) => ({ x: 1 - y, y: x }));
    case '180deg':
      return points.map(({ x, y }) => ({ x: 1 - x, y: 1 - y }));
    case '270deg':
      return points.map(({ x, y }) => ({ x: y, y: 1 - x }));
    default:
      return points;
  }
}

/**
 * Rotates document corners clockwise and reorders them so that the result is
 * again listed as top-left, top-right, bottom-right, bottom-left.
 * Marked as a worklet.
 */
export function rotateCornersClockwise(
  rotation: RotationDegree,
  originalCorners: DocumentCorners
): DocumentCorners {
  'worklet';
  const corners = rotatePointsClockwise(
    rotation,
    originalCorners
  ) as DocumentCorners;

  // After the rotation a different original corner ends up top-left,
  // so the tuple is shifted accordingly
  switch (rotation) {
    case '90deg':
      return [corners[3], corners[0], corners[1], corners[2]];
    case '180deg':
      return [corners[2], corners[3], corners[0], corners[1]];
    case '270deg':
      return [corners[1], corners[2], corners[3], corners[0]];
    default:
      return corners;
  }
}

/**
 * Rotates document corners counter-clockwise, implemented as the clockwise
 * rotation by the complementary angle. Marked as a worklet.
 */
export function rotateCornersCounterClockwise(
  rotation: RotationDegree,
  corners: DocumentCorners
): DocumentCorners {
  'worklet';
  const clockwiseRotationMap: Record<RotationDegree, RotationDegree> = {
    '0deg': '0deg',
    '90deg': '270deg',
    '180deg': '180deg',
    '270deg': '90deg',
  };
  return rotateCornersClockwise(clockwiseRotationMap[rotation], corners);
}

/**
 * Crop rectangle, in frame coordinates, around the given relative corners.
 *
 * The corners are rotated counter-clockwise by `rotation`, scaled by the
 * narrowest side of the frame and shifted by half of the difference between
 * the two frame sides (along x for 90/270 degrees, along y otherwise).
 * NOTE(review): this looks like the relative coordinates refer to a centered
 * square region of the frame - confirm against the code producing them.
 * Marked as a worklet.
 *
 * @param corners Four corners in relative coordinates, ordered top-left,
 * top-right, bottom-right, bottom-left
 * @param rotation Rotation undone before cropping
 * @param frame Size of the frame the crop is expressed in
 * @param padding Margin added on every side, as a fraction of the crop
 * width (horizontally) and height (vertically)
 */
export function getPredictionCrop(
  corners: Point[],
  {
    rotation,
    frame,
    padding = 0.0,
  }: { rotation: RotationDegree; frame: Size; padding?: number }
): Crop {
  'worklet';
  const rotatedCorners = rotateCornersCounterClockwise(
    rotation,
    corners as DocumentCorners
  ) as DocumentCorners;
  const rotatedX = rotation === '90deg' || rotation === '270deg';

  const widestSide = Math.max(frame.width, frame.height);
  const narrowestSide = Math.min(frame.width, frame.height);
  const croppedLength = widestSide - narrowestSide;
  const diffX = rotatedX ? croppedLength / 2 : 0;
  const diffY = rotatedX ? 0 : croppedLength / 2;

  const frameCorners = rotatedCorners.map((corner) => ({
    x: corner.x * narrowestSide + diffX,
    y: corner.y * narrowestSide + diffY,
  })) as DocumentCorners;

  const topLeft = frameCorners[0];
  const topRight = frameCorners[1];
  const bottomRight = frameCorners[2];
  const bottomLeft = frameCorners[3];

  // Each edge is taken from the two corners expected to lie on it
  const top = Math.min(topLeft.y, topRight.y);
  const left = Math.min(topLeft.x, bottomLeft.x);
  const bottom = Math.max(bottomLeft.y, bottomRight.y);
  const right = Math.max(topRight.x, bottomRight.x);

  const width = right - left;
  const height = bottom - top;
  const paddingX = width * padding;
  const paddingY = height * padding;

  const crop = {
    x: left - paddingX,
    y: top - paddingY,
    // TODO: consider rotation?
    width: width + paddingX * 2,
    height: height + paddingY * 2,
  };

  return crop;
}