;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;// 
;// File Name:  omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;// 
;// 
;// 
;//
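;// Description:
;// Motion compensated 8x8 block reconstruction (omxVCM4P2_MCReconBlock),
;// optimised implementation for ARM1136JS
;//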

;// Include standard headers
    INCLUDE omxtypes_s.h
    INCLUDE armCOMM_s.h

;// Import symbols required from other files

    M_VARIANTS ARM1136JS

;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************
    IF  ARM1136JS
    
;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
    ;// Description:
    ;//
    ;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
    ;//
    ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to
    ;// each sum before dividing by two, if round is 1
    ;//
    ;// Syntax:
    ;// M_UHADD8R   $dest, $x, $y, $round, $mask
    ;//
    ;// Inputs:
    ;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
    ;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
    ;// $round    0 if no rounding to be added, 1 if rounding to be done
    ;// $mask     some register set to 0x80808080
    ;//
    ;// Outputs:
    ;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]

    MACRO
    M_UHADD8R   $dest, $x, $y, $round, $mask
    ;// Per-byte halving add of $x and $y into $dest, optionally rounded up.
    ;//
    ;// The rounded form is built from a halving subtract:
    ;//     (y - (255 - x)) >> 1  =  ((x + y + 1) >> 1) - 128
    ;// and the final EOR with 0x80 in every byte lane (supplied in $mask)
    ;// puts the 128 back. $mask is not touched when no rounding is requested.
    ;//
    ;// The complemented operand is written to $dest first, so it must not
    ;// be the operand that is still needed afterwards: when $dest is the
    ;// same register as $y it is $y that gets complemented, otherwise $x.
        LCLS    uhInv           ;// operand that is complemented into $dest
        LCLS    uhKeep          ;// operand that is used unmodified
    IF $round /= 1
        UHADD8      $dest, $x, $y
    ELSE
        IF  $dest /= $y
uhInv   SETS "$x"
uhKeep  SETS "$y"
        ELSE
uhInv   SETS "$y"
uhKeep  SETS "$x"
        ENDIF
        MVN         $dest, $uhInv
        UHSUB8      $dest, $uhKeep, $dest
        EOR         $dest, $dest, $mask
    ENDIF
    MEND
;// ***************************************************************************
    ;// Description:
    ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
    ;//
    ;// Syntax:
    ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
    ;// 
    ;// Inputs:
    ;// $pSrc       4 byte aligned source pointer to an address just less than 
    ;//             or equal to the data location
    ;// $srcStep    The stride on source
    ;// $scratch    A scratch register, used internally for temp calculations
    ;// $offset     Difference of source data location to the source pointer
    ;//             Use when $offset != 0 (unaligned load)
    ;//
    ;// Outputs:
    ;// $pSrc       Incremented by $srcStep after the load, for every value
    ;//             of $offset
    ;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
    ;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
    ;//
    ;// Note: {$out0, $out1, $scratch} should be registers with ascending
    ;// register numbering. In case offset is 0, $scratch is not modified.

    MACRO
    M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        ;// Fetch the words that contain the 8 wanted bytes: two words when
        ;// the data starts on the aligned pointer, three when it is
        ;// displaced by 1..3 bytes and so spills into a third word.
        IF $offset /= 0
            LDM         $pSrc, {$out0, $out1, $scratch} 
        ELSE
            LDM         $pSrc, {$out0, $out1}
        ENDIF

        ;// Step to the next row straight after the load
        ADD         $pSrc, $pSrc, $srcStep

        ;// Displaced data only: slide each output down by the byte offset
        ;// and fill its top bytes from the bottom of the following word
        IF $offset /= 0
            MOV         $out0, $out0, LSR #(8 * ($offset))
            ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
            MOV         $out1, $out1, LSR #(8 * ($offset))
            ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
        ENDIF
    MEND

;// ***************************************************************************
    ;// Description:
    ;// Loads three words for X interpolation and updates the pointer to the 
    ;// next row. For X interpolation, given a source pointer truncated to 
    ;// 4 byte alignment, three contiguous words starting there are always
    ;// required to cover the nine bytes needed from the source for filtering. 
    ;//
    ;// Syntax:
    ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    ;// 
    ;// Inputs:
    ;// $pSrc       4 byte aligned source pointer to an address just less than 
    ;//             or equal to the data location
    ;//
    ;// $srcStep    The stride on source
    ;//
    ;// $offset     Difference of source data location to the source pointer
    ;//             Use when $offset != 0 (unaligned load)
    ;//
    ;// Outputs:
    ;// $pSrc       Incremented by $srcStep
    ;//
    ;// $word0, $word1, $word2, $word3
    ;//             Three of these are outputs based on the $offset parameter. 
    ;//             The outputs are specifically generated to be processed by 
    ;//             the M_EXT_XINT macro. Following is the illustration to show 
    ;//             how the nine bytes are spanned for different offsets from 
    ;//             notTruncatedForAlignmentSourcePointer.
    ;//
    ;//              ------------------------------------------------------
    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    ;//             |------------------------------------------------------|
    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
    ;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
    ;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
    ;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
    ;//              ------------------------------------------------------
    ;// 
    ;//             where the numbering (0-8) is to designate the 9 bytes from
    ;//             start of a particular row. The illustration doesn't take in 
    ;//             account the positioning of bytes with in the word and the 
    ;//             macro combination with M_EXT_XINT will work only in little 
    ;//             endian environs
    ;// 
    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    ;// register numbering

    MACRO
    M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        ;// The same three consecutive memory words are read in every case;
        ;// only the registers that receive them differ. For an offset of 3
        ;// the second and third words are already the one pixel shifted
        ;// row, so they are loaded directly into the last two registers
        ;// and $word1 is left for M_EXT_XINT to build.
        IF $offset = 3
            LDM         $pSrc, {$word0, $word2, $word3}
        ELSE
            LDM         $pSrc, {$word0, $word1, $word2}
        ENDIF
        ;// Advance to the next source row
        ADD         $pSrc, $pSrc, $srcStep
    MEND

;// ***************************************************************************
    ;// Description:
    ;// Extract four registers of four pixels for X interpolation 
    ;// 
    ;// Syntax:
    ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
    ;// 
    ;// Inputs:
    ;// $offset     Difference of source data location to the source pointer
    ;//             Use when $offset != 0 (unaligned load)
    ;// 
    ;// $word0, $word1, $word2, $word3
    ;//             Three of these are inputs based on the $offset parameter. 
    ;//             The inputs are specifically selected to be processed by 
    ;//             the M_EXT_XINT macro.
    ;//
    ;//              ------------------------------------------------------
    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    ;//             |------------------------------------------------------|
    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
    ;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
    ;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
    ;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
    ;//              ------------------------------------------------------
    ;// 
    ;// Outputs:
    ;// $word0, $word1, $word2, $word3
    ;//             Bytes from the original source pointer (not truncated for
    ;//             4 byte alignment) as shown in the table. 
    ;//              -------------------------------
    ;//             | word0 | word1 | word2 | word3 |
    ;//             |-------------------------------|
    ;//             | 0123  | 4567  | 1234  | 5678  |
    ;//              -------------------------------
    ;//
    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
    ;// register numbering

    MACRO
    M_EXT_XINT $offset, $word0, $word1, $word2, $word3
        ;// On exit $word0:$word1 hold row bytes 0..7 and $word2:$word3 hold
        ;// row bytes 1..8. Within every case the instruction order matters:
        ;// a register is only overwritten once its old contents have been
        ;// consumed.
        IF $offset = 3
            ;// $word2 and $word3 already hold bytes 1..8 (arranged by the
            ;// load). Byte 0 is the top byte of $word0; bytes 0..7 are
            ;// rebuilt from it and from the shifted row.
            MOV         $word0, $word0, LSR #24
            ORR         $word0, $word0, $word2, LSL #8
            MOV         $word1, $word2, LSR #24
            ORR         $word1, $word1, $word3, LSL #8
        ELIF $offset = 0
            ;// $word0 and $word1 already hold bytes 0..7; the shifted row
            ;// is the same data moved down by one byte, with byte 8 taken
            ;// from the bottom of $word2.
            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #24
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ELSE
            ;// Offsets 1 and 2: first realign bytes 0..7 ...
            MOV         $word0, $word0, LSR #(8 * ($offset))
            ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
            MOV         $word1, $word1, LSR #(8 * ($offset))
            ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))

            ;// ... then derive bytes 1..8 from the realigned pair. The
            ;// bytes of $word2 that land below the top byte of $word3
            ;// repeat values that are already there, so the ORR is safe.
            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #(24 - 8 * ($offset))
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ENDIF
    MEND

;// ***************************************************************************
    ;// Description:
    ;// Computes half-sum and xor of two inputs and puts them in the input 
    ;// registers in that order
    ;//
    ;// Syntax:
    ;// M_HSUM_XOR      $v0, $v1, $tmp
    ;// 
    ;// Inputs:
    ;// $v0         a, first input
    ;// $v1         b, second input
    ;// $tmp        scratch register
    ;// 
    ;// Outputs:
    ;// $v0         (a + b)/2
    ;// $v1         a ^ b

    MACRO
    M_HSUM_XOR      $v0, $v1, $tmp
        ;// The half-sum is formed in $tmp because $v0 is still needed as an
        ;// input of the EOR; it is copied into $v0 last. $tmp is left
        ;// holding a copy of the half-sum.
        UHADD8      $tmp, $v0, $v1     ;// s0 = (a + b) >> 1, per byte
        EOR         $v1, $v0, $v1      ;// l0 = a ^ b
        MOV         $v0, $tmp          ;// $v0 = s0
    MEND
;// ***************************************************************************
    ;// Description:
    ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in 
    ;// mcReconBlock module. Very specific to the implementation of 
    ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and 
    ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are 
    ;// not significant and are used by the callee for row counter (y)
    ;//
    ;// Some points to note are:
    ;// 1. Input is pair of pair-averages and Xors
    ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another 
    ;//    running average
    ;// 3. Output is in the first argument
    ;//
    ;// Syntax:
    ;// M_AVG4         $sum0, $lsb0, $sum1, $lsb1, $rndVal
    ;// 
    ;// Inputs:
    ;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
    ;// $lsb0       (a ^ b)
    ;// $sum1       (c + d) >> 1. Not modified
    ;// $lsb1       (c ^ d)       Not modified
    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    ;// 
    ;// Outputs:
    ;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
    ;//             (a + b + c + d + 2) / 4 : If rounding

    MACRO
    M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
    ;// Four input average built from two pair half-sums and the matching
    ;// pair xors. Per byte lane, with
    ;//     s2 = (s0 + s1) >> 1        truncated average of the half-sums
    ;//     e  = bit 0 of (s0 ^ s1)    the bit dropped when forming s2
    ;//     l0, l1 = bit 0 of the xors the bits dropped by each half-sum
    ;// the result is s2 + carry, where
    ;//     carry = e OR  (l0 AND l1)  when $rndVal is 0  (a+b+c+d+2)/4
    ;//     carry = e AND (l0 OR  l1)  otherwise          (a+b+c+d+1)/4
    ;//
    ;// Uses the named registers tmp (scratch) and yMask (0x01010101 held
    ;// shifted left by 4) directly. $lsb0 and tmp are destroyed; $sum1 and
    ;// $lsb1 are preserved; the result is returned in $sum0.
        LCLS        opLsb       ;// combines the two dropped pair bits
        LCLS        opCarry     ;// combines that with the dropped sum bit
        IF $rndVal /= 0 ;// Not rounding case
opLsb   SETS "ORR"
opCarry SETS "AND"
        ELSE            ;// rounding case
opLsb   SETS "AND"
opCarry SETS "ORR"
        ENDIF

        $opLsb      $lsb0, $lsb0, $lsb1         ;// l0 op l1
        EOR         tmp, $sum0, $sum1           ;// s0 ^ s1
        $opCarry    tmp, tmp, $lsb0             ;// fold in the pair bits
        AND         tmp, tmp, yMask, LSR #4     ;// keep bit 0 of each byte
        UHADD8      $lsb0, $sum0, $sum1         ;// s2 = (s0 + s1) >> 1
        UADD8       $sum0, $lsb0, tmp           ;// result = s2 + carry
    MEND
;// ***************************************************************************
;// Motion compensation handler macros
;// ***************************************************************************
    ;// Description:
    ;// Implement motion compensation routines using the named registers of
    ;// the enclosing function. Each of the following 4 macros implements one
    ;// of the 4 predict types. Each predict type has 8 cases, i.e. all the
    ;// combinations of the 4 source alignment offsets and the 2 values of
    ;// the rounding flag
    ;//
    ;// Syntax:
    ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
    ;// 
    ;// Inputs:
    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
    ;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
    ;// 
    ;// Outputs:
    ;// Outputs come in the named registers of the callee functions
    ;// The macro loads the data from the source pointer, processes it and 
    ;// stores in the destination pointer. Does the whole prediction cycle
    ;// of Motion Compensation routine for a particular predictType
    ;// After this only residue addition to the predicted values remain

    MACRO
    M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for IntegerPixel predictType. Both
    ;// rounding cases are handled by the same code base, so both case labels
    ;// below mark the same address. It is just a copy from source to 
    ;// destination. Two lines are done per loop to reduce stalls. The loop
    ;// is software pipelined as well for that purpose: the loads for the
    ;// next pair of rows are issued after the stores of the current pair.
    ;// 
    ;// M_LOAD_X loads a whole row in two registers and then they are stored
    ;//
    ;// Inputs (named registers of the enclosing function):
    ;// pSrc, srcStep   source pointer truncated to 4 byte alignment, stride
    ;// pDst, dstStep   destination pointer and stride
    ;// y               number of rows to copy, reduced by 2 on every pass
    ;// $rndVal         not used, rounding makes no difference to a copy
    ;// $offset         offset of the data from pSrc, selects the load form
    ;//
    ;// Exit:
    ;// Branches to SwitchPredictTypeEnd straight after the last pair of
    ;// stores. Leaving the loop before the next pair of loads means that
    ;// only rows which are actually stored are ever read; a load placed
    ;// after the final stores would fetch two rows beyond the block.
    ;// pSrc, pDst, y, tmp1-tmp4 and the flags are modified, as is tmp5
    ;// when the offset is not 0.
    
CaseIntegerPixelRnd0Offset$offset
CaseIntegerPixelRnd1Offset$offset
    ;// Prime the pipeline with the first two rows
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
YloopIntegerPixelOffset$offset
    SUBS        y, y, #2                    ;// flags are used by BLE below
    STRD        tmp1, tmp2, [pDst], dstStep
    STRD        tmp3, tmp4, [pDst], dstStep
    BLE         SwitchPredictTypeEnd        ;// all rows stored, no more loads
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    B           YloopIntegerPixelOffset$offset
    MEND
;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelX predictType. The two
    ;// rounding cases are handled by the different code base and spanned by 
    ;// different macro calls. Loop has been software pipelined to reduce 
    ;// stalls.
    ;// 
    ;// Filtering involves averaging a pixel with the next horizontal pixel.
    ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with 
    ;// all pixels in a row with 4 pixel in each register and another 2
    ;// registers with pixels corresponding to one horizontally shifted pixel
    ;// corresponding to the initial row pixels. These are set of packed 
    ;// registers appropriate to do 4 lane SIMD.
    ;// After that M_UHADD8R macro does the averaging taking care of the 
    ;// rounding as required
    ;//
    ;// Inputs (named registers of the enclosing function):
    ;// pSrc, srcStep   source pointer truncated to 4 byte alignment, stride
    ;// pDst, dstStep   destination pointer and stride
    ;// y               number of rows to produce, reduced by 1 on every pass
    ;// $rndVal         0 selects the rounded average, 1 the truncated one
    ;// $offset         offset of the data from pSrc, selects the load form
    ;//
    ;// Exit:
    ;// Branches to SwitchPredictTypeEnd straight after the last store.
    ;// Leaving the loop before the next load means that only rows which
    ;// contribute to the output are ever read; a load placed after the
    ;// final store would fetch one row beyond the block.
    ;// pSrc, pDst, y, tmp1-tmp6 and the flags are modified, as is mask in
    ;// the rounded case.
    
CaseHalfPixelXRnd$rndVal.Offset$offset
    ;// The 0x80 lane mask is only needed by the rounded form of M_UHADD8R
    IF $rndVal = 0
        LDR mask, =0x80808080
    ENDIF

    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
YloopHalfPixelXRnd$rndVal.Offset$offset
    SUBS        y, y, #1                    ;// flags are used by BLE below
    M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
    M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask
    M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
    STRD        tmp5, tmp6, [pDst], dstStep
    BLE         SwitchPredictTypeEnd        ;// last row stored, no more loads
    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
    B           YloopHalfPixelXRnd$rndVal.Offset$offset
    MEND
;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelY predictType. The two
    ;// rounding cases are handled by the different code base and spanned by 
    ;// different macro calls. PreLoading is used to avoid reload of same data. 
    ;// 
    ;// Filtering involves averaging a pixel with the next vertical pixel.
    ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in 
    ;// each register. These are set of packed registers appropriate to do 
    ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care 
    ;// of the rounding as required
    ;//
    ;// Each pass of the loop produces two output rows. The row pair
    ;// tmp1:tmp2 and the row pair tmp3:tmp4 swap roles between the two
    ;// halves of the pass, so that every source row is loaded only once.
    ;// One row is read before the loop and two more on every pass, i.e. one
    ;// more row is read than is written. tmp5 is the scratch register of
    ;// M_LOAD_X for the unaligned offsets.
    
CaseHalfPixelYRnd$rndVal.Offset$offset
    ;// The 0x80 lane mask is only needed by the rounded form of M_UHADD8R
    IF $rndVal = 0
        LDR mask, =0x80808080
    ENDIF

    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
YloopHalfPixelYRnd$rndVal.Offset$offset
    SUBS        y, y, #2                    ;// flags are used by BGT below
    ;// Processing one line
    ;// average of the row held in tmp1:tmp2 and the row loaded here
    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
    M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
    M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
    STRD        tmp1, tmp2, [pDst], dstStep
    ;// Processing another line
    ;// average of the row held in tmp3:tmp4 and the row loaded here, which
    ;// then stays in tmp1:tmp2 for the next pass
    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset
    M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
    M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
    STRD        tmp3, tmp4, [pDst], dstStep

    BGT         YloopHalfPixelYRnd$rndVal.Offset$offset

    B           SwitchPredictTypeEnd
    MEND
;// ***************************************************************************
    MACRO
    M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelXY predictType. The two
    ;// rounding cases are handled by the different code base and spanned by 
    ;// different macro calls. PreLoading is used to avoid reload of same data. 
    ;// 
    ;// Filtering involves averaging a pixel with the next vertical, horizontal 
    ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT
    ;// and M_EXT_XINT combination generates 4 registers with a row and its
    ;// 1 pixel right shifted version, with 4 pixels in one register. Another 
    ;// call of that macro-combination gets another row. Then M_HSUM_XOR is 
    ;// called to get mutual half-sum and xor combinations of a row with its
    ;// shifted version as they are inputs to the M_AVG4 macro which computes
    ;// the 4 element average with rounding. Note that it is the half-sum/xor 
    ;// values that are preserved for next row as they can be re-used in the 
    ;// next call to the M_AVG4 and saves recomputation.
    ;// Due to lack of register, the row counter and a masking value required 
    ;// in M_AVG4 are packed into a single register yMask where the last nibble
    ;// holds the row counter values and rest holds the masking variable left 
    ;// shifted by 4
    ;//
    ;// Loop control:
    ;// The row counter starts at 8 in the low nibble of yMask, is reduced
    ;// by 2 on every pass and is examined with TST. TST defines N and Z
    ;// only and leaves V as it was, and nothing earlier in this handler
    ;// sets V. The loop therefore branches on NE, which depends on Z alone.
    ;// A signed condition such as GT also reads V and would end the loop
    ;// after the first pass whenever V happened to be set on entry.
    ;// None of the instructions between the TST and the branch change the
    ;// N, Z, C or V flags.
    ;//
    ;// Exit:
    ;// Every variant branches to SwitchPredictTypeEnd, except the variant
    ;// with an offset of 3 and a rndVal of 1, which falls through into
    ;// whatever follows this macro invocation.
    
CaseHalfPixelXYRnd$rndVal.Offset$offset
    ;// mask 0x01010101 shifted left by 4, row counter of 8 in the low nibble
    LDR         yMask, =((0x01010101 << 4) + 8)

    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    M_EXT_XINT  $offset, t00, t01, t10, t11
    M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
    M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'

YloopHalfPixelXYRnd$rndVal.Offset$offset
    ;// Processing one line
    ;// t00, t01, t10, t11 required from previous loop
    M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
    SUB         yMask, yMask, #2            ;// two rows done per pass
    M_EXT_XINT  $offset, t20, t21, t30, t31
    M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
    M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
    M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
    M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
    STRD        t00, t01, [pDst], dstStep   ;// store the average
    
    ;// Processing another line
    ;// t20, t21, t30, t31 required from above
    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
    TST         yMask, #7                   ;// Z set once the counter is 0
    M_EXT_XINT  $offset, t00, t01, t10, t11
    M_HSUM_XOR  t00, t10, tmp
    M_HSUM_XOR  t01, t11, tmp
    M_AVG4      t20, t30, t00, t10, $rndVal
    M_AVG4      t21, t31, t01, t11, $rndVal
    STRD        t20, t21, [pDst], dstStep

    BNE         YloopHalfPixelXYRnd$rndVal.Offset$offset

    IF $offset/=3 :LOR: $rndVal/=1
        B           SwitchPredictTypeEnd
    ENDIF
    MEND
;// ***************************************************************************
;// Motion compensation handler macros end here
;// ***************************************************************************
    ;// Description:
    ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal
    ;// combination in the "switch" to prediction processing code segment
    ;//
    ;// Syntax:
    ;// M_CASE_OFFSET $rnd, $predictType
    ;// 
    ;// Inputs:
    ;// $rnd            0 for rounding, 1 for no rounding
    ;// $predictType    The prediction mode
    ;// 
    ;// Outputs:
    ;// Populated list of "M_CASE"s for the "M_SWITCH" macro

    MACRO
    M_CASE_OFFSET $rnd, $predictType
        ;// One switch entry per source alignment offset, in ascending
        ;// offset order. The order is significant: the switch index built
        ;// in the function body carries the offset in its two lowest bits.
        ;// Each label named here is defined by the matching
        ;// M_MCRECONBLOCK_<predictType> handler macro.
        M_CASE      Case$predictType.Rnd$rnd.Offset0
        M_CASE      Case$predictType.Rnd$rnd.Offset1
        M_CASE      Case$predictType.Rnd$rnd.Offset2
        M_CASE      Case$predictType.Rnd$rnd.Offset3
    MEND
;// ***************************************************************************
    ;// Description:
    ;// Populates all 2 kinds of rounding "cases" for each predictType in the 
    ;// "switch" to prediction processing code segment
    ;//
    ;// Syntax:
    ;// M_CASE_MCRECONBLOCK $predictType
    ;// 
    ;// Inputs:
    ;// $predictType    The prediction mode
    ;// 
    ;// Outputs:
    ;// Populated list of "M_CASE_OFFSET" macros

    MACRO
    M_CASE_MCRECONBLOCK $predictType
        ;// Eight switch entries for one predictType: the four offsets for
        ;// rndVal 0 followed by the four offsets for rndVal 1. The order is
        ;// significant: rndVal is bit 2 of the switch index built in the
        ;// function body, directly above the two offset bits.
        M_CASE_OFFSET  0, $predictType ;// 0 for rounding
        M_CASE_OFFSET  1, $predictType ;// 1 for no rounding
    MEND
;// ***************************************************************************
    ;// Description:
    ;// Populates all 8 kinds of rounding and offset combinations handling macros 
    ;// for the specified predictType. In case of "IntegerPixel" predictType, 
    ;// rounding is not required so same code segment handles both cases
    ;//
    ;// Syntax:
    ;// M_MCRECONBLOCK    $predictType
    ;// 
    ;// Inputs:
    ;// $predictType    The prediction mode
    ;// 
    ;// Outputs:
    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified 
    ;// predictType. Each 
    ;//                 M_MCRECONBLOCK_<predictType> $rnd, $offset 
    ;// is an code segment (starting with a label indicating the predictType, 
    ;// rounding and offset combination)
    ;// Four calls of this macro with the 4 prediction modes populate all the 32 
    ;// handlers

    MACRO
    M_MCRECONBLOCK $predictType
        ;// Handlers for rndVal 0, offsets 0 to 3. For IntegerPixel each of
        ;// these also defines the matching Rnd1 case label.
        M_MCRECONBLOCK_$predictType 0, 0
        M_MCRECONBLOCK_$predictType 0, 1
        M_MCRECONBLOCK_$predictType 0, 2
        M_MCRECONBLOCK_$predictType 0, 3
    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
        ;// Handlers for rndVal 1, offsets 0 to 3. The (1, 3) handler must
        ;// stay last: for HalfPixelXY that variant has no final branch and
        ;// falls through into the code placed after this macro invocation.
        M_MCRECONBLOCK_$predictType 1, 0
        M_MCRECONBLOCK_$predictType 1, 1
        M_MCRECONBLOCK_$predictType 1, 2
        M_MCRECONBLOCK_$predictType 1, 3
    ENDIF
    MEND
;// ***************************************************************************
;// Input/Output Registers
;// Several of the names below alias the same physical register. An alias
;// may only be used where the other users of that register are no longer
;// live.
pSrc                  RN 0
srcStep               RN 1
arg_pSrcResidue       RN 2   ;// value of r2 on entry, saved to the stack before dstStep is loaded into r2
pSrcResidue           RN 12  ;// reloaded from the stack for the residue addition
pDst                  RN 3
dstStep               RN 2   ;// loaded from the stack arguments
predictType           RN 10  ;// loaded from the stack arguments, becomes the switch index
rndVal                RN 11  ;// loaded from the stack arguments
mask                  RN 11  ;// 0x80808080 for M_UHADD8R, reuses r11 after the switch

;// Local Scratch Registers
zero                  RN 12  ;// not referenced in this file
y                     RN 14  ;// row counter

tmp1                  RN 4
tmp2                  RN 5
tmp3                  RN 6
tmp4                  RN 7
tmp5                  RN 8
tmp6                  RN 9
tmp7                  RN 10  ;// tmp7 to tmp9 are not referenced in this file
tmp8                  RN 11
tmp9                  RN 12

;// Names used by the HalfPixelXY handler: two rows of half-sum and xor
;// values (t0x/t1x and t2x/t3x) plus the scratch register of M_HSUM_XOR
;// and M_AVG4. They overlay tmp1 to tmp9.
t00                   RN 4
t01                   RN 5
t10                   RN 6
t11                   RN 7
t20                   RN 8
t21                   RN 9
t30                   RN 10
t31                   RN 11
tmp                   RN 12

yMask                 RN 14  ;// row counter and M_AVG4 mask packed together, shares r14 with y

dst                   RN 1   ;// destination pixels in the residue addition, reuses r1 once srcStep is dead
return                RN 0

    ;// omxVCM4P2_MCReconBlock
    ;//
    ;// Builds the motion compensated prediction of an 8 row by 8 byte block
    ;// in the destination and then, when a residue pointer is supplied,
    ;// adds the 16 bit residues to it with clipping to 0..255.
    ;//
    ;// On entry:
    ;// r0          pSrc
    ;// r1          srcStep
    ;// r2          pSrcResidue, may be 0 in which case no residue is added
    ;// r3          pDst
    ;// stack       dstStep, predictType, rndVal (in that order)
    ;// On exit:
    ;// r0          OMX_Sts_NoErr
    ;//
    ;// M_ALLOC4, M_START, M_ARG, M_STR, M_LDR, M_SWITCH and M_END come from
    ;// the included headers; NOTE(review): the exact register save list
    ;// implied by the r11 argument of M_START is defined there, confirm
    ;// against armCOMM_s.h.

    ;// Allocate memory on stack
    M_ALLOC4    Stk_pDst,           4
    M_ALLOC4    Stk_pSrcResidue,    4
    ;// Function header
    M_START     omxVCM4P2_MCReconBlock, r11
    ;// Define stack arguments
    M_ARG       Arg_dstStep,        4
    M_ARG       Arg_predictType,    4
    M_ARG       Arg_rndVal,         4
    ;// Save on stack
    ;// pDst is advanced by the prediction handlers and is needed again from
    ;// its start for the residue addition; r2 is about to be reused for
    ;// dstStep
    M_STR       pDst, Stk_pDst
    M_STR       arg_pSrcResidue, Stk_pSrcResidue
    ;// Load argument from the stack
    M_LDR       dstStep, Arg_dstStep
    M_LDR       predictType, Arg_predictType
    M_LDR       rndVal, Arg_rndVal
    
    ;// Row counter for the prediction handlers
    MOV         y, #8
    
    ;// Switch index = predictType * 8 + rndVal * 4 + (pSrc MOD 4)
    AND         tmp1, pSrc, #3
    ORR         predictType, tmp1, predictType, LSL #3
    ORR         predictType, predictType, rndVal, LSL #2
    ;// Truncating source pointer to align to 4 byte location
    BIC         pSrc, pSrc, #3

    ;// Implementation takes care of all combinations of different 
    ;// predictTypes, rounding cases and source pointer offsets to alignment 
    ;// of 4 bytes in different code bases unless one of these parameter wasn't 
    ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK
    ;// macros branch into 8 M_CASE macros for all combinations of the 2 
    ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte 
    ;// alignment. 
    M_SWITCH    predictType
        M_CASE_MCRECONBLOCK IntegerPixel
        M_CASE_MCRECONBLOCK HalfPixelX
        M_CASE_MCRECONBLOCK HalfPixelY
        M_CASE_MCRECONBLOCK HalfPixelXY
    M_ENDSWITCH

    ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8 
    ;// particular macros (4 in case of IntegerPixel as rounding makes no 
    ;// difference there) to generate the code for all cases of rounding and 
    ;// offsets. LTORG is used to segment the code as code size bloated beyond 
    ;// 4KB.
    ;// The HalfPixelXY handlers must stay last: the final one falls through
    ;// into SwitchPredictTypeEnd instead of branching to it.
    M_MCRECONBLOCK IntegerPixel
    M_MCRECONBLOCK HalfPixelX
    LTORG
    M_MCRECONBLOCK HalfPixelY
    M_MCRECONBLOCK HalfPixelXY
SwitchPredictTypeEnd

    ;// Residue Addition
    ;// This is done in 2 lane SIMD though loads are further optimized and
    ;// 4 bytes are loaded in case of destination buffer. Algorithmic 
    ;// details are in inlined comments
    M_LDR       pSrcResidue, Stk_pSrcResidue
    CMP         pSrcResidue, #0
    BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull    
    M_LDR       pDst, Stk_pDst             ;// back to the first row of the block
    MOV         y, #8
    SUB         dstStep, dstStep, #4       ;// the first half row already advances pDst by 4
Yloop_pSrcResidueNotNull
    SUBS        y, y, #1                   ;// flags are used by BGT at the end of the row
    ;// First four pixels of the row. Letters name the residues and pixels
    ;// in ascending address order, A/a being the lowest (little endian)
    LDR         dst, [pDst]                ;// dst = [dcba]
    LDMIA       pSrcResidue!, {tmp1, tmp2} ;// tmp1=[BA] tmp2=[DC]
    PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// Deltaval1 = [C A]
    PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
    UXTB16      tmp1, dst                  ;// tmp1 = [0c0a]
    UXTB16      tmp2, dst, ROR #8          ;// tmp2 = [0d0b]
    QADD16      tmp1, tmp1, tmp3           ;// Add and saturate to 16 bits
    QADD16      tmp2, tmp2, tmp4
    USAT16      tmp1, #8, tmp1
    USAT16      tmp2, #8, tmp2             ;// armClip(0, 255, tmp2)
    ORR         tmp1, tmp1, tmp2, LSL #8   ;// tmp1 = [dcba]
    STR         tmp1, [pDst], #4
    
    ;// Last four pixels of the row, same sequence as above
    LDR         dst, [pDst]
    LDMIA       pSrcResidue!, {tmp1, tmp2}
    PKHBT       tmp3, tmp1, tmp2, LSL #16
    PKHTB       tmp4, tmp2, tmp1, ASR #16
    UXTB16      tmp1, dst
    UXTB16      tmp2, dst, ROR #8
    QADD16      tmp1, tmp1, tmp3
    QADD16      tmp2, tmp2, tmp4
    USAT16      tmp1, #8, tmp1
    USAT16      tmp2, #8, tmp2
    ORR         tmp1, tmp1, tmp2, LSL #8
    STR         tmp1, [pDst], dstStep      ;// dstStep - 4 here, so pDst lands on the next row
    
    BGT         Yloop_pSrcResidueNotNull
pSrcResidueConditionEnd

    MOV         return, #OMX_Sts_NoErr

    M_END
    ENDIF ;// ARM1136JS

;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
    END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************