/*
 * Copyright 2020-2025 Yuntu Microelectronics co.,ltd
 * All rights reserved.
 *
 * YUNTU Confidential. This software is owned or controlled by YUNTU and may only be
 * used strictly in accordance with the applicable license terms. By expressly
 * accepting such terms or by downloading, installing, activating and/or otherwise
 * using the software, you are agreeing that you have read, and that you agree to
 * comply with and are bound by, such license terms. If you do not agree to be
 * bound by the applicable license terms, then you may not retain, install,
 * activate or otherwise use the software. The production use license in
 * Section 2.3 is expressly granted for this software.
 */

/******************************************************************************
* Test summary:
* -------------
* Tests the functionality of the FP MAC module.
*
* The internal multiplier array is exercised by 768 test vectors via the VMUL
* instruction, using single-precision FP operands whose fractional parts have
* the form:
*   X = 0.(b7,b6,b5,b4)(b7,b6,b5,b4)(b7,b6,b5,b4)...(b7,b6,b5,b4)
*   Y = 1.(b7,b6,b5,b4)(b7,b6,b5,b4)(b7,b6,b5,b4)...(b7,b6,b5,b4)
*   Z = 1.(b3,b2,b1,b0)(b3,b2,b1,b0)(b3,b2,b1,b0)...(b3,b2,b1,b0)
*
* where b7,b6,b5,b4,b3,b2,b1,b0 represent all 256 combinations of 8 bits. Test
* vectors are generated by the ALU module. Additionally, the following
* functionality is tested:
*
*   1) Handling of Default-NaN mode
*   2) Handling of Flush-to-Zero mode
*   3) Trivial result computation
*   4) Result sign computation
*   5) Functionality of internal CLZ modules
*   6) Functionality of internal fraction shifter
*   7) Rounding logic
*   8) Overflow detection logic
*
* SP FPU Decoder coverage:
* ------------------------
* - VMUL, VMLA, VMLS, VNMUL, VNMLA, VNMLS, VFMA, VFMS, VFNMA, VFNMS
*
* Instruction operands are selected to toggle all bits in the Ra, Rm, Rn, and
* Rd opcode fields.
******************************************************************************/

#include "CorTst_Compiler.h"
#include "CorTst_M33_Cfg.h"

#if (CORTST_M33_FPU_ENABLE==1)

    /* Compatible with ABI. */
    CST_PRES8
    /* Symbols defined in the current module but to be visible to outside */
    CST_EXPORT M33_Cst_SpfpuMacTest
    
    /* Symbols defined outside but used within current module */
    CST_EXTERN m33_cst_test_tail_fpu_end
    CST_EXTERN m33_cst_write_fpu_regs_from_r0
    CST_EXTERN m33_cst_sum_fpu_s15_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s24_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s25_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s26_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s29_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s30_s31_to_r1_destr_r2

    /* Start value of the signature accumulator, split into two halfwords:
       SEED_0 is written to R1[15:0] with MOV and SEED_1 to R1[31:16] with
       MOVT at the start of the test, i.e. R1 starts as 0x1DC602CC. */
    CST_SET(PRESIGNATURE_SEED_0, 0x02CC)
    CST_SET(PRESIGNATURE_SEED_1, 0x1DC6)

    /* Values kept in registers for the whole test:
       - R1  ... accumulated signature (copied to R0 at the end of the test)
       - R12 ... FPSCR value read at test start, written back at test end
    */

    /*------------------------------------------------------------------------*/
    CST_SECTION_EXEC(mcal_text)
    /*------------------------------------------------------------------------*/
    /* The ".type" directive instructs the assembler/linker that the label 
       "M33_Cst_SpfpuMacTest" designates a function.
       This would cause setting the least significant bit to '1' within any 
       pointer to this function, causing change to Thumb mode whenever this 
       function is called. */
    CST_TYPE(M33_Cst_SpfpuMacTest, function)
    CST_THUMB2
M33_Cst_SpfpuMacTest:
    /* Entry of the single-precision FPU multiply / multiply-accumulate
       self-test. The code below reads no argument registers; the signature
       accumulated in R1 is copied to R0 at m33_cst_spfpu_mac_test_end before
       control is passed to m33_cst_test_tail_fpu_end.

       Context saved here: R4-R12, R14, the CONTROL register and S16-S31.
       It is presumably unstacked by m33_cst_test_tail_fpu_end (external,
       not visible in this file) - confirm against that routine.

       NOTE(review): 11 core words + 16 FP words = 108 bytes are pushed, so
       SP is 4 modulo 8 away from its entry value while the BL helpers run.
       Confirm that this is acceptable given the CST_PRES8 declaration. */
    
    PUSH    {R4-R12,R14}
    MRS     R1,CONTROL  /* Read CONTROL before the first FPU instruction */
    PUSH    {R1}        /* Stack the CONTROL value read above */
    VPUSH   {S16-S31}
    
    /*------------------------------------------------------------------------*/
    /* Test - preparation                                                     */
    /*------------------------------------------------------------------------*/
    /* Seed the signature accumulator: R1 = (SEED_1 << 16) | SEED_0 */
    MOV     R1,#PRESIGNATURE_SEED_0
    MOVT    R1,#PRESIGNATURE_SEED_1

    /*------------------------------------------------------------------------*/
    /* Test - start                                                           */
    /*------------------------------------------------------------------------*/
    VMRS    R12,FPSCR   /* Keep the entry FPSCR in R12; restored at test end */
    
    /**************************************************************************/
    /* Check handling of Default-NaN (DN) mode                                */
    /**************************************************************************/
    /* When DN mode is on:

        -   The Default NaN is the result of all floating-point operations that
            either:
            -   Generate untrapped Invalid Operation floating-point exceptions.
            -   Have one or more quiet NaN inputs, but no signaling NaN inputs.
            
        The Default NaN is encoded as 0x7FC00000 for single precision.
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper, not
       visible here). R0 is relied upon to still be 0 afterwards, see S0. */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* Prepare Test Vectors */
    /* MOV         R0,#0x00000000 - see above */
    MOV     R2,#0x80000000          /* R2 = sign bit only                     */
    MOV     R4,#0xAAAAAAAA          /* Alternating pattern for NaN payloads   */
    MOV     R5,#0x7F800000          /* R5 = exponent all ones, fraction zero  */
    ORR     R3,R5,R4,LSR #8         /* R3=0x7FAAAAAA  bit22=0 - signaling NaN */
    ORR     R4,R5,R4,ASR #1         /* R4=0xFFD55555  bit22=1 - quiet NaN     */
    ORR     R6,R5,R2                /* R6=0xFF800000                          */
    VMOV    S0,R0                   /* S0 = +0.0  */
    VMOV    S1,R2                   /* S1 = -0.0  */
    VMOV    S2,R3                   /* S2 = +sNaN */
    VMOV    S3,R4                   /* S3 = -qNaN */
    VMOV    S4,R5                   /* S4 = +Inf  */
    VMOV    S5,R6                   /* S5 = -Inf  */
   
    MOV     R0,#(1<<25)             /* Set FPSCR.DN bit */
    VMSR    FPSCR,R0                /* Clear FPSCR & Enable Default NaN */

    /* Test: with DN on, every result below is expected to be the Default NaN
       regardless of operand order, NaN type or operand sign */
    VMUL.F32    S31,S0,S3           /* MUL(+0, -qNaN) = +dNAN */
    VMUL.F32    S30,S3,S0           /* MUL(-qNaN, +0) = +dNAN */
    VMUL.F32    S29,S1,S2           /* MUL(-0, +sNaN) = +dNAN */
    VMUL.F32    S28,S2,S1           /* MUL(+sNaN, -0) = +dNAN */
    VMUL.F32    S27,S0,S4           /* MUL(+0, +Inf)  = +dNAN */
    VMUL.F32    S26,S5,S1           /* MUL(-Inf, -0)  = +dNAN */

    /**************************************************************************/
    /* When DN mode is off:
    
        - All FP operations with qNaN operand, but no sNaN operand, have
          the first qNaN input as their output operand.
          
        - If both operands are qNaNs, the result is the first operand.  

        - If an Invalid Operation floating-point exception is produced because 
          one of the operands is a sNaN, the qNaN result is equal to the sNaN 
          with its most significant fraction bit changed to 1. 
          
        - If both operands are sNaNs, the result is produced in this way from 
          the first operand.
     **************************************************************************/
    MOV     R0,#0                   /* Reset FPSCR */      
    VMSR    FPSCR,R0                /* Clear FPSCR register & Clear DN bit */
    
    /* Prepare Test Vectors: opposite-sign copies of the two NaNs */
    BIC     R4,R4,#(1<<31)          /* R4 = 0x7FD55555 */
    ORR     R3,R3,#(1<<31)          /* R3 = 0xFFAAAAAA */
    VMOV    S6,R4                   /* S6 = +qNaN */
    VMOV    S7,R3                   /* S7 = -sNaN */

    /* Test: quiet NaN inputs only - the first qNaN operand is propagated */
    VMUL.F32    S25,S0,S3           /* MUL(+0, -qNaN)   = -qNaN */
    VMUL.F32    S24,S3,S0           /* MUL(-qNaN, +0)   = -qNaN */
    VMUL.F32    S23,S3,S6           /* MUL(-qNaN,+qNaN) = -qNaN */
    VMUL.F32    S22,S6,S3           /* MUL(+qNaN,-qNaN) = +qNaN */
    /* Fold FPSCR into the signature; FPSCR.IOC is expected to be clear here.
       Signature step used throughout: R1 = R2 XOR (R1 rotated right by 1). */
    VMRS    R2,FPSCR                /* Load FPSCR register  */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register */
    /* Test that instructions set FPSCR.IOC bit (signaling NaN inputs) */
    VMUL.F32    S21,S1,S2           /* MUL(-0, +sNaN)  = +qNAN  change bit22 */
    VMUL.F32    S20,S2,S1           /* MUL(+sNaN, -0)  = +qNAN  change bit22 */
    VMUL.F32    S19,S2,S7           /* MUL(sNaN,-sNaN) = +qNaN  change bit22 */
    VMUL.F32    S18,S7,S2           /* MUL(-sNaN,sNaN) = -qNaN  change bit22 */
    /* Fold FPSCR into the signature; FPSCR.IOC is expected to be set */
    VMRS    R2,FPSCR                /* Check FPSCR.IOC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register */
    /* Test that instructions set FPSCR.IOC bit (zero times infinity) */
    VMUL.F32    S17,S0,S4           /* MUL(+0, +Inf)   = +dNaN */
    VMUL.F32    S16,S5,S1           /* MUL(-Inf, -0)   = +dNaN */
    VMRS    R2,FPSCR                /* Check FPSCR.IOC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    
    /*  Test that dNaN is passed to the ALU pipeline if a multiply-accumulate
        (VMLA is used here) is being processed with input operands of infinity
        and zero.
        Expected result S15 equals to 0x7FC00000 */
    VMOV.F32    S15,S0              /* Accumulator S15 = +0.0 */
    VMLA.F32    S15,S0,S4           /* S15 = S15 + (+0 * +Inf) */
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S15..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s15_s31_to_r1_destr_r2


    /**************************************************************************/
    /* Check handling of Flush-to-Zero (FZ) mode                              */
    /**************************************************************************/
    /*  When FZ mode is on, then all denormal inputs/outputs of floating-
        point operations are replaced by zero.
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper);
       R0 is relied upon to still be 0 afterwards, see S0 below. */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* Prepare Test Vectors */
    /* MOV  R0,#0x00000000 see above */
    MOV     R2,#0x3F800000      
    LDR     R3,=0x807FFFFF      
    MOV     R4,#0x20000000      
    LDR     R5,=0x9FFFFFFF      
    VMOV    S0,R0                   /* S0 = 0                                 */
    VMOV    S1,R2                   /* S1 = +1.00 * 2^0    normalized input   */        
    VMOV    S2,R3                   /* S2 = -0.99 * 2^-126 denormalized input */
    VMOV    S3,R4                   /* S3 = +1.00 * 2^-63  normalized input   */
    VMOV    S4,R5                   /* S4 = -1.99 * 2^-64  normalized input   */

    MOV     R0,#(1<<24)             /* Set FPSCR.FZ bit */
    VMSR    FPSCR,R0                /* Clear FPSCR & Enable Flush-to-zero */

    /* Test that instructions set FPSCR.IDC and FPSCR.UFC bits
       Expected results S31-S28 equal to 0x80000000.
       FPSCR is folded into the signature after every single multiplication
       and then rewritten from R0, so each flag check starts from clean flags
       with FZ still enabled. */
    VMUL.F32    S31,S1,S2           /* MUL(+norm,-denorm) = -0.0 */
    VMRS    R2,FPSCR                /* Check FPSCR.IDC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register & Set FZ bit */
    VMUL.F32    S30,S2,S1           /* MUL(-denorm,+norm) = -0.0 */
    VMRS    R2,FPSCR                /* Check FPSCR.IDC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register & Set FZ bit */
    VMUL.F32    S29,S3,S4           /* MUL(+norm,-norm)   = -0.0 */
    VMRS    R2,FPSCR                /* Check FPSCR.UFC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register & Set FZ bit */
    VMUL.F32    S28,S4,S3           /* MUL(-norm,+norm)   = -0.0 */
    VMRS    R2,FPSCR                /* Check FPSCR.UFC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    
    MOV     R0,#0                   /* Reset FPSCR */
    VMSR    FPSCR,R0                /* Clear FPSCR & Set Round to Nearest */
    
    /* Same operands as S29/S28 above, now with FZ off.
       Test that instructions set FPSCR.UFC bit even if denormal result is 
       normalized due to rounding.
       Expected results S27-S26 equal to 0x80800000 */
    VMUL.F32    S27,S3,S4           /* MUL(+norm,-norm)   = -norm */
    VMRS    R2,FPSCR                /* Check FPSCR.UFC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register & Set Round to Nearest */
    VMUL.F32    S26,S4,S3           /* MUL(-norm,+norm)   = -norm */
    VMRS    R2,FPSCR                /* Check FPSCR.UFC bit is set */
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S26..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s26_s31_to_r1_destr_r2
    
    
    /**************************************************************************/
    /* Check trivial result calculation                                       */
    /**************************************************************************/
    /* Check result generation logic for these cases:
        - zero result                       (x*0, 0*x)
        - infinite result                   (x*inf, inf*x)
       Each case is run with both operand orders and with both signs of the
       zero / infinity operand.
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper);
       R0 is relied upon to still be 0 afterwards, see S0 and R4 below. */
    BL      m33_cst_write_fpu_regs_from_r0
     
    /* Prepare Test Vectors */
    /* MOV  R0,#0x00000000 - See above */  
    MOV     R2,#0x40FFFFFF      
    MOV     R3,#0x7F800000      
    ORR     R4,R0,#(1<<31)          /* R4 = 0x80000000 */
    ORR     R5,R3,#(1<<31)          /* R5 = 0xFF800000 */
    VMOV    S0,R0                   /* S0 = 0         */
    VMOV    S1,R2                   /* S1 = 7.9999995 */
    VMOV    S2,R3                   /* S2 = +Inf      */
    VMOV    S3,R4                   /* S3 = -0        */
    VMOV    S4,R5                   /* S4 = -Inf      */
  
    /* Test */
    VMUL.F32    S31,S0,S1           /* VMUL(+0.0, +Num) = +0.0 */
    VMUL.F32    S30,S1,S0           /* VMUL(+Num, +0.0) = +0.0 */
    VMUL.F32    S29,S1,S2           /* VMUL(+Num, +INF) = +INF */
    VMUL.F32    S28,S2,S1           /* VMUL(+INF, +Num) = +INF */
    VMUL.F32    S27,S3,S1           /* VMUL(-0.0, +Num) = -0.0 */
    VMUL.F32    S26,S1,S3           /* VMUL(+Num, -0.0) = -0.0 */
    VMUL.F32    S25,S1,S4           /* VMUL(+Num, -INF) = -INF */
    VMUL.F32    S24,S4,S1           /* VMUL(-INF, +Num) = -INF */
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S24..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s24_s31_to_r1_destr_r2


    /**************************************************************************/
    /* Check handling of sign                                                 */
    /**************************************************************************/
    /*  - Test all possible combinations of signs of two input operands.
        - Test sign modifications caused by multiplicative instructions.

        VMUL:   Sd = Sn * Sm            
        VMLA:   Sd = Sd + Sn*Sm     
        VMLS:   Sd = Sd + (-Sn)*Sm
        VNMLA:  Sd = -(Sd + Sn*Sm)
        VNMLS:  Sd = -(Sd + (-Sn)*Sm)    
        VNMUL:  Sd = -(Sn * Sm)    
        VFMA:   Sd = Sd + Sn*Sm
        VFMS:   Sd = Sd - Sn*Sm
        VFNMA:  Sd = -Sd + (-Sn)*Sm
        VFNMS:  Sd = -Sd + Sn*Sm

        Notation used in the per-instruction comments below: a doubled minus
        sign ("--x") denotes the negation of an already negative value.
        S24-S31 are reused as accumulators from one group to the next, so the
        instruction order in this section must not be changed.
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper) */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* Prepare Test Vectors */
    VMOV.F32    S2,#1.5
    VMOV.F32    S3,#-1.5
    
    /* Test: plain and negated products for all four sign combinations */
    VMUL.F32    S31,S2,S2       /*              +1.5 * +1.5  =  +2.25 */
    VMUL.F32    S30,S2,S3       /*              +1.5 * -1.5  =  -2.25 */
    VMUL.F32    S29,S3,S2       /*              -1.5 * +1.5  =  -2.25 */
    VMUL.F32    S28,S3,S3       /*              -1.5 * -1.5  =  +2.25 */
    VNMUL.F32   S27,S2,S2       /*            -(+1.5 * +1.5) =  -2.25 */
    VNMUL.F32   S26,S2,S3       /*            -(+1.5 * -1.5) =  +2.25 */
    VNMUL.F32   S25,S3,S2       /*            -(-1.5 * +1.5) =  +2.25 */
    VNMUL.F32   S24,S3,S3       /*            -(-1.5 * -1.5) =  -2.25 */
    /* Multiply-accumulate / multiply-subtract onto the VNMUL results */
    VMLS.F32    S27,S2,S2       /*   -2.25 +  (-1.5) *  1.5  =  -4.5  */    
    VMLA.F32    S26,S2,S2       /*   +2.25 +    1.5  *  1.5  =  +4.5  */    
    VMLS.F32    S25,S2,S3       /*   +2.25 +  (-1.5) * -1.5  =  +4.5  */    
    VMLA.F32    S24,S2,S3       /*   -2.25 +    1.5  * -1.5  =  -4.5  */
    /* Negated forms; the destination register is also used as operand Sn */
    VNMLS.F32   S27,S27,S3      /*  -(-4.5 + (--4.5) * -1.5) = +11.25 */
    VNMLA.F32   S26,S26,S2      /*  -(+4.5 +    4.5  *  1.5) = -11.25 */
    VNMLS.F32   S25,S25,S3      /*  -(+4.5 +  (-4.5) * -1.5) = -11.25 */
    VNMLA.F32   S24,S24,S2      /*  -(-4.5 +   -4.5  *  1.5) = +11.25 */
   
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S24..S31 into R1 and
       destroys R2 - confirm against its definition. The values in S24-S31
       are used again as accumulators by the instructions that follow. */
    BL      m33_cst_sum_fpu_s24_s31_to_r1_destr_r2

    /* VFMA accumulates onto the VMUL results held in S31-S28 */
    VFMA.F32    S31,S2,S2       /*   2.25 +   1.5  *   1.5  =  4.5 */
    VFMA.F32    S30,S2,S3       /*  -2.25 +   1.5  * (-1.5) = -4.5 */
    VFMA.F32    S29,S3,S2       /*  -2.25 + (-1.5) *   1.5  = -4.5 */
    VFMA.F32    S28,S3,S3       /*   2.25 + (-1.5) * (-1.5) =  4.5 */
    
    /* VFMS subtracts the product from the VNMLS/VNMLA results in S27-S24 */
    VFMS.F32    S27,S2,S2       /*  11.25 -   1.5  *   1.5  =  9.0 */
    VFMS.F32    S26,S2,S3       /* -11.25 -   1.5  * (-1.5) = -9.0 */
    VFMS.F32    S25,S3,S2       /* -11.25 - (-1.5) *   1.5  = -9.0 */
    VFMS.F32    S24,S3,S3       /*  11.25 - (-1.5) * (-1.5) =  9.0 */
    
    /* !! Update Signature !! */
    BL      m33_cst_sum_fpu_s24_s31_to_r1_destr_r2

    /* VFNMA on the VFMA results held in S31-S28 */
    VFNMA.F32   S31,S2,S2       /*  -4.5 + ( -1.5) *   1.5  = -6.75 */
    VFNMA.F32   S30,S2,S3       /* --4.5 + ( -1.5) * (-1.5) =  6.75 */
    VFNMA.F32   S29,S3,S2       /* --4.5 + (--1.5) *   1.5  =  6.75 */
    VFNMA.F32   S28,S3,S3       /*  -4.5 + (--1.5) * (-1.5) = -6.75 */
    
    /* VFNMS on the VFMS results held in S27-S24 */
    VFNMS.F32   S27,S2,S2       /*  -9.0 +   1.5  *   1.5  =  -6.75 */
    VFNMS.F32   S26,S2,S3       /* --9.0 +   1.5  * (-1.5) =   6.75 */
    VFNMS.F32   S25,S3,S2       /* --9.0 + (-1.5) *   1.5  =   6.75 */
    VFNMS.F32   S24,S3,S3       /*  -9.0 + (-1.5) * (-1.5) =  -6.75 */
    
    /* !! Update Signature !! */
    BL      m33_cst_sum_fpu_s24_s31_to_r1_destr_r2

    /**************************************************************************/
    /* Check CLZ modules                                                      */
    /**************************************************************************/
    /*  CLZ modules count number of leading zeros of the input mantissas.
        Their outputs are used for result exponent calculation and/or
        to control fraction shifter.

        -   Set exponent fp_a = 0 (denormalized number)
        -   Set mantissa fp_a = 0.1000..00, 0.0100..00, ... (vary LZ count)
        -   Set fp_b = 0x4B000000 (biased exponent field 0x96)
        -   Set mantissa fp_b = 1.0000...

        Check that fp_b exponent is decremented by:
        - (0x7F + CLZ(mantissa(fp_a))) for single-precision 
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper) */
    BL      m33_cst_write_fpu_regs_from_r0
     
    /* Prologue */
    MOV     R5,#0x4B000000
    VMOV    S1,R5               /* S1 = fp_b */
    
    /*  Constant of 9 is subtracted from SP exponent to eliminate effect of 
        leading zeros in the sign and exponent (not counted by FPU CLZ hardware
        module, but counted by CLZ instruction in the test). */
    MOV     R4,#(0x7F-9)
    SUB     R4,R5,R4,LSL #23    /* R4 = fp_b - ((0x7F-9) << 23) = 0x10000000 */
    MOV     R7,#(1<<22)     /* Prepare Mantissa fp_a */
    
m33_cst_spfpu_mac_test_clz_loop:
    /* One iteration per position of the single set fraction bit in R7,
       from bit 22 down to bit 0 (23 iterations). */

    /* Prepare Test Vector */
    VMOV    S2,R7               /* S2 = fp_a (denormal, one fraction bit set) */

    /* Test: both operand orders */
    VMUL.F32    S31,S2,S1
    VMUL.F32    S30,S1,S2
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S30..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s30_s31_to_r1_destr_r2

    /* Double-check S31 equals to R6, the expected product computed with the
       integer CLZ instruction: R6 = R4 - (CLZ(R7) << 23). On a mismatch the
       test is left early with the signature accumulated so far. */
    VMOV    R11,S31
    CLZ     R3,R7
    SUB     R6,R4,R3,LSL#23
    CMP     R11,R6
    BNE     m33_cst_spfpu_mac_test_end

    /* Double-check S30 equals to R6 */
    VMOV    R11,S30
    CMP     R11,R6
    BNE     m33_cst_spfpu_mac_test_end

    /* Generate new test vector fragment and check for end of loop:
       the set bit moves one position right; the loop ends once it has been
       shifted out and R7 is zero. */
    LSRS    R7,R7,#1
    BNE     m33_cst_spfpu_mac_test_clz_loop     /* End of loop */

    
    /**************************************************************************/
    /* Check exponent generation (adder check)                                */
    /**************************************************************************/
    /*  Exponents of the two inputs are summed up and bias is subtracted
        to get the product exponent.

        - expA + expB - bias  (bias is 0x7F for SP)

        Ideally, each bit position should be stimulated with all possible
        inputs. However, there are some implications while working with
        exponent adder, such that the sum of two input exponents may not
        be smaller than 0x3F8, otherwise underflow occurs and the result
        is rounded to zero. Therefore, test vectors need to be accommodated
        to this limitation.

        Inputs 
        A | B       Input Vector to Be Applied         Note
        ------------------------------------------------------------
        0 | 0       0x00 + 0x00                    Case 1
        0 | 1       0x00 + 0xD5; 0x00 + 0xAA;      Case 2
        1 | 0       0xD5 + 0x00; 0xAA + 0x00;      Case 3
        1 | 1       0x55 + 0x55; 0xAA + 0xAA;      Case 4

        This check also verifies input/output ports of the ca53dpu_fp_mul
        module and decoding of Rd,Rn,Rm instruction field by using specific
        set of source and destination registers.
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper) */
    BL      m33_cst_write_fpu_regs_from_r0

    /* Prepare Test Vectors: the exponent field occupies bits [30:23] of each
       word; R2 has a zero exponent field and fraction bit 22 set. */
    MOV     R2,#0x00400000
    MOV     R3,#0x6A800000
    MOV     R4,#0x55000000
    MOV     R5,#0x2A800000
    VMOV    S2,R2                   /* S2 exp=0x00 */
    VMOV    S3,R3                   /* S3 exp=0xD5 */
    VMOV    S4,R4                   /* S4 exp=0xAA */
    VMOV    S5,R5                   /* S5 exp=0x55 */

    /* Test */
    VMUL.F32    S31,S2,S2           /* Case 1 - 0x00 + 0x00 */
    VMUL.F32    S30,S2,S3           /* Case 2 - 0x00 + 0xD5 */
    VMUL.F32    S29,S2,S4           /* Case 2 - 0x00 + 0xAA */
    VMUL.F32    S28,S3,S2           /* Case 3 - 0xD5 + 0x00 */
    VMUL.F32    S27,S4,S2           /* Case 3 - 0xAA + 0x00 */
    VMUL.F32    S26,S4,S4           /* Case 4 - 0xAA + 0xAA */
    VMUL.F32    S25,S5,S5           /* Case 4 - 0x55 + 0x55 */

    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S25..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s25_s31_to_r1_destr_r2


    /**************************************************************************/
    /* Check multiplier array                                                 */
    /**************************************************************************/
    /* Apply 768 test vectors in the following form:

        fp_a = S_exp_0.(c7,c6,c5,c4)(c7,c6,c5,c4)....(c7,c6,c5,c4)
        fp_b = S_exp_1.(c7,c6,c5,c4)(c7,c6,c5,c4)....(c7,c6,c5,c4)
        fp_c = S_exp_1.(c3,c2,c1,c0)(c3,c2,c1,c0)....(c3,c2,c1,c0)

        such that following multiplications are tested in:
        fp_a * fp_c
        fp_c * fp_a
        fp_b * fp_c

        Where c7,c6,..,c0 represent all 256 combinations of 8 bits. This
        check also verifies input ports of the fp_mul module and decoding of 
        Rn,Rm instruction fields by using specific set of source registers. 

        Loop structure: two nested 16-step loops share one loop label. The
        inner loop steps the nibble pattern in R3 (used for fp_c), the outer
        loop steps the nibble pattern in R5 (used for fp_a and fp_b);
        16 x 16 passes with 3 multiplications each give the 768 vectors.
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper);
       R0 is relied upon to still be 0 afterwards, see R11 below. */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* MOV  R0,#0 - See above */
    MOV     R3,#0x0             /* Operand B initial value */
    MOV     R4,#0x11111111      /* Constant to generate test vectors */
    MOV     R5,#0x0             /* Operand A initial value */
    MOV     R6,#0x3F800000      /* Exponent (general, SP) */
    MOV     R7,#0x10            /* Counter for operand A */
    MOV     R8,#0x10            /* Counter for operand B */

m33_cst_fpu_mac_multiplier_loop:

    /* Prepare Test Vectors: the repeated-nibble patterns are shifted right by
       9 so that they fill the 23 fraction bits, then the exponent is added */
    ADD     R9,R6,R5,LSR #9     /* R9  = exponent 0x7F, fraction from R5 */
    ADD     R10,R6,R3,LSR #9    /* R10 = exponent 0x7F, fraction from R3 */
    ADD     R11,R0,R5,LSR #9    /* R11 = exponent 0x00, fraction from R5 */
    VMOV    S5,R9               /* fp_b */
    VMOV    S6,R10              /* fp_c */
    VMOV    S7,R11              /* fp_a */

    /* Test */
    VMUL.F32    S31,S5,S6       /* fp_b * fp_c */
    VMUL.F32    S30,S7,S6       /* fp_a * fp_c */
    VMUL.F32    S29,S6,S7       /* fp_c * fp_a */
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S29..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s29_s31_to_r1_destr_r2

    /* Create the next test vector (update R3): adding 0x11111111 increments
       every nibble of the pattern by one */
    ADD     R3,R3,R4
    SUBS    R8,R8,#1    /* Decrement inner loop counter 16x */
    BNE     m33_cst_fpu_mac_multiplier_loop /* End of inner loop */

    /* Reset operand B (R3) and its counter (R8): the inner loop is exhausted */
    MOV     R3,#0x0     /* Operand B */
    MOV     R8,#0x10    /* Counter for operand B */

    /* Generate new test vector fragment (next operand A nibble pattern in R5)
       and check for end of loop */
    ADD     R5,R5,R4
    SUBS    R7,R7,#1    /* Decrement outer loop counter 16x */
    BNE     m33_cst_fpu_mac_multiplier_loop /* End of outer loop */


    /**************************************************************************/
    /* Check internal shifter                                                 */
    /**************************************************************************/
    /*  Test shifting left/right by 0-23 bits (loop counter R5 = 23..0) using
        following test vectors:

        fp_a * fp_b
        fp_c * fp_d

        where:
            -   Exponent fp_a = 0 (denormalized number)
            -   Mantissa fp_a = 0.1010...

            -   Exponent fp_b = 0x7F-counter, i.e. 0x68-0x7F
                (words 0x34000000-0x3F800000; controlled by loop counter)
            -   Mantissa fp_b = 1.0000...

            -   Exponent fp_c = 0 (denormalized number)
            -   Mantissa fp_c = 0.00..0001

            -   Exponent fp_d = 0x7F+counter, i.e. 0x7F-0x96
                (controlled by loop counter)
            -   Mantissa fp_d = 1.0000... 
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper) */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* Set Round-to-Zero mode to truncate the result */
    MOV     R0,#(3<<22)         /* Prepare RMode bits */
    VMSR    FPSCR,R0            /* Clear FPSCR & Set Round to Zero */

    /* Prepare Test Vectors */
    MOV     R0,#0xAAAAAAAA          
    LSR     R0,R0,#9            /* fp_a fraction 0x555555 */
    MOV     R3,#0x00000001      /* fp_c fraction 0x000001 */
    MOV     R4,#0x3F800000      /* fp_b/fp_d base value */
    VMOV    S2,R0               /* S2 = fp_a */
    VMOV    S3,R3               /* S3 = fp_c */

    /* Initialize loop counter */
    MOV     R5,#23
m33_cst_spfpu_mac_test_shifter_loop:
    /* Prepare Test Vectors: fp_b = 2^-R5 and fp_d = 2^+R5 */
    SUB     R6,R4,R5,LSL #23    /* fp_b */
    ADD     R7,R4,R5,LSL #23    /* fp_d */
    VMOV    S6,R6
    VMOV    S7,R7

    /* Test */
    VMUL.F32    S31,S2,S6       /* fp_a * fp_b */
    VMUL.F32    S30,S3,S7       /* fp_c * fp_d */
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S30..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s30_s31_to_r1_destr_r2
    
    /* !! Double-check results !! */
    /* The expected bit patterns are produced with integer shifts of the raw
       operand words: fp_a shifted right, fp_c shifted left, by R5 bits. On a
       mismatch the test is left early with the signature accumulated so far. */
    LSL     R11,R3,R5   /* Prepare expected results */
    LSR     R9,R0,R5    /* Prepare expected results */
    VMOV    R8,S31
    VMOV    R10,S30
    CMP     R8,R9       /* Check S31 equals R9 */
    BNE     m33_cst_spfpu_mac_test_end
    CMP     R10,R11     /* Check S30 equals R11 */
    BNE     m33_cst_spfpu_mac_test_end

    /* Generate new test vector fragment and check for end of loop */
    SUBS    R5,R5,#1    /* Decrement loop counter 24x */
    BGE     m33_cst_spfpu_mac_test_shifter_loop  /* End of loop */
    

    /**************************************************************************/
    /* Check rounding generation logic - Round to Nearest                     */
    /**************************************************************************/
    /*  Having a product of two floating point mantissas in format,

        x1 x0 , fx ... f1 f0 || R S

        where individual bits have following meaning:
        ||      ... denotes visible and invisible bits of the product
        x1,x0   ... two most significant bits of the product.
        fx      ... fraction bits (23 bits for SP)
        R       ... hidden rounding bit
        S       ... hidden sticky bit

        Bits f0, R and S are used to determine rounding depending on the
        selected rounding mode. See below.

        ------------------------------------------------------------------

        In Round to Nearest (RN) rounding mode, rounding bit
        is obtained from the following equation:

        r = R * (S + f0)

        Check the following bit combinations:

        f0  R  S | r
        ----------------------------------
         0  1  0 | 0 (Case1, result is binary 0)
         1  1  0 | 1 (Case2, result is binary 2)
         0  1  1 | 1 (Case3, result is binary 1)
         1  0  1 | 0 (Case4, result is binary 1)

        All operands are denormal words multiplied by a power of two, so the
        expected results are small integers when read back as raw words.
    ***************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper) */
    BL      m33_cst_write_fpu_regs_from_r0
    
    MOV     R0,#0       /* Prepare RMode bits */
    VMSR    FPSCR,R0    /* Clear FPSCR & Set Round to Nearest */

    /* Prepare Test Vectors */
    MOV     R2,#((127-23)<<23)  /* 2^-23: moves fraction bit 22 to the R bit */
    MOV     R3,#0x00400000      /* Only fraction bit 22 set */
    VMOV    S2,R2
    VMOV    S3,R3
    
    /* Test */
    VMUL.F32    S31,S3,S2       /* Case 1 */

    /* Prepare Test Vectors */
    MOV     R3,#((127-22)<<23)  /* 22 = multiplicand fraction shift (SP) */
    VMOV    S2,R3               /* S2 = 2^-22, kept for the loop below */
    MOV     R3,#0x00600000      /* Fraction bits 22 (f0) and 21 (R) set */
    VMOV    S3,R3

    /* Test */ 
    VMUL.F32    S30,S3,S2       /* Case2 */
    
    /* !! Check results !! */
    /* On a mismatch the test is left early with the signature accumulated
       so far. */
    VMOV    R7,S31      /* S31 must be binary 0 */
    VMOV    R6,S30      /* S30 must be binary 2 */
    CMP     R7,#0
    BNE     m33_cst_spfpu_mac_test_end         
    CMP     R6,#2
    BNE     m33_cst_spfpu_mac_test_end         
    
    /* Prologue */
    MOV     R5,#1       /* Sticky bit, walked through bit positions 0..20 */
m33_cst_spfpu_mac_test_rounding_loop1:
    /* Prepare Test Vectors */
    ORR     R3,R5,#0x00200000       /* Set shifted R bit */
    ORR     R4,R5,#0x00400000       /* Set shifted f0 bit */
    VMOV    S3,R3
    VMOV    S4,R4

    /* Test */
    VMUL.F32    S29,S3,S2       /* Case3 */
    VMUL.F32    S28,S4,S2       /* Case4 */
    
    /* !! Check results !! */
    VMOV    R7,S29              /* S29 must be binary 1 */
    VMOV    R6,S28              /* S28 must be binary 1 */
    CMP     R7,#1
    BNE     m33_cst_spfpu_mac_test_end         
    CMP     R6,#1
    BNE     m33_cst_spfpu_mac_test_end 

    /* Generate new test vector fragment and check for end of loop */
    LSL     R5,R5,#1
    CMP     R5,#0x200000 /* 1<<21: stop once the sticky bit would reach the R bit position */
    BNE     m33_cst_spfpu_mac_test_rounding_loop1

    /**************************************************************************/
    /* Check rounding logic - Round towards Plus/Minus Infinity               */
    /**************************************************************************/
    /*  In Round to Plus/Minus infinity (RM/RP) rounding mode,
        rounding bit is obtained from the following equation:

        r = R + S

        Check the following combinations:

        f0  R  S | r
        ----------------------------------
         1  0  0 | 0 (Case1)
         0  1  0 | 1 (Case1)
         0  0  1 | 1 (Case1)
         0  0  0 | 0 (Case2)

        Only Round towards Plus Infinity is selected by the code below.
        S2 is not reloaded in this section: it still holds the 2^-22
        multiplier (exponent field 127-22) loaded by the Round to Nearest
        check that precedes this one.
    ***************************************************************************/
    MOV     R0,#0x00400000  /* Prepare RMode bits */
    VMSR    FPSCR,R0        /* Clear FPSCR & Set Round towards Plus Inf */

    /* Prologue  */
    MOV     R3,#0x00000001  /* Single set bit, walked through bits 0..21 */
m33_cst_spfpu_mac_test_rounding_loop2:

    /* Test */
    VMOV    S3,R3
    VMUL.F32    S27,S3,S2   /* Case1 */

    /* !! Check result !! */ 
    /* On a mismatch the test is left early with the signature accumulated
       so far. */
    VMOV    R2,S27  /* S27 must be binary 1 */
    CMP     R2,#1
    BNE     m33_cst_spfpu_mac_test_end
    
   /* Generate new test vector fragment and check for end of loop */
    LSL     R3,R3,#1
    CMP     R3,#0x400000 /* 1<<22: stop after bit position 21 has been tested */
    BNE     m33_cst_spfpu_mac_test_rounding_loop2   /* End of loop */
    
    /* Test */
    MOV     R3,#((127-20)<<23)  /* 2^-20 */
    MOV     R4,#0x00400000      /* Only fraction bit 22 set */
    VMOV    S3,R3
    VMOV    S4,R4
    VMUL.F32    S26,S4,S3   /* Case2 */

    /* !! Check result !! */
    VMOV    R2,S26  /* S26 must be binary 4 */
    CMP     R2,#4
    BNE     m33_cst_spfpu_mac_test_end
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S26..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s26_s31_to_r1_destr_r2
    
    
    /**************************************************************************/
    /* Check overflow detection logic                                         */
    /**************************************************************************/
    /*  Test if overflow is detected when rounding generates carry.
        Test also status flags in FPSCR register:
        - IXC -> Inexact flag is set when round or sticky bit is set
        - OFC -> Overflow flag is set when result does not fit the precision
     **************************************************************************/
    MOV     R0,#0
    /* Presumably initialises the FPU registers from R0 (external helper) */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* Reset rounding mode back to nearest */
    MOV     R0,#0           /* Prepare RMode bits */       
    VMSR    FPSCR,R0        /* Clear FPSCR register & Round to Nearest */ 
     
    /* Prepare Test Vectors */
    LDR     R2,=0x7F000001
    MOV     R3,#0x3FFFFFFF
    MOV     R4,#0x40000000
    LDR     R5,=0x3FBFFFFF
    VMOV    S1,R2                   /* S1 = 2^127 * (1 + 2^-23)               */
    VMOV    S2,R3                   /* S2 = largest value below 2.0           */
    VMOV    S3,R4                   /* S3 = 2.0                               */
    VMOV    S4,R5                   /* S4 = largest value below 1.5           */

    /* Test: FPSCR is folded into the signature after every multiplication and
       then cleared, so the flags of each case are captured separately */
    VMUL.F32    S31,S2,S1           /* Overflow due to rounding */
    VMRS    R2,FPSCR
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register */
    VMUL.F32    S30,S3,S1           /* Overflow due to out of range exp. */
    VMRS    R2,FPSCR
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register */
    VMUL.F32    S29,S4,S1           /* No overflow due to rounding */
    VMRS    R2,FPSCR
    EOR     R1,R2,R1,ROR #1         /* !! Update Signature !! */
    VMSR    FPSCR,R0                /* Clear FPSCR register */
    
    /* !! Update Signature !! */
    /* External helper; judging by its name it folds S29..S31 into R1 and
       destroys R2 - confirm against its definition. */
    BL      m33_cst_sum_fpu_s29_s31_to_r1_destr_r2
   
   
    /*------------------------------------------------------------------------*/
    /* Test - end                                                             */
    /*------------------------------------------------------------------------*/
    /* Common exit. Reached by falling through after the last check, or by a
       branch from any failed double-check; in the latter case R1 holds only
       the signature accumulated up to the point of failure. */
m33_cst_spfpu_mac_test_end:
    VMSR    FPSCR,R12   /* Restore FPSCR register */
    /* Test result is returned in R0, according to the conventions */
    MOV     R0,R1 
    /* Plain branch (not BL) to the external tail routine, which presumably
       restores the context stacked at entry and returns to the caller -
       confirm against its definition. */
    B       m33_cst_test_tail_fpu_end 
    
    
    CST_ALIGN_BYTES_4
    /* Marks the current location for dumping the pseudo-instruction literal
       pools that hold the numeric values used by the LDR Rx,=value
       instructions. */
    CST_LTORG

#endif  /* CORTST_M33_FPU_ENABLE */

    CST_FILE_END

