/*
 * Copyright 2020-2025 Yuntu Microelectronics co.,ltd
 * All rights reserved.
 *
 * YUNTU Confidential. This software is owned or controlled by YUNTU and may only be
 * used strictly in accordance with the applicable license terms. By expressly
 * accepting such terms or by downloading, installing, activating and/or otherwise
 * using the software, you are agreeing that you have read, and that you agree to
 * comply with and are bound by, such license terms. If you do not agree to be
 * bound by the applicable license terms, then you may not retain, install,
 * activate or otherwise use the software. The production use license in
 * Section 2.3 is expressly granted for this software.
 */

/******************************************************************************
* Test summary:
* -------------
*
* Tests the forwarding logic functionality.
*   Case1: Forwarding from w0, w1 write ports to r0,r1,r2,r3 read ports
*   Case2: Data forwarded from loads (VLDR)
*       - Forwarding data from VLDRs currently in f1, to an instruction in f0
*       - Forwarding load data into non VLDRs in f1
*   Case3: Forward accumulator
*       - The accumulator of a chained SP-MAC can only be forwarded from
*         another SP-MAC
*       - The accumulator of a fused SP-MAC can also forward from a non-SP-MAC 
*         instruction in F3
*   Case4: Forwarding MUL result for MAC (Forward from back end of the MUL part 
*          of MAC).
*       - Chained
*       - Fused
******************************************************************************/

#include "CorTst_Compiler.h"
#include "CorTst_M33_Cfg.h"

#if (CORTST_M33_FPU_ENABLE==1)

    /* Compatible with ABI. */
    CST_PRES8
    /* Symbols defined in the current module but to be visible to outside */
    CST_EXPORT M33_Cst_SpfpuForwardingTest

    /* Symbols defined outside but used within current module */
    CST_EXTERN m33_cst_test_tail_fpu_end
    CST_EXTERN m33_cst_write_fpu_regs_from_r0
    CST_EXTERN m33_cst_write_fpu_s28_s31_from_r0
    CST_EXTERN m33_cst_sum_fpu_s30_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s28_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s24_s31_to_r1_destr_r2
    CST_EXTERN m33_cst_sum_fpu_s16_s31_to_r1_destr_r2
    CST_EXTERN CST_RAM_TARGET0
    
    CST_SET(PRESIGNATURE_SEED_0, 0xC6C0)
    CST_SET(PRESIGNATURE_SEED_1, 0xA590)
    
    
    /* Values stored in registers:
       - R1  ... accumulated signature
       - R11 ... PRIMASK register
       - R12 ... FPSCR register
    */
    
    /*------------------------------------------------------------------------*/
    CST_SECTION_EXEC(mcal_text)
    /*------------------------------------------------------------------------*/
    /* The ".type" directive instructs the assembler/linker that the label 
       "M33_Cst_SpfpuForwardingTest" designates a function.
       This would cause setting the least significant bit to '1' within any 
       pointer to this function, causing change to Thumb mode whenever this 
       function is called. */
    CST_TYPE(M33_Cst_SpfpuForwardingTest, function)
    CST_THUMB2
/*
 * M33_Cst_SpfpuForwardingTest
 *
 * Entry point of the SP-FPU forwarding test. Runs the four test cases below,
 * each twice (once seeded with 0x55555555 and once with 0xAAAAAAAA), and
 * returns the accumulated signature in R0.
 *
 * Register usage while the test runs:
 *   R0  ... 0 (also used as the value written to FPSCR)
 *   R1  ... accumulated signature, seeded with
 *           (PRESIGNATURE_SEED_1 << 16) | PRESIGNATURE_SEED_0
 *   R2  ... value to be forwarded (argument of the internal routines)
 *   R3  ... 0x3F000000 (0.5), used by Case3
 *   R4  ... 0x3F800000 (1.0), used by Case1 and Case3
 *   R8  ... address of CST_RAM_TARGET0 (scratch word used by Case2)
 *   R10 ... loop counter of the internal routines
 *   R11 ... saved PRIMASK
 *   R12 ... saved FPSCR
 *
 * Stack frame built here (pushed in this order): R4-R12,R14, then CONTROL,
 * then S16-S31. This routine never pops it; it exits by branching to
 * m33_cst_test_tail_fpu_end.
 * NOTE(review): the frame is presumably unwound by m33_cst_test_tail_fpu_end,
 * which is defined outside this file - confirm.
 */
M33_Cst_SpfpuForwardingTest:

    PUSH    {R4-R12,R14}
    MRS     R1,CONTROL  /* Store CONTROL prior first FPU instruction */
    PUSH    {R1}
    VPUSH   {S16-S31}
    
    /*------------------------------------------------------------------------*/
    /* Test - preparation                                                     */
    /*------------------------------------------------------------------------*/
    /* R1 = initial signature: low half-word first, then the high half-word */
    MOV     R1,#PRESIGNATURE_SEED_0
    MOVT    R1,#PRESIGNATURE_SEED_1
    
    /*------------------------------------------------------------------------*/
    /* Test - start                                                           */
    /*------------------------------------------------------------------------*/
    MRS     R11,PRIMASK /* Store PRIMASK register */
    VMRS    R12,FPSCR   /* Store FPSCR register */
    MOV     R0,#0
    VMSR    FPSCR,R0    /* Write RM mode and clear FPSCR */
    
    /* Load CST Memory addresses */
    LDR     R8,=CST_RAM_TARGET0
    
    
    /**************************************************************************/
    /* Case1: Data forwarded from w0,w1 to r0,r1,r2,r3                        */
    /**************************************************************************/
    /* R0 is 0 here (set above).
       NOTE(review): the helper presumably writes R0 to the FPU registers,
       i.e. clears them - it is defined outside this file, confirm. */
    BL      m33_cst_write_fpu_regs_from_r0
    MOV     R4,#0x3F800000  /* 1.0 */
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0x55555555      /* First value to be forwarded */                
    BL      m33_cst_spfpu_forwarding_test_fwd
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0xAAAAAAAA      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_fwd
   
   
    /**************************************************************************/
    /* Case2: Data forwarded from loads                                       */
    /**************************************************************************/
    /* Clear all FPU registers */
    BL      m33_cst_write_fpu_regs_from_r0
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0x55555555      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_vldr
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0xAAAAAAAA      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_vldr
    
    
    /**************************************************************************/
    /* Case3: Forward accumulator                                             */
    /**************************************************************************/
    /* Clear all FPU registers */
    BL      m33_cst_write_fpu_regs_from_r0
    MOV     R3,#0x3F000000  /* 0.5 */
    MOV     R4,#0x3F800000  /* 1.0 */
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0x55555555      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_accumulator
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0xAAAAAAAA      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_accumulator
    
    
    /**************************************************************************/
    /* Case4: Forwarding MUL result for MAC                                   */      
    /**************************************************************************/
    /* The FPU registers are not re-initialized before this case and the MAC
       routine does not load S1 itself.
       NOTE(review): S1 presumably still holds 1.0 from Case3 - confirm that
       the external helpers called in Case3 leave S1 untouched.
       Unlike the other cases, the 0xAAAAAAAA seed is run first here. */
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0xAAAAAAAA      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_mac
    
    /* Stimulate forwarding path with 10 different values */
    MOV     R2,#0x55555555      /* First value to be forwarded */
    BL      m33_cst_spfpu_forwarding_test_mac
    
    
    /*------------------------------------------------------------------------*/
    /* Test - end                                                             */
    /*------------------------------------------------------------------------*/
m33_cst_spfpu_forwarding_test_end:
    VMSR    FPSCR,R12   /* Restore FPSCR register */
    /* Test result is returned in R0, according to the conventions */
    MOV     R0,R1 
    /* Branch (not BL): the common tail takes over from here and this routine
       is not returned to. */
    B       m33_cst_test_tail_fpu_end 
    
    
    /*------------------------------------------------------------------------*/
    /* Internal test routines                                                 */
    /*------------------------------------------------------------------------*/
    /**************************************************************************
     * Case1: Forwarding from w0, w1 write ports to r0,r1,r2,r3 read ports
     *
     * In:   R1  ... signature accumulated so far
     *       R2  ... first value to be forwarded (raw 32-bit pattern)
     *       R4  ... bit pattern loaded into S1 (caller passes 1.0)
     *       R11 ... PRIMASK value written back after each stimulus sequence
     * Out:  R1  ... updated signature
     * Uses: R2, R10 (loop counter), S0-S10, S24-S31, flags, LR
     *
     * S0 is used as the neutral operand and as the filler value throughout.
     * NOTE(review): S0 is presumably 0 here because the caller initializes the
     * FPU registers from R0=0 beforehand - confirm.
     *
     * Each of the 8 groups consists of a producer pair followed by a consumer
     * pair; the consumer reads the producer's destination register in the very
     * next pair. The results land in S31..S24 and are folded into the
     * signature once per iteration. The loop runs 10 times; from the second
     * iteration on, the forwarded value is the current signature.
     **************************************************************************/
m33_cst_spfpu_forwarding_test_fwd:
    MOV     R10,#10         /* Initialize Loop counter */
    PUSH    {R14}           /* Keep return address across the nested BL */
    VMOV    S1,R4           /* S1 = 1.0 */
m33_cst_spfpu_forwarding_test_loop1:
    VMOV    S2,R2           /* S2 = value to be forwarded in this iteration */
    /*------------------------------------------------------------------------*/
    /* NOTE(review): macro defined elsewhere; presumably masks interrupts and
       puts the pipeline into a known state (PRIMASK is restored after the
       sequence) - confirm. */
    CST_PREPARE_PIPELINE 
    /*------------------------------------------------------------------------*/
    /* Group 1: w0 -> r0 */
    VMUL.F32    S3,S2,S1    /* Slot0 - Data is ready in F3 on w0 (S3) */
    VMOV.F32    S10,S0      /* Slot1 - S10 = S0 (overwrites previous result) */
    /* !! Hazard !! */
    VMUL.F32    S31,S3,S1   /* Slot0 - Data is required in F1 on r0 */
    VMOV.F32    S0,S0       /* Slot1 - filler */
    
    /* Group 2: w0 -> r2 */
    VMUL.F32    S4,S2,S1    /* Slot0 - Data is ready in F3 on w0 (S4) */
    VMOV.F32    S3,S0       /* Slot1 - S3 = S0 (overwrites previous result) */
    /* !! Hazard !! */
    /* NOTE(review): the slot labels of the next two lines were swapped in the
       original comments; they are given here in program order, matching the
       otherwise identical w1 -> r2 group below - confirm. */
    VMOV.F32    S0,S0       /* Slot0 - filler */
    VADD.F32    S30,S4,S0   /* Slot1 - Data is required in F1 on r2 */
    
    /* Group 3: w1 -> r0 */
    VMOV.F32    S4,S0       /* Slot0 - S4 = S0 (overwrites previous result) */
    VADD.F32    S5,S2,S0    /* Slot1 - Data is ready in F3 on w1 (S5) */
    /* !! Hazard !! */
    VMUL.F32    S29,S5,S1   /* Slot0 - Data is required in F1 on r0 */
    VMOV.F32    S0,S0       /* Slot1 - filler */
    
    /* Group 4: w1 -> r2 */
    VMOV.F32    S5,S0       /* Slot0 - S5 = S0 (overwrites previous result) */
    VADD.F32    S6,S2,S0    /* Slot1 - Data is ready in F3 on w1 (S6) */
    /* !! Hazard !! */
    VMOV.F32    S0,S0       /* Slot0 - filler */
    VADD.F32    S28,S6,S0   /* Slot1 - Data is required in F1 on r2 */
    
    /* Group 5: w0 -> r1 */
    VMUL.F32    S7,S2,S1    /* Slot0 - Data is ready in F3 on w0 (S7) */
    VMOV.F32    S6,S0       /* Slot1 - S6 = S0 (overwrites previous result) */
    /* !! Hazard !! */
    VMUL.F32    S27,S1,S7   /* Slot0 - Data is required in F1 on r1 */
    VMOV.F32    S0,S0       /* Slot1 - filler */
    
    /* Group 6: w0 -> r3 */
    VMUL.F32    S8,S2,S1    /* Slot0 - Data is ready in F3 on w0 (S8) */
    VMOV.F32    S7,S0       /* Slot1 - S7 = S0 (overwrites previous result) */
    /* !! Hazard !! */
    /* NOTE(review): slot labels corrected to program order, as in group 2 -
       confirm. */
    VMOV.F32    S0,S0       /* Slot0 - filler */
    VADD.F32    S26,S0,S8   /* Slot1 - Data is required in F1 on r3 */
    
    /* Group 7: w1 -> r1 */
    VMOV.F32    S8,S0       /* Slot0 - S8 = S0 (overwrites previous result) */
    VADD.F32    S9,S2,S0    /* Slot1 - Data is ready in F3 on w1 (S9) */
    /* !! Hazard !! */
    VMUL.F32    S25,S1,S9   /* Slot0 - Data is required in F1 on r1 */
    VMOV.F32    S0,S0       /* Slot1 - filler */
    
    /* Group 8: w1 -> r3 */
    VMOV.F32    S9,S0       /* Slot0 - S9 = S0 (overwrites previous result) */
    VADD.F32    S10,S2,S0   /* Slot1 - Data is ready in F3 on w1 (S10) */
    /* !! Hazard !! */
    VMOV.F32    S0,S0       /* Slot0 - filler */
    VADD.F32    S24,S0,S10  /* Slot1 - Data is required in F1 on r3 */
    /*------------------------------------------------------------------------*/
    MSR     PRIMASK,R11     /* Restore PRIMASK register */
    /*------------------------------------------------------------------------*/
    /* !! Update Signature !! */
    /* Fold the eight results S24-S31 into R1 (helper destroys R2) */
    BL      m33_cst_sum_fpu_s24_s31_to_r1_destr_r2
    
    /* Prepare value for next run: the signature becomes the next stimulus */
    MOV     R2,R1
    SUBS    R10,R10,#1
    BNE     m33_cst_spfpu_forwarding_test_loop1 /* Loop 10x */
    POP     {R15}   /* Exit test subroutine: pop saved LR straight into PC */
    
    
    /**************************************************************************
     * Case2: Data forwarded from loads
     *
     * In:   R1  ... signature accumulated so far
     *       R2  ... first value to be forwarded (raw 32-bit pattern)
     *       R8  ... address of the scratch word the value is stored to and
     *               loaded back from (caller passes CST_RAM_TARGET0)
     *       R11 ... PRIMASK value written back after each stimulus sequence
     * Out:  R1  ... updated signature
     * Uses: R2, R10 (loop counter), S0-S4, S16-S31, flags, LR
     *
     * S0 is used as the neutral operand and as the filler value.
     * NOTE(review): S0 is presumably 0 here because the caller initializes the
     * FPU registers from R0=0 beforehand - confirm.
     *
     * Four groups: a VLDR (first or second instruction of the leading pair)
     * is immediately followed by four VADDs that read the loaded register,
     * either as first or as second source operand. The 16 results land in
     * S16-S31 and are folded into the signature once per iteration. The loop
     * runs 10 times; from the second iteration on, the loaded value is the
     * current signature.
     **************************************************************************/
m33_cst_spfpu_forwarding_test_vldr:    
    MOV     R10,#10         /* Initialize Loop counter */
    PUSH    {R14}           /* Keep return address across the nested BL */
m33_cst_spfpu_forwarding_test_loop2:
    STR     R2,[R8]         /* Place the stimulus value in the scratch word */
    DSB                     /* Complete the store before the loads below */
    /*------------------------------------------------------------------------*/
    /* NOTE(review): macro defined elsewhere; presumably masks interrupts and
       puts the pipeline into a known state (PRIMASK is restored after the
       sequence) - confirm. */
    CST_PREPARE_PIPELINE 
    /*------------------------------------------------------------------------*/
    /* Group 1: VLDR first in pair, loaded value used as first operand */
    VLDR        S1,[R8]     /* Slot0 */
    VMOV.F32    S4,S0       /* Slot1 - S4 = S0 (overwrites previous load) */
    /* !! Hazard !! */
    VADD.F32    S16,S1,S0   /* Slot0 */
    VADD.F32    S17,S1,S0   /* Slot1 */
    VADD.F32    S18,S1,S0   /* Slot0 */ 
    VADD.F32    S19,S1,S0   /* Slot1 */ 
    
    /* Group 2: VLDR first in pair, loaded value used as second operand */
    VLDR        S2,[R8]     /* Slot0 */
    VMOV.F32    S1,S0       /* Slot1 - S1 = S0 (overwrites previous load) */
    /* !! Hazard !! */
    VADD.F32    S20,S0,S2   /* Slot0 */
    VADD.F32    S21,S0,S2   /* Slot1 */
    VADD.F32    S22,S0,S2   /* Slot0 */ 
    VADD.F32    S23,S0,S2   /* Slot1 */

    /* Group 3: VLDR second in pair, loaded value used as first operand */
    VMOV.F32    S2,S0       /* Slot0 - S2 = S0 (overwrites previous load) */
    VLDR        S3,[R8]     /* Slot1 */
    /* !! Hazard !! */
    VADD.F32    S24,S3,S0   /* Slot0 */
    VADD.F32    S25,S3,S0   /* Slot1 */
    VADD.F32    S26,S3,S0   /* Slot0 */ 
    VADD.F32    S27,S3,S0   /* Slot1 */
    
    /* Group 4: VLDR second in pair, loaded value used as second operand */
    VMOV.F32    S3,S0       /* Slot0 - S3 = S0 (overwrites previous load) */
    VLDR        S4,[R8]     /* Slot1 */
    /* !! Hazard !! */
    VADD.F32    S28,S0,S4   /* Slot0 */
    VADD.F32    S29,S0,S4   /* Slot1 */
    VADD.F32    S30,S0,S4   /* Slot0 */ 
    VADD.F32    S31,S0,S4   /* Slot1 */
    /*------------------------------------------------------------------------*/
    MSR     PRIMASK,R11     /* Restore PRIMASK register */
    /*------------------------------------------------------------------------*/
    /* !! Update Signature !! */
    /* Fold the sixteen results S16-S31 into R1 (helper destroys R2) */
    BL      m33_cst_sum_fpu_s16_s31_to_r1_destr_r2
    
    /* Prepare value for next run: the signature becomes the next stimulus */
    MOV     R2,R1
    SUBS    R10,R10,#1
    BNE     m33_cst_spfpu_forwarding_test_loop2 /* Loop 10x */
    POP     {R15}                               /* Exit subroutine */
    
    /**************************************************************************
     * Case3a: Forward accumulator (chained MAC, VMLA)
     * Case3b: Forward accumulator to fused MAC (VFMA)
     *
     * In:   R1  ... signature accumulated so far
     *       R2  ... first value to be forwarded (raw 32-bit pattern)
     *       R3  ... bit pattern loaded into S3 (caller passes 0.5)
     *       R4  ... bit pattern loaded into S1 (caller passes 1.0)
     *       R11 ... PRIMASK value written back after each stimulus sequence
     * Out:  R1  ... updated signature
     * Uses: R2, R10 (loop counter), S0-S3, S28-S31, flags, LR
     *
     * Four sequences per iteration, each writing an accumulator register and
     * then accumulating into the same register again:
     *   S31: VMLA followed directly by VMLA       (back-to-back)
     *   S30: VMLA, one filler pair, then VMLA     (one pair in between)
     *   S29: VADD followed directly by VFMA       (back-to-back)
     *   S28: VADD, one filler pair, then VFMA     (one pair in between)
     * S28-S31 are folded into the signature and then re-initialized by
     * m33_cst_write_fpu_s28_s31_from_r0 before the next iteration.
     * NOTE(review): that helper is defined outside this file; presumably it
     * writes R0 (0 in the caller) to S28-S31 - confirm.
     **************************************************************************/
m33_cst_spfpu_forwarding_test_accumulator:
    MOV     R10,#10         /* Initialize Loop counter */
    PUSH    {R14}           /* Keep return address across the nested BLs */
    VMOV    S1,R4           /* S1=1   */
    VMOV    S3,R3           /* S3=0.5 */
m33_cst_spfpu_forwarding_test_loop3:
    VMOV    S2,R2           /* S2 = stimulus value of this iteration */
    /*------------------------------------------------------------------------*/
    /* NOTE(review): macro defined elsewhere; presumably masks interrupts and
       puts the pipeline into a known state (PRIMASK is restored after the
       sequence) - confirm. */
    CST_PREPARE_PIPELINE 
    /*------------------------------------------------------------------------*/
    /* Case3a, back-to-back: accumulator S31 produced and consumed by VMLA */
    VMLA.F32    S31,S2,S1   /* Slot0 f3 */
    VMOV.F32    S0,S0       /* Slot1 f3 */         
    /* !! Hazard !! */
    VMLA.F32    S31,S2,S3   /* Slot0 f1 */
    VMOV.F32    S0,S0       /* Slot1 f1 */
    
    /* Case3a, spaced: same as above on S30 with one filler pair in between */
    VMLA.F32    S30,S2,S1   /* Slot0 f3 */
    VMOV.F32    S0,S0       /* Slot1 f3 */
    VMOV.F32    S0,S0       /* Slot0 f2 */
    VMOV.F32    S0,S0       /* Slot1 f2 */
    VMLA.F32    S30,S2,S3   /* Slot0 f1 */
    VMOV.F32    S0,S0       /* Slot1 f1 */
    
    /* Case3b, back-to-back: accumulator S29 produced by a non-MAC (VADD) and
       consumed by a fused MAC (VFMA) */
    VADD.F32    S29,S2,S0   /* Slot0 */
    VMOV.F32    S0,S0       /* Slot1 */
    /* !! Hazard !!*/
    VFMA.F32    S29,S2,S1   /* Slot0 */
    VMOV.F32    S0,S0       /* Slot1 */
    
    /* Case3b, spaced: same as above on S28 with one filler pair in between */
    VADD.F32    S28,S2,S0   /* Slot0 */
    VMOV.F32    S0,S0       /* Slot1 */
    VMOV.F32    S0,S0       /* Slot0 */
    VMOV.F32    S0,S0       /* Slot1 */
    VFMA.F32    S28,S2,S1   /* Slot0 */
    VMOV.F32    S0,S0       /* Slot1 */
    /*------------------------------------------------------------------------*/
    MSR     PRIMASK,R11     /* Restore PRIMASK register */
    /*------------------------------------------------------------------------*/
    /* !! Update Signature !! */
    /* Fold the four accumulators S28-S31 into R1 (helper destroys R2) */
    BL      m33_cst_sum_fpu_s28_s31_to_r1_destr_r2
    
    /* Prepare value for next run: the signature becomes the next stimulus */
    MOV     R2,R1
    /* Re-initialize the accumulators so every iteration starts identically */
    BL      m33_cst_write_fpu_s28_s31_from_r0
    SUBS    R10,R10,#1
    BNE     m33_cst_spfpu_forwarding_test_loop3 /* Loop 10x */
    POP     {PC}                                /* Exit subroutine */
    
    /**************************************************************************
     * Case4a: Chained MAC (VMLA)
     * Case4b: Fused MAC (VFMA)
     *
     * In:   R1  ... signature accumulated so far
     *       R2  ... first value to be forwarded (raw 32-bit pattern)
     *       R11 ... PRIMASK value written back after each stimulus sequence
     *       S1  ... multiplicand; NOT loaded by this routine
     *       S30, S31 ... initial accumulator values; NOT loaded by this
     *               routine before the first iteration
     * Out:  R1  ... updated signature
     * Uses: R2, R10 (loop counter), S2, S30, S31, flags, LR
     *
     * NOTE(review): S1, S30 and S31 presumably still hold the values left by
     * the preceding Case3 (S1 = 1.0, S30/S31 re-initialized) - confirm against
     * the caller and the external helpers.
     *
     * Per iteration one VMLA into S31 and one VFMA into S30 are executed, each
     * followed by a NOP. S30/S31 are folded into the signature, then both
     * accumulators are re-seeded with the new signature.
     **************************************************************************/    
m33_cst_spfpu_forwarding_test_mac:
    MOV     R10,#10         /* Initialize Loop counter */
    PUSH    {R14}           /* Keep return address across the nested BL */
m33_cst_spfpu_forwarding_test_loop4:
    VMOV    S2,R2           /* S2 = stimulus value of this iteration */
    /*------------------------------------------------------------------------*/
    /* NOTE(review): macro defined elsewhere; presumably masks interrupts and
       puts the pipeline into a known state (PRIMASK is restored after the
       sequence) - confirm. */
    CST_PREPARE_PIPELINE 
    /*------------------------------------------------------------------------*/
    VMLA.F32    S31,S1,S2   /* Slot0 */
    NOP                     /* Slot1 MUL result is chained to ADD (Case4a) */
    VFMA.F32    S30,S1,S2   /* Slot0 */ 
    NOP                     /* Slot1 MUL result is fed to fused ADD (Case4b) */
    /*------------------------------------------------------------------------*/
    MSR     PRIMASK,R11     /* Restore PRIMASK register */
    /*------------------------------------------------------------------------*/
    /* !! Update signature !! */
    /* Fold the two accumulators S30/S31 into R1 (helper destroys R2) */
    BL      m33_cst_sum_fpu_s30_s31_to_r1_destr_r2
    
    /* Prepare value for next run: the signature becomes the next stimulus */
    MOV     R2,R1
    /* D15 aliases S30 (low word) and S31 (high word): both accumulators are
       re-seeded with the current signature (R1 == R2 at this point). */
    VMOV    D15,R1,R2
    SUBS    R10,R10,#1
    BNE     m33_cst_spfpu_forwarding_test_loop4 /* Loop 10x */
    POP     {PC}                                /* Exit subroutine */


    CST_ALIGN_BYTES_4
    /* Marks the current location for dumping the literal pools that hold the
       numeric values of symbolic names used within LDR pseudo-instructions. */
    CST_LTORG

#endif  /* CORTST_M33_FPU_ENABLE */

    CST_FILE_END

