﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

/////////////////////////////////////////////////////////////////////////////
//
// latticeMP.cpp
//
// Draws a lattice using multiple cores/threads
//
//////////////////////////////////////////////////////////////////////////////

#include <cstdio>
#include <cstring>
#include <cmath>
#include <algorithm>

#include <gfx/demo.h>

#include <nn/os.h>
#include <nn/nn_Result.h>
#include <nnt.h>
#include <nnt/nnt_Argument.h>

// Path handed to DEMOGfxLoadShadersFromFile() in SceneInit()
static const char * const SHADER_FILE = "shaders/latticeMp/transform";

// Pipeline (shaders + render state) bound by FrameInit()
static DEMOGfxPipeline s_AppPipeline;

// Synchronization
// Barrier shared by GfxMain and both SubMain instances; it is initialized
// for 3 participants and awaited once at start-up and once at shutdown.
static nn::os::BarrierType s_Rendezvous;

// Thread Setup
// Stack size, in bytes, of each created thread
static const int APP_STACK_SIZE = 1024 * 1024;
// s_DefaultThreadSub1 runs GfxMain, s_DefaultThreadSub2 runs SubMain
static nn::os::ThreadType s_DefaultThreadSub1;
static nn::os::ThreadType s_DefaultThreadSub2;
// Core assignments; the initial values are overwritten by SetupCoreId()
static int s_MainCoreId = 1;
static int s_SubCoreId1 = 0;
static int s_SubCoreId2 = 2;
// Stacks for the two created threads
NN_ALIGNAS( 4096 ) static char s_pSub1Stack[ APP_STACK_SIZE ];
NN_ALIGNAS( 4096 ) static char s_pSub2Stack[ APP_STACK_SIZE ];

// Draw Tasks
// Capacity of s_TaskInfo and maximum count of s_TaskQueueSemaphore
static const int NUM_TASKS = 16;

// Forward declarations; the full definitions appear further down the file
typedef struct _DisplayListInfo DisplayListInfo;
typedef struct _DynamicMatrix DynamicMatrix;

// One unit of command-recording work. SceneDraw() fills these in and
// SubMain() executes them by calling Callback with a pointer to the entry.
typedef struct _DrawTaskInfo
{
    // Index of this task in s_TaskInfo. DrawModel() derives its first dynamic
    // uniform slot from ( tag - 1 ); the address of this field is posted to
    // s_MessageQueue when the task completes.
    u32 tag;
    union
    {
        struct
        {
            // X offset of the lattice column drawn by this task
            s32 xOffset;
            // Value of s_AnimTranZ when the task was queued
            f32 animTransZ;
        };
        // Aliases xOffset; not referenced by the code in this file
        u32 ticks;
    };
    // Display list slot (command buffer + fence) this task records into
    DisplayListInfo* pDLInfo;
    // Work function: FrameInit or DrawModel
    u32 ( *Callback )( void* data );
    // Start of the mapped dynamic uniform buffer for the current frame
    // (only assigned for DrawModel tasks)
    DynamicMatrix* pMatrix;
} DrawTaskInfo;

// Task descriptors for the current frame, filled in by SceneDraw()
static DrawTaskInfo s_TaskInfo[ NUM_TASKS ];
// Completion queue: each finished task posts one message, SubmitDLs() drains it
static nn::os::MessageQueueType s_MessageQueue;
// Released once per queued task; SubMain() acquires it before taking a task
static nn::os::SemaphoreType s_TaskQueueSemaphore;
// Guards s_NextTask while a worker claims a task index
static nn::os::MutexType s_TaskQueueMutex;
// Backing storage for s_MessageQueue
static uintptr_t s_MessageQueueBuffer[ NUM_TASKS + 1 ];
// Index of the next unclaimed entry in s_TaskInfo; reset to 0 every frame
static int s_NextTask = 0;
// Cleared by GfxMain to make the SubMain loops exit.
// NOTE(review): plain bool shared between threads; the code relies on
// nn::os::FenceMemoryAnyAny() and the semaphore rather than an atomic type.
static bool s_DemoIsRunning = true;

// Constant Buffers
// Vertex-stage constant buffer slots for "ub_static" and "ub_dynamic",
// looked up in SceneInit()
static int s_StaticMatrixLoc;
static int s_DynamicMatrixLoc;

//---------------------------------------------------------------------------*
//  Model Data
//---------------------------------------------------------------------------*/
static const int STRUT_LN = 130;     /// long side of strut
static const int STRUT_SD = 4;       /// short side of strut
static const int JOINT_SD = 10;      /// joint is a cube

/// Vertices
/// Interleaved vertex data: each vertex is 3 position floats followed by
/// 4 color floats (see MODEL_VTX_STRIDE). The trailing "No:Id" comment gives
/// the vertex number used in the diagram below the array, followed by the
/// vertex's index within this array.
static f32 s_ModelVertices[] =
{
    //      x             y             z          No:Id
    -STRUT_SD, STRUT_SD, -STRUT_SD,   // 0:0
    //   r          g          b          a
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_SD,   // 1:1
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_LN,   // 2:2
    0.5f, 0.5f, 0.42f, 1.0f,
    -STRUT_SD, STRUT_SD, -STRUT_SD,   // 0:3
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_LN,   // 2:4
    0.5f, 0.5f, 0.42f, 1.0f,
    -STRUT_SD, STRUT_SD, -STRUT_LN,   // 3:5
    0.5f, 0.5f, 0.42f, 1.0f,

    STRUT_SD, STRUT_SD, -STRUT_SD,   // 1:6
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, -STRUT_SD, -STRUT_SD,   // 4:7
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, -STRUT_SD, -STRUT_LN,   // 5:8
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_SD,   // 1:9
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, -STRUT_SD, -STRUT_LN,   // 5:10
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_LN,   // 2:11
    0.3f, 0.3f, 0.3f, 1.0f,

    STRUT_SD, STRUT_SD, -STRUT_SD,   // 1:12
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_SD, STRUT_SD, STRUT_SD,   // 6:13
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_LN, STRUT_SD, STRUT_SD,   // 7:14
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_SD,   // 1:15
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_LN, STRUT_SD, STRUT_SD,   // 7:16
    0.5f, 0.5f, 0.42f, 1.0f,
    STRUT_LN, STRUT_SD, -STRUT_SD,   // 8:17
    0.5f, 0.5f, 0.42f, 1.0f,

    STRUT_SD, STRUT_SD, STRUT_SD,   // 6:18
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_SD, -STRUT_SD, STRUT_SD,   //10:19
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_LN, STRUT_SD, STRUT_SD,   // 7:20
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_SD, -STRUT_SD, STRUT_SD,   //10:21
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_LN, -STRUT_SD, STRUT_SD,   // 9:22
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_LN, STRUT_SD, STRUT_SD,   // 7:23
    0.2f, 0.2f, 0.2f, 1.0f,


    STRUT_SD, STRUT_SD, -STRUT_SD,   //  1:24
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_LN, STRUT_SD,   // 11:25
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_SD, STRUT_SD,   //  6:26
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_SD, -STRUT_SD,   //  1:27
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_LN, -STRUT_SD,   // 12:28
    0.3f, 0.3f, 0.3f, 1.0f,
    STRUT_SD, STRUT_LN, STRUT_SD,   // 11:29
    0.3f, 0.3f, 0.3f, 1.0f,

    STRUT_SD, STRUT_SD, STRUT_SD,   //  6:30
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_SD, STRUT_LN, STRUT_SD,   // 11:31
    0.2f, 0.2f, 0.2f, 1.0f,
    -STRUT_SD, STRUT_SD, STRUT_SD,   // 14:32
    0.2f, 0.2f, 0.2f, 1.0f,
    -STRUT_SD, STRUT_SD, STRUT_SD,   // 14:33
    0.2f, 0.2f, 0.2f, 1.0f,
    STRUT_SD, STRUT_LN, STRUT_SD,   // 11:34
    0.2f, 0.2f, 0.2f, 1.0f,
    -STRUT_SD, STRUT_LN, STRUT_SD,   // 13:35
    0.2f, 0.2f, 0.2f, 1.0f,

    -JOINT_SD, JOINT_SD, -JOINT_SD,   // 20:36
    0.5f, 0.5f, 0.42f, 1.0f,
    -JOINT_SD, JOINT_SD, JOINT_SD,   // 22:37
    0.5f, 0.5f, 0.42f, 1.0f,
    JOINT_SD, JOINT_SD, -JOINT_SD,   // 21:38
    0.5f, 0.5f, 0.42f, 1.0f,
    JOINT_SD, JOINT_SD, -JOINT_SD,   // 21:39
    0.5f, 0.5f, 0.42f, 1.0f,
    -JOINT_SD, JOINT_SD, JOINT_SD,   // 22:40
    0.5f, 0.5f, 0.42f, 1.0f,
    JOINT_SD, JOINT_SD, JOINT_SD,   // 23:41
    0.5f, 0.5f, 0.42f, 1.0f,

    JOINT_SD, JOINT_SD, -JOINT_SD,   // 21:42
    0.3f, 0.3f, 0.3f, 1.0f,
    JOINT_SD, JOINT_SD, JOINT_SD,   // 23:43
    0.3f, 0.3f, 0.3f, 1.0f,
    JOINT_SD, -JOINT_SD, -JOINT_SD,   // 24:44
    0.3f, 0.3f, 0.3f, 1.0f,
    JOINT_SD, JOINT_SD, JOINT_SD,   // 23:45
    0.3f, 0.3f, 0.3f, 1.0f,
    JOINT_SD, -JOINT_SD, JOINT_SD,   // 25:46
    0.3f, 0.3f, 0.3f, 1.0f,
    JOINT_SD, -JOINT_SD, -JOINT_SD,   // 24:47
    0.3f, 0.3f, 0.3f, 1.0f,

    -JOINT_SD, JOINT_SD, JOINT_SD,   // 22:48
    0.2f, 0.2f, 0.2f, 1.0f,
    -JOINT_SD, -JOINT_SD, JOINT_SD,   // 26:49
    0.2f, 0.2f, 0.2f, 1.0f,
    JOINT_SD, JOINT_SD, JOINT_SD,   // 23:50
    0.2f, 0.2f, 0.2f, 1.0f,
    JOINT_SD, JOINT_SD, JOINT_SD,   // 23:51
    0.2f, 0.2f, 0.2f, 1.0f,
    -JOINT_SD, -JOINT_SD, JOINT_SD,   // 26:52
    0.2f, 0.2f, 0.2f, 1.0f,
    JOINT_SD, -JOINT_SD, JOINT_SD,   // 25:53
    0.2f, 0.2f, 0.2f, 1.0f,

};
// GPU buffer created from s_ModelVertices in SceneInit()
static DEMOGfxBuffer s_ModelVertexBuffer;

//static const int MODEL_VTX_COUNT = 54 * 3;
// Size of the whole vertex array in bytes
static const int MODEL_VTX_SIZE = ( sizeof( s_ModelVertices ) );
// Bytes per vertex: 3 position floats + 4 color floats
static const int MODEL_VTX_STRIDE = ( sizeof( f32 ) * 3 + sizeof( f32 ) * 4 );
// Attribute offsets within a vertex, counted in f32 elements (SceneInit()
// multiplies them by sizeof( f32 ) to get byte offsets)
static const int MODEL_POS_OFFSET = 0;
static const int MODEL_CLR_OFFSET = 3;


// ----------------------------------------------------
//
//            13----------- 11
//             |          /|
//             |         / |
//             |        /  |
//             |        12 |
//             |        |  |
//             |        |  |
//             |        |  |
//             |        |  |
//            14--------|-- 6--------------- 7
//                      | /|                /|
//                      |/ |               / |
//                      |  |              /  |
//         0 ----------- 1----------------8  |
//         /           /|  | 10              |
//        /           / |  ------------------ 9
//       /           /  |
//      /           /   |
//     /           /   /4
//   3 ------------ 2 /
//                |  /
//                | /
//                |/
//                5
//
//       22 ------------ 23
//        /|             /|
//       / |            / |
//      /  |           /  |
//     /   |          /   |
//  20 --------------- 21 |
//         |          |   |
//       26 ----------|--- 25
//                    |  /
//                    | /
//                    |/
//                    24
//  y
//   ^  ^ z
//   | /
//   |/
//   ------> x
//
//---------------------------------------------------------

/// Indices
// Number of indices issued per DrawIndexed() call in DrawModel()
static const int MODEL_IDX_COUNT = 54;
// Size of the index data in bytes
static const int MODEL_IDX_SIZE = ( sizeof( u32 ) * MODEL_IDX_COUNT );

// Triangle-list indices, six per quad (two triangles). The vertex array is
// already fully expanded, so the indices simply run 0..53 in order.
static u32 s_ModelIndices[] = {
    0, 1, 2, 3, 4, 5, // STRUT1 #0
    6, 7, 8, 9, 10, 11, // STRUT1 #1
    12, 13, 14, 15, 16, 17, // STRUT2 #2
    18, 19, 20, 21, 22, 23, // STRUT2 #3
    24, 25, 26, 27, 28, 29, // STRUT3 #4
    30, 31, 32, 33, 34, 35, // STRUT3 #5

    36, 37, 38, 39, 40, 41, // JOINT #6
    42, 43, 44, 45, 46, 47, // JOINT #7
    48, 49, 50, 51, 52, 53, // JOINT #8
};
// GPU buffer created from s_ModelIndices in SceneInit()
static DEMOGfxBuffer s_ModelIndexBuffer;

//static const int NUM_ATTRIB = 2;
// Vertex buffer binding index shared by both vertex attributes
static const int BUFFER_IDX = 0;

// -----

// Matrices
// Contents of the "ub_static" constant buffer: written once by SceneInit()
// via CameraInit() and bound for every frame by FrameInit().
typedef struct _StaticMatrix
{
    Mtx44   viewMtx44;
    Mtx44   projMtx44;
} StaticMatrix;

// GPU buffer holding the single StaticMatrix
static DEMOGfxBuffer s_StaticUniforms;

// Contents of the "ub_dynamic" constant buffer: one model matrix per
// lattice cell, rewritten every frame by DrawModel().
typedef struct _DynamicMatrix
{
    Mtx44   modelMtx44;
} DynamicMatrix;
// sizeof( DynamicMatrix ) rounded up to the constant buffer alignment;
// computed in InitModelDLPool() and used as the per-cell stride.
static size_t s_AlignedDynamicSize = 0;

// Current Z translation applied to the lattice (advanced by AnimTick())
static f32 s_AnimTranZ;
// Number of AnimTick() calls so far; shown as the frame counter by PrintInfo()
static u32 s_Ticks;

//---------------------------------------------------------------------------*
//  Display list data
//---------------------------------------------------------------------------*/
// Number of display list slots: two per task
static const int NUM_MODEL_DLS = ( 2 * NUM_TASKS );
// Command memory given to each display list slot, in bytes
static const int MODEL_DL_SIZE = ( 128 * 1024 );
// Total command memory allocated for all slots
static const int MODEL_DL_POOL_SIZE = NUM_MODEL_DLS * MODEL_DL_SIZE;
// Sub-pool holding the command memory for every slot (InitModelDLPool())
static DEMOGfxMemPool* pCommandMemory = NULL;
// Index of the slot most recently handed out by GetModelDL()
static u32 modelDLCount = 0;  // current display list

//  Control Memory Chunks
// Size of one control memory chunk, in bytes
static const int CONTROL_MEMORY_SIZE = 4 * 1024;
// Total control memory: one chunk per display list slot
static const int CONTROL_MEMORY_POOL_SIZE = 2 * NUM_TASKS * CONTROL_MEMORY_SIZE;
// Base of the control memory allocation (InitModelDLPool())
static void* s_pControlMemory = NULL;
// Chunk index last handed out by OutOfControlMemoryEventCallback()
static size_t s_CurrentControlChunk = 0;

// Location of one DynamicMatrix inside the dynamic uniform buffers
typedef struct _UniformView
{
    // Address of the matrix in s_DynamicUniforms[ 0 ] and s_DynamicUniforms[ 1 ]
    nn::gfx::GpuAddress gpuAddress[ 2 ];
    // Size passed to SetConstantBuffer(); set to sizeof( DynamicMatrix )
    int size;
} UniformView;

// Have one uniform block per frame to hold all of the data
// (indexed by s_FrameCount % 2)
static DEMOGfxBuffer s_DynamicUniforms[ 2 ];

// One display list slot: a command buffer, its slice of command memory and
// the fence used to know when the GPU has finished with it.
typedef struct _DisplayListInfo
{
    // Passed to ExecuteCommand() by SubmitDLs(); waited on by GetModelDL()
    // before a used slot is handed out again
    nn::gfx::Fence fence;
    // Array of maxDynamicUniforms views, allocated in InitModelDLPool()
    UniformView* pView;
    // Command buffer recorded by FrameInit() / DrawModel()
    nn::gfx::CommandBuffer cb;
    // Number of entries in pView
    int maxDynamicUniforms;
    // Byte offset of this slot's command memory within pCommandMemory
    u32   offset;
    u32   maxSize;  // maximum size of the display list buffer in bytes
    // Set the first time GetModelDL() returns this slot; never cleared
    bool  used;
} DisplayListInfo;

// Frames drawn so far; s_FrameCount % 2 selects the dynamic uniform buffer
static int s_FrameCount = 0;

// All display list slots, handed out in rotation by GetModelDL()
static DisplayListInfo s_ModelDLInfo[ NUM_MODEL_DLS ];

// Viewport/Scissor
// State object covering the whole color buffer, plus the memory backing it
// (freed with DEMOGfxFreeMEM2() in GfxMain)
static nn::gfx::ViewportScissorState s_ViewportScissor;
static void* s_pViewportScissorData;

// Prototypes for the file-local functions defined below
static void CameraInit( Mtx44 resultProjMtx44, Mtx44 resultViewMtx44 );
static int SceneInit();
static u32 FrameInit( void* data );
static u32 DrawModel( void* data );
static void SubmitDLs( u32 numTasks );
static int SceneDraw();
static void ModelTick( Mtx44 resultMtx44, s32 offsetX, s32 offsetY, s32 offsetZ );
static void AnimTick();
static void PrintInfo();
static DisplayListInfo* GetModelDL();
static void InitModelDLPool();

// Held by both out-of-memory callbacks while they hand out more memory
static nn::os::MutexType s_OutOfMemoryCallbackMutex;

// Called when a command buffer has used up its command memory.
// Claims the next display list slot and appends that slot's slice of the
// command memory pool to the command buffer. The whole operation runs under
// s_OutOfMemoryCallbackMutex.
static void OutOfCommandMemoryEventCallback(
    nn::gfx::CommandBuffer* pCommandBuffer, const nn::gfx::OutOfMemoryEventArg& )
{
    nn::os::LockMutex( &s_OutOfMemoryCallbackMutex );

    // Only the slot's memory range is used; its command buffer is not touched
    DisplayListInfo& spareSlot = *GetModelDL();

    pCommandBuffer->AddCommandMemory(
        pCommandMemory->GetPool(),
        pCommandMemory->GetBaseOffset() + spareSlot.offset,
        spareSlot.maxSize );

    nn::os::UnlockMutex( &s_OutOfMemoryCallbackMutex );
}

// Called when a command buffer has used up its control memory.
// Advances s_CurrentControlChunk (wrapping at the end of the pool) and
// appends that chunk to the command buffer, all under
// s_OutOfMemoryCallbackMutex.
static void OutOfControlMemoryEventCallback(
    nn::gfx::CommandBuffer* pCommandBuffer, const nn::gfx::OutOfMemoryEventArg& )
{
    nn::os::LockMutex( &s_OutOfMemoryCallbackMutex );

    // Step to the following chunk, returning to chunk 0 past the last one
    size_t nextChunk = s_CurrentControlChunk;
    nextChunk++;
    if ( nextChunk >= CONTROL_MEMORY_POOL_SIZE / CONTROL_MEMORY_SIZE )
    {
        nextChunk = 0;
    }
    s_CurrentControlChunk = nextChunk;

    void* const pChunk = nn::util::BytePtr( s_pControlMemory,
        ( s_CurrentControlChunk * CONTROL_MEMORY_SIZE ) ).Get();
    pCommandBuffer->AddControlMemory( pChunk, CONTROL_MEMORY_SIZE );

    nn::os::UnlockMutex( &s_OutOfMemoryCallbackMutex );
}

// Fills in the camera matrices.
//   resultProjMtx44 : receives the perspective projection matrix
//   resultViewMtx44 : receives the look-at (view) matrix, widened to 4x4
static void CameraInit( Mtx44 resultProjMtx44, Mtx44 resultViewMtx44 )
{
    // --- Projection -------------------------------------------------------
    const f32 persp = 50.0f;
    const f32 nearPlane = 50.0f;
    const f32 farPlane = 2000.0f;
    const f32 aspectRatio = static_cast< f32 >( DEMOColorBufferInfo.GetWidth() )
        / static_cast< f32 >( DEMOColorBufferInfo.GetHeight() );

    MTXPerspective( resultProjMtx44, persp, aspectRatio, nearPlane, farPlane );

    // --- View -------------------------------------------------------------
    Vec cameraPos = { 90.0f, 110.0f, 13.0f };
    Vec upDir = { 0.20f, 0.97f, 0.0f };
    Vec lookTarget = { -110.0f, -70.0f, -190.0f };

    // MTXLookAt produces a row-major 3x4 matrix; expand it to 4x4 afterwards
    Mtx viewMtx34;
    MTXLookAt( viewMtx34, &cameraPos, &upDir, &lookTarget );
    MTX34To44( viewMtx34, resultViewMtx44 );
}

// Update Animation
static void AnimTick()
{
    u32    animSteps = STRUT_LN;
    f32    animLoopBack = ( f32 ) STRUT_LN;
    f32    animStepFwd = animLoopBack / ( f32 ) animSteps;

    s_AnimTranZ += animStepFwd;

    if ( ( s_Ticks % animSteps ) == 0 )
        s_AnimTranZ = animLoopBack;

    s_Ticks++;
}

// Builds the model matrix for one lattice cell: identity plus a translation.
//   resultMtx44 : receives the matrix
//   offsetX/Y/Z : cell position; X is shifted by two strut lengths and Z by
//                 the current animation translation (s_AnimTranZ)
static void ModelTick( Mtx44 resultMtx44, s32 offsetX, s32 offsetY, s32 offsetZ )
{
    const f32 translateX = offsetX + 2.0f * STRUT_LN;
    const f32 translateY = static_cast< f32 >( offsetY );
    const f32 translateZ = offsetZ + s_AnimTranZ;

    MTX44Identity( resultMtx44 );

    // The translation lives in the fourth column
    resultMtx44[ 0 ][ 3 ] = translateX;
    resultMtx44[ 1 ][ 3 ] = translateY;
    resultMtx44[ 2 ][ 3 ] = translateZ;
}

// DL Info: Initialize the model display list memory pool
static void InitModelDLPool()
{
    u32 idx;

    // DL Info: Allocate the memory used for display lists
    pCommandMemory = DEMOGfxSharedPool->AllocSubPool( MODEL_DL_POOL_SIZE, nn::gfx::CommandBuffer::GetCommandMemoryAlignment( &DEMODevice ) );
    ASSERT( pCommandMemory != NULL );

    s_pControlMemory = DEMOGfxAllocMEM2( CONTROL_MEMORY_POOL_SIZE, nn::gfx::CommandBuffer::GetControlMemoryAlignment( &DEMODevice ) );
    ASSERT( s_pControlMemory != NULL );

    // determine the alignment
    {
        size_t alignment = 1;
        nn::gfx::Buffer::InfoType info;
        info.SetDefault();
        info.SetGpuAccessFlags( nn::gfx::GpuAccess_ConstantBuffer );
        info.SetSize( sizeof( DynamicMatrix ) );

        alignment = nn::gfx::Buffer::GetBufferAlignment( &DEMODevice, info );
        s_AlignedDynamicSize = ( sizeof( DynamicMatrix ) + alignment - 1 ) & ( ~( alignment - 1 ) );
    }

    s_DynamicUniforms[ 0 ].Initialize( 12 * 11 * 16 * s_AlignedDynamicSize, NULL, nn::gfx::GpuAccess_ConstantBuffer, 0 );
    s_DynamicUniforms[ 1 ].Initialize( 12 * 11 * 16 * s_AlignedDynamicSize, NULL, nn::gfx::GpuAccess_ConstantBuffer, 0 );

    // DL Info: Initialize the display list info structures for each display list
    for ( idx = 0; idx < NUM_MODEL_DLS; idx++ )
    {
        nn::gfx::CommandBuffer::InfoType cbInfo;
        cbInfo.SetDefault();
        cbInfo.SetQueueCapability( nn::gfx::QueueCapability_Graphics );
        cbInfo.SetCommandBufferType( nn::gfx::CommandBufferType_Direct );

        s_ModelDLInfo[ idx ].cb.Initialize( &DEMODevice, cbInfo );

        s_ModelDLInfo[ idx ].cb.SetOutOfCommandMemoryEventCallback( OutOfCommandMemoryEventCallback );
        s_ModelDLInfo[ idx ].cb.SetOutOfControlMemoryEventCallback( OutOfControlMemoryEventCallback );

        // Split control memory up
        s_ModelDLInfo[ idx ].cb.AddControlMemory( nn::util::BytePtr( s_pControlMemory, idx * CONTROL_MEMORY_SIZE ).Get(), CONTROL_MEMORY_SIZE );

        nn::gfx::Fence::InfoType fenceInfo;
        fenceInfo.SetDefault();
        s_ModelDLInfo[ idx ].fence.Initialize( &DEMODevice, fenceInfo );

        s_ModelDLInfo[ idx ].offset = ( MODEL_DL_SIZE * idx );
        s_ModelDLInfo[ idx ].maxSize = MODEL_DL_SIZE;
        s_ModelDLInfo[ idx ].used = false;
        s_ModelDLInfo[ idx ].maxDynamicUniforms = 12 * 11 * 16; // Total x/y/z of the grid
        s_ModelDLInfo[ idx ].pView = new UniformView[ s_ModelDLInfo[ idx ].maxDynamicUniforms ];
        for ( int uniformId = 0; uniformId < s_ModelDLInfo[ idx ].maxDynamicUniforms; uniformId++ )
        {
            s_ModelDLInfo[ idx ].pView[ uniformId ].gpuAddress[ 0 ] = s_DynamicUniforms[ 0 ].gpuAddress;
            s_ModelDLInfo[ idx ].pView[ uniformId ].gpuAddress[ 1 ] = s_DynamicUniforms[ 1 ].gpuAddress;
            s_ModelDLInfo[ idx ].pView[ uniformId ].gpuAddress[ 0 ].Offset( uniformId * s_AlignedDynamicSize );
            s_ModelDLInfo[ idx ].pView[ uniformId ].gpuAddress[ 1 ].Offset( uniformId * s_AlignedDynamicSize );
            s_ModelDLInfo[ idx ].pView[ uniformId ].size = sizeof( DynamicMatrix );
        }
    }
}

// DL Info: Hands out the next display list slot in rotation.
// If the slot has been handed out before, waits (up to one second) on its
// fence before returning it. The returned slot is marked as used.
static DisplayListInfo* GetModelDL()
{
    // Step to the next slot, wrapping back to slot 0 after the last one
    modelDLCount = ( modelDLCount + 1 >= NUM_MODEL_DLS ) ? 0 : modelDLCount + 1;

    DisplayListInfo& slot = s_ModelDLInfo[ modelDLCount ];

    if ( slot.used )
    {
        // DL Info: the slot was handed out earlier, so wait on its fence
        //DEMOPrintf( "Waiting for DL finish...\n" );
        nn::gfx::SyncResult syncResult = slot.fence.Sync( nn::TimeSpan::FromSeconds( 1 ) );
        ASSERT( syncResult == nn::gfx::SyncResult_Success );
        NN_UNUSED( syncResult );
    }

    slot.used = true;
    return &slot;
}

// The init function for the rendering portions of this app.
// Loads the shaders, creates the vertex/index/static-uniform buffers, sets up
// the display list pool, the pipeline and the viewport/scissor state.
// Always returns 1.
static int SceneInit()
{

    // Reset the animation state
    s_AnimTranZ = 0;
    s_Ticks = 0;

    DEMOGfxLoadShadersFromFile(&s_AppPipeline.shaders, 0, SHADER_FILE);

    // Uniform Location Lookup
    s_StaticMatrixLoc = s_AppPipeline.shaders.GetInterfaceSlot( nn::gfx::ShaderStage_Vertex, nn::gfx::ShaderInterfaceType_ConstantBuffer, "ub_static" );
    s_DynamicMatrixLoc = s_AppPipeline.shaders.GetInterfaceSlot( nn::gfx::ShaderStage_Vertex, nn::gfx::ShaderInterfaceType_ConstantBuffer, "ub_dynamic" );

    // Vertex attribute layout: position (3 floats) then color (4 floats),
    // both read from the same interleaved buffer
    DEMOGfxInitShaderAttribute( &s_AppPipeline.shaders, "a_position", BUFFER_IDX, MODEL_POS_OFFSET * sizeof( f32 ), nn::gfx::AttributeFormat_32_32_32_Float );
    DEMOGfxInitShaderAttribute( &s_AppPipeline.shaders, "a_color", BUFFER_IDX, MODEL_CLR_OFFSET * sizeof( f32 ), nn::gfx::AttributeFormat_32_32_32_32_Float );
    DEMOGfxInitShaderVertexBuffer( &s_AppPipeline.shaders, BUFFER_IDX, MODEL_VTX_STRIDE, 0 );

    // Create the vertex buffer from s_ModelVertices
    s_ModelVertexBuffer.Initialize( MODEL_VTX_SIZE, &s_ModelVertices[0], nn::gfx::GpuAccess_VertexBuffer, 0 );

    // Create the index buffer from s_ModelIndices
    s_ModelIndexBuffer.Initialize( MODEL_IDX_SIZE, &s_ModelIndices[0], nn::gfx::GpuAccess_IndexBuffer, 0 );

    // Setup the static matrix buffers: the view and projection matrices are
    // written once here and never updated afterwards
    {
        s_StaticUniforms.Initialize( sizeof( StaticMatrix ), NULL, nn::gfx::GpuAccess_ConstantBuffer, 0 );
        StaticMatrix* pMatrix = s_StaticUniforms.Map< StaticMatrix >();
        CameraInit( pMatrix->projMtx44, pMatrix->viewMtx44 );
#if NN_GFX_IS_TARGET_GX
        GX2EndianSwap( pMatrix, sizeof( StaticMatrix ) );
#endif
        s_StaticUniforms.Unmap();
    }

    // DL Info: Initialize the model DL pool
    InitModelDLPool();

    s_AppPipeline.SetDefaults();

    // One color target using the format of the demo color buffer
    s_AppPipeline.colorTargetStateCount = 1;
    s_AppPipeline.blendTargetStateCount = 1;
    s_AppPipeline.blendTargetStateInfoArray[ 0 ].SetDefault();
    s_AppPipeline.colorTargetStateInfoArray[ 0 ].SetDefault();
    s_AppPipeline.colorTargetStateInfoArray[ 0 ].SetFormat( DEMOColorBufferInfo.GetImageFormat() );

    // Setup the pipeline
    s_AppPipeline.depthStencilStateInfo.SetDepthComparisonFunction( nn::gfx::ComparisonFunction_Less );
    s_AppPipeline.Initialize( &DEMODevice );

    // Setup the viewport to cover the whole color buffer
    DEMOGfxSetViewportScissorState( &s_ViewportScissor, &s_pViewportScissorData,
        0.0f, 0.0f,
        static_cast< float >( DEMOColorBufferInfo.GetWidth() ),
        static_cast< float >( DEMOColorBufferInfo.GetHeight() ),
        0.0f, 1.0f,
        static_cast< float >( DEMOColorBufferInfo.GetHeight() ), false );

    return 1;
}

// Draws the on-screen text overlay (title and frame counter) into the
// default render target.
static void PrintInfo()
{
    DEMOGfxSetDefaultRenderTarget();
    DEMOGfxSetDefaultViewportScissor();

    // Set Demo Font String
    DEMOFontPrintf( 2, 1, "<Lattice Display List>" );
    // s_Ticks is unsigned, so format it with %u rather than %d
    DEMOFontPrintf( 3, 2, "- frames:%u", s_Ticks );

}

// Task callback that records the per-frame setup display list: clears the
// color and depth buffers and binds the render targets, viewport, pipeline,
// static uniforms and vertex buffer.
//   data : pointer to the DrawTaskInfo describing this task
// Posts the task to s_MessageQueue when recording is finished and returns 0.
static u32 FrameInit( void* data )
{
    DrawTaskInfo* const pTask = reinterpret_cast< DrawTaskInfo* >( data );
    DisplayListInfo* const pSlot = pTask->pDLInfo;
    nn::gfx::CommandBuffer* const pCmd = &pSlot->cb;

    nn::gfx::ColorTargetView* pColorTarget = DEMOGetColorBufferView();

    // Give the command buffer this slot's command memory and begin recording
    pCmd->AddCommandMemory( pCommandMemory->GetPool(),
        pCommandMemory->GetBaseOffset() + pSlot->offset, pSlot->maxSize );
    pCmd->Begin();

    // Clear color to dark gray and depth/stencil to 1.0 / 0
    pCmd->ClearColor( pColorTarget, 0.1f, 0.1f, 0.1f, 1.0f, NULL );
    pCmd->ClearDepthStencil( &DEMODepthBufferView, 1.0f, 0, nn::gfx::DepthStencilClearMode_DepthStencil, NULL );

#if NN_GFX_IS_TARGET_GX
    GX2SetShaderMode( GX2_SHADER_MODE_UNIFORM_BLOCK );
#endif
    // Render targets
    const nn::gfx::ColorTargetView* colorTargets[] = { pColorTarget };
    pCmd->SetRenderTargets( 1, colorTargets, &DEMODepthBufferView );

    pCmd->SetViewportScissorState( &s_ViewportScissor );

    // Pipeline
    pCmd->SetPipeline( &s_AppPipeline.pipeline );

    // View/projection matrices
    pCmd->SetConstantBuffer( s_StaticMatrixLoc, nn::gfx::ShaderStage_Vertex,
        s_StaticUniforms.gpuAddress, s_StaticUniforms.size );

    // Interleaved position + color vertex data
    pCmd->SetVertexBuffer( BUFFER_IDX, s_ModelVertexBuffer.gpuAddress,
        MODEL_VTX_STRIDE, s_ModelVertexBuffer.size );

    // DL Info: recording is complete
    pCmd->End();

    // Make the recorded data visible, then report completion
    nn::os::FenceMemoryAnyAny();
    nn::os::SendMessageQueue( &s_MessageQueue, reinterpret_cast< uintptr_t >( &pTask->tag ) );

    return 0;
}

// Task callback that records the draws for one X column of the lattice.
// For every (y, z) cell in the column it writes the cell's model matrix into
// the mapped dynamic uniform buffer, binds it and issues one indexed draw.
//   data : pointer to the DrawTaskInfo describing this task
// Posts the task to s_MessageQueue when recording is finished and returns 0.
static u32 DrawModel( void* data )
{
    DrawTaskInfo* const pTask = reinterpret_cast< DrawTaskInfo* >( data );
    DisplayListInfo* const pSlot = pTask->pDLInfo;
    nn::gfx::CommandBuffer* const pCmd = &pSlot->cb;

    // Each draw task owns a run of 11 * 16 uniform slots; task 0 is FrameInit
    u32 uniformIdx = 11 * 16 * ( pTask->tag - 1 );
    const s32 columnX = pTask->xOffset;

    // First matrix of this task's run within the mapped uniform buffer
    DynamicMatrix* pCellMatrix = reinterpret_cast< DynamicMatrix* >(
        nn::util::BytePtr( pTask->pMatrix, s_AlignedDynamicSize * uniformIdx ).Get() );

    // Give the command buffer this slot's command memory and begin recording
    pCmd->AddCommandMemory( pCommandMemory->GetPool(),
        pCommandMemory->GetBaseOffset() + pSlot->offset, pSlot->maxSize );
    pCmd->Begin();

    // The buffer half in use does not change while tasks are recording
    const int frameSlot = s_FrameCount % 2;

    for ( s32 cellY = -10 * STRUT_LN; cellY < STRUT_LN; cellY += STRUT_LN )
    {
        for ( s32 cellZ = STRUT_LN; cellZ > -15 * STRUT_LN; cellZ -= STRUT_LN )
        {
            // Sanity check
            ASSERT( uniformIdx < static_cast< u32 >( pSlot->maxDynamicUniforms ) );

            // Write this cell's model matrix
            ModelTick( pCellMatrix->modelMtx44, columnX, cellY, cellZ );
#if NN_GFX_IS_TARGET_GX
            GX2EndianSwap( pCellMatrix, sizeof( DynamicMatrix ) );
#endif
            // Bind the matrix as the dynamic uniform block
            const UniformView& view = pSlot->pView[ uniformIdx ];
            pCmd->SetConstantBuffer( s_DynamicMatrixLoc, nn::gfx::ShaderStage_Vertex,
                view.gpuAddress[ frameSlot ], view.size );

            // Draw one lattice structure
            pCmd->DrawIndexed( nn::gfx::PrimitiveTopology_TriangleList, nn::gfx::IndexFormat_Uint32,
                s_ModelIndexBuffer.gpuAddress, MODEL_IDX_COUNT, 0 );

            // Advance to the next uniform slot and its matrix
            uniformIdx++;
            pCellMatrix = reinterpret_cast< DynamicMatrix* >(
                nn::util::BytePtr( pCellMatrix, s_AlignedDynamicSize ).Get() );
        }
    }

    pCmd->End();

    // Make the recorded data visible, then report completion
    nn::os::FenceMemoryAnyAny();
    nn::os::SendMessageQueue( &s_MessageQueue, reinterpret_cast< uintptr_t >( &pTask->tag ) );

    return 0;
}

// Waits until numTasks tasks have reported completion, unmaps the current
// frame's dynamic uniform buffer, then submits the tasks' command buffers to
// the queue in task order, each with its slot's fence.
static void SubmitDLs( u32 numTasks )
{
    // One message arrives per finished task; the payload itself is not used
    for ( u32 received = 0; received < numTasks; received++ )
    {
        uintptr_t completedTask;
        nn::os::ReceiveMessageQueue( &completedTask, &s_MessageQueue );
    }

    // All matrices for this frame have been written
    s_DynamicUniforms[ s_FrameCount % 2 ].Unmap();

    // Submit in task order so the frame setup list runs before the draws
    for ( u32 task = 0; task < numTasks; task++ )
    {
        DisplayListInfo* const pSlot = s_TaskInfo[ task ].pDLInfo;
        DEMOQueue.ExecuteCommand( &pSlot->cb, &pSlot->fence );
    }
}

// The draw function for the rendering portions of this app
static int SceneDraw()
{
    u32 taskNum = 0;
    s32 x = 0;
    DisplayListInfo* pModelDLInfo;

    DEMOGfxBeforeRender();

    // Stop the default command buffer
    DEMOCommandBuffer.End();
    DEMOQueue.ExecuteCommand( &DEMOCommandBuffer, NULL );

    // Reset Task Queue
    s_NextTask = 0;

    pModelDLInfo = GetModelDL();

    s_TaskInfo[ taskNum ].tag = taskNum;
    s_TaskInfo[ taskNum ].xOffset = x;
    s_TaskInfo[ taskNum ].animTransZ = s_AnimTranZ;
    s_TaskInfo[ taskNum ].pDLInfo = pModelDLInfo;
    s_TaskInfo[ taskNum ].Callback = FrameInit;
    taskNum++;
    nn::os::FenceMemoryAnyAny();
    nn::os::ReleaseSemaphore( &s_TaskQueueSemaphore );

    // Draw Model
    DynamicMatrix* pMatrix = s_DynamicUniforms[ s_FrameCount % 2 ].Map< DynamicMatrix >();

    for ( x = -10 * STRUT_LN; x < 2 * STRUT_LN; x += STRUT_LN )
    {
        pModelDLInfo = GetModelDL();

        s_TaskInfo[ taskNum ].tag = taskNum;
        s_TaskInfo[ taskNum ].xOffset = x;
        s_TaskInfo[ taskNum ].animTransZ = s_AnimTranZ;
        s_TaskInfo[ taskNum ].pDLInfo = pModelDLInfo;
        s_TaskInfo[ taskNum ].Callback = DrawModel;
        s_TaskInfo[ taskNum ].pMatrix = pMatrix;
        taskNum++;

        nn::os::FenceMemoryAnyAny();
        nn::os::ReleaseSemaphore( &s_TaskQueueSemaphore );
    }

    DEMOAssert( taskNum < NUM_TASKS );

    // DL Info: Execute the display list by placing it directly in the
    // graphics ring buffer
    SubmitDLs( taskNum );

    // Begin recording for the default command buffer
    DEMOCommandBuffer.Begin();

    // Draw Infomation
    PrintInfo();
    DEMOGfxDoneRender();

    // Update Animation
    AnimTick();

    s_FrameCount++;

    return 1; // 0 makes it exit
}

void GfxMain( void* )
{
    int argc = nnt::GetHostArgc();
    char** argv = nnt::GetHostArgv();

    DEMOTestInit( argc, argv );
    DEMOGfxInit( argc, argv );
    DEMOFontInit();
    SceneInit();

    nn::os::AwaitBarrier( &s_Rendezvous );

    while ( DEMOIsRunning() )
    {
        SceneDraw();
    }

    // Wake-up any waiters in SubMain and tell them to exit
    s_DemoIsRunning = false;
    nn::os::FenceMemoryAnyAny();
    nn::os::ReleaseSemaphore( &s_TaskQueueSemaphore );
    nn::os::ReleaseSemaphore( &s_TaskQueueSemaphore );

    // Free shaders/pipeline
    s_AppPipeline.Finalize( &DEMODevice );

    // Free the vertex/index buffer
    s_ModelVertexBuffer.Finalize();
    s_ModelIndexBuffer.Finalize();

    // Free the static uniform
    s_StaticUniforms.Finalize();

    // Free the viewport
    s_ViewportScissor.Finalize( &DEMODevice );
    DEMOGfxFreeMEM2( s_pViewportScissorData );

    // Free the command buffers
    for ( int i = 0; i < NUM_MODEL_DLS; i++ )
    {
        s_ModelDLInfo[ i ].fence.Finalize( &DEMODevice );
        s_ModelDLInfo[ i ].cb.Finalize( &DEMODevice );
        delete[] s_ModelDLInfo[ i ].pView;
    }
    s_DynamicUniforms[ 0 ].Finalize();
    s_DynamicUniforms[ 1 ].Finalize();

    pCommandMemory->Finalize();
    DEMOGfxFreeMEM2( s_pControlMemory );

    DEMOFontShutdown();
    DEMOTestShutdown();
    DEMOGfxShutdown();

    DEMOPrintf( "GX2 shutdown; GX2 main core id: %d\n", nn::os::GetCurrentCoreNumber() );

    // Sync across cores before returning
    // (required for proper, orderly shutdown)
    nn::os::AwaitBarrier( &s_Rendezvous );
}

// Worker loop run on the non-graphics threads.
//   arg : pointer to the int core id assigned to this worker (used only for
//         the start-up message)
// After the start-up barrier, repeatedly waits on the task semaphore, claims
// the next task index under s_TaskQueueMutex and runs the task's callback.
// Exits once s_DemoIsRunning is cleared, then waits at the shutdown barrier.
static void SubMain( void* arg )
{
    // The callers pass the address of an int, so read it as one; this also
    // matches the %d conversion below
    const int* pCoreID = static_cast< const int* >( arg );
    DEMOPrintf( "SubMain started.  CoreId=%d\n", *pCoreID );

    nn::os::AwaitBarrier( &s_Rendezvous );

    while ( s_DemoIsRunning )
    {
        // Check the task semaphore and get the next id
        nn::os::AcquireSemaphore( &s_TaskQueueSemaphore );

        // We may be woken up at some point to exit
        if ( !s_DemoIsRunning )
        {
            break;
        }

        // Get the next task ID
        nn::os::LockMutex( &s_TaskQueueMutex );
        int currentTask = s_NextTask;
        s_NextTask++;
        nn::os::FenceMemoryAnyAny();
        nn::os::UnlockMutex( &s_TaskQueueMutex );

        // Run the task; the callback posts to s_MessageQueue when it is done
        s_TaskInfo[ currentTask ].Callback( &s_TaskInfo[ currentTask ] );
    }

    // Sync across cores before returning
    // (required for proper, orderly shutdown)
    nn::os::AwaitBarrier( &s_Rendezvous );
}

// Records the core the caller is running on as the main core and assigns the
// two sub cores to the remaining members of cores 0, 1 and 2. If the main
// core is neither 0 nor 1, the sub cores are 0 and 1.
static void SetupCoreId()
{
    const int currentCore = nn::os::GetCurrentCoreNumber();
    s_MainCoreId = currentCore;

    if ( currentCore == 0 )
    {
        s_SubCoreId1 = 1;
        s_SubCoreId2 = 2;
    }
    else if ( currentCore == 1 )
    {
        s_SubCoreId1 = 0;
        s_SubCoreId2 = 2;
    }
    else
    {
        s_SubCoreId1 = 0;
        s_SubCoreId2 = 1;
    }

    DEMOPrintf( "MainCoreId=%d Subs=%d and %d\n",
        s_MainCoreId, s_SubCoreId1, s_SubCoreId2 );
}

//extern "C" void nnMain()
// Test entry point.
// Sets up the synchronization objects, starts the graphics thread (GfxMain)
// and one worker thread (SubMain), runs a second worker on the calling
// thread, and tears everything down once all three have finished.
TEST(GfxLatticeMp, Run)
{
    nn::Result res;

    // Init DEMO lib
    DEMOInit();

    // Setup the core IDs
    SetupCoreId();

    // WaitThread, YieldThread, SleepThread, ChangeThreadPriority(HighestThreadPriority,LowestThreadPriority)
    nn::os::ThreadType* pThreadType = nn::os::GetCurrentThread();

    // Setup communication between processors
    nn::os::InitializeMessageQueue( &s_MessageQueue, s_MessageQueueBuffer, NUM_TASKS + 1 );
    nn::os::InitializeSemaphore( &s_TaskQueueSemaphore, 0, NUM_TASKS );
    nn::os::InitializeMutex( &s_TaskQueueMutex, false, nn::os::MutexLockLevelMin );
    nn::os::InitializeMutex( &s_OutOfMemoryCallbackMutex, false, nn::os::MutexLockLevelMin );

    // Setup s_Rendezvous for now assume 3 processors
    // (GfxMain, the SubMain thread and SubMain on this thread)
    nn::os::InitializeBarrier( &s_Rendezvous, 3 );

    // Both created threads get a priority value one above the calling
    // thread's, clamped to the valid priority range
    int mainCoreThreadPriority = nn::os::GetThreadPriority( pThreadType );
    int subThreadPriority = std::max( std::min( nn::os::LowestThreadPriority, mainCoreThreadPriority + 1 ), nn::os::HighestThreadPriority );

    // Create threads: the graphics thread on sub core 1, a worker on sub core 2
    res = nn::os::CreateThread( &s_DefaultThreadSub1, GfxMain, &s_SubCoreId1, s_pSub1Stack, APP_STACK_SIZE, subThreadPriority, s_SubCoreId1 );
    DEMOAssert( res.IsSuccess() );
    res = nn::os::CreateThread( &s_DefaultThreadSub2, SubMain, &s_SubCoreId2, s_pSub2Stack, APP_STACK_SIZE, subThreadPriority, s_SubCoreId2 );
    DEMOAssert( res.IsSuccess() );
    // res is otherwise unreferenced when DEMOAssert compiles to nothing
    NN_UNUSED( res );

    // First, launch main graphics thread
    nn::os::StartThread( &s_DefaultThreadSub1 );
    nn::os::StartThread( &s_DefaultThreadSub2 );

    // This thread acts as the second worker
    SubMain( &s_MainCoreId );

    // nn::os::BarrierType
    // nn::os::FenceMemory...
    // nn::os::MessageQueue, Mutex

    // Make sure the threads have exited before finalizing the barrier,
    // then release the thread objects themselves
    nn::os::WaitThread( &s_DefaultThreadSub1 );
    nn::os::WaitThread( &s_DefaultThreadSub2 );
    nn::os::DestroyThread( &s_DefaultThreadSub1 );
    nn::os::DestroyThread( &s_DefaultThreadSub2 );

    // Finalize every synchronization object initialized above
    nn::os::FinalizeBarrier( &s_Rendezvous );
    nn::os::FinalizeSemaphore( &s_TaskQueueSemaphore );
    nn::os::FinalizeMutex( &s_TaskQueueMutex );
    nn::os::FinalizeMutex( &s_OutOfMemoryCallbackMutex );
    nn::os::FinalizeMessageQueue( &s_MessageQueue );

    DEMOShutdown();

    SUCCEED();
}
