﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <g3ddemo_GPUMetric.h>
#include <cafe/gx2/gx2Constant.h>
#include <cafe/gx2/gx2PerfEnum.h>
#include <g3ddemo_DemoUtility.h>
#include <nw/g3d/fnd/g3d_GfxManage.h>
#include <nw/g3d/ut/g3d_Inlines.h>

#include <limits.h>

#define GX2_PERF_DATA_RESERVED_SIZE 0x8a0

namespace nw { namespace g3d { namespace demo {
namespace
{
#ifdef _WIN32
    // Stand-in definition used on Win32 builds so that sizeof(GX2PerfData)
    // is available there; it is nothing more than an opaque byte array of
    // GX2_PERF_DATA_RESERVED_SIZE bytes.
    struct GX2PerfData
    {
        u8 reserved[GX2_PERF_DATA_RESERVED_SIZE];
    };
#else
// Queries one per-frame result from a perf context and stores it at pResult.
//
// perfData : perf context to query.
// type     : kind of value requested (forwarded to GX2PerfGetResultByFrame).
// id       : identifier of the value requested (forwarded unchanged).
// pResult  : destination; it is handed to GX2 as a GX2MetricResult*, so it
//            must point at storage that may be written through that type.
void GetResultInternal(
    GX2PerfData * perfData,
    GX2PerfType type,
    u32 id,
    void* pResult)
{
    GX2MetricResult* const pMetric = static_cast<GX2MetricResult*>(pResult);
    const GX2Boolean succeeded = GX2PerfGetResultByFrame(perfData, type, id, pMetric);

    // A failed query is silently ignored; the diagnostic is left disabled.
    // OSReport("gx2 perf failed\n");
    (void)succeeded;
}
#endif
}

// Allocates and initializes the two perf contexts used for double buffering.
//
// A single allocation large enough for two GX2PerfData objects is made:
// m_pData[0] is the base of that allocation and m_pData[1] is the second
// object, sizeof(GX2PerfData) bytes further on. On non-Win32 builds each
// context is initialized with one tag and the TAGS_ACCUMULATE collection
// method. Finally both slots are reset through Clear().
//
// Must not be called while the contexts are already set up (asserted).
void GPUMetric::Setup()
{
    NW_G3D_ASSERT(m_pData[0] == NULL && m_pData[1] == NULL);
    m_pData[0] = static_cast<GX2PerfData*>(AllocMem2(sizeof(GX2PerfData) * 2, GX2_DEFAULT_BUFFER_ALIGNMENT));
    NW_G3D_ASSERT(m_pData[0] != NULL);

    // The offset has to be applied to the allocated buffer (m_pData[0]).
    // Passing the m_pData array itself would offset from the address of the
    // member array and leave the second context pointing outside the buffer.
    m_pData[1] = nw::g3d::AddOffset(m_pData[0], sizeof(GX2PerfData));

#ifndef _WIN32
    static const u32 PERF_MAX_TAGS = 1;
    MEMInitAllocatorForDefaultHeap(&m_Allocator);
    for (int i = 0; i < 2; ++i)
    {
        GX2PerfData* data = static_cast<GX2PerfData*>(m_pData[i]);
        GX2PerfInit(data, PERF_MAX_TAGS, &m_Allocator);
        GX2PerfSetCollectionMethod(data, GX2_PERF_COLLECT_TAGS_ACCUMULATE);
    }
#endif

    for (int i = 0; i < 2; ++i)
    {
        Clear(i);
    }
}

// Releases both perf contexts and the memory backing them.
//
// On non-Win32 builds GX2PerfFree() is called for each context first. Only
// m_pData[0] is handed to FreeMem2(): Setup() makes one allocation sized for
// two GX2PerfData objects and stores its base address there. Both pointers
// are reset to NULL afterwards.
void GPUMetric::Cleanup()
{
#ifndef _WIN32
    for (int slot = 0; slot < 2; ++slot)
    {
        GX2PerfFree(static_cast<GX2PerfData*>(m_pData[slot]));
    }
#endif

    FreeMem2(m_pData[0]);

    m_pData[1] = NULL;
    m_pData[0] = NULL;
}

void GPUMetric::Begin()
{
#ifndef _WIN32
    GX2PerfData* data = static_cast<GX2PerfData*>(m_pData[m_CurrentPerf]);
    if(m_FrameCount[m_CurrentPerf] <= 0)
    {
        m_CurrentPerf = !m_CurrentPerf;

        Clear(m_CurrentPerf);
        data = static_cast<GX2PerfData*>(m_pData[m_CurrentPerf]);
        m_FrameCount[m_CurrentPerf] = GX2PerfGetNumPasses(data);
        GX2PerfFrameStart(data);
    }
    GX2PerfPassStart(data);
    GX2PerfTagStart(data, 0);
#endif
}

// Closes the measurement pass opened by Begin() (no-op on Win32 builds).
//
// Tag 0 and the pass are ended on the current context. If this was the last
// remaining pass (counter == 1) the perf frame is ended as well. Afterwards
// the counters of BOTH contexts are decremented, so a finished context keeps
// counting below zero; Calc() relies on that to detect "finished N calls ago".
void GPUMetric::End()
{
#ifndef _WIN32
    GX2PerfData* data = static_cast<GX2PerfData*>(m_pData[m_CurrentPerf]);
    GX2PerfTagEnd(data, 0);
    GX2PerfPassEnd(data);

    if(m_FrameCount[m_CurrentPerf] == 1)
    {
        // PassStart/PassEnd have now been issued once per required pass,
        // so the perf frame is closed here.
        GX2PerfFrameEnd(data);
    }

    for (int i = 0; i < 2; ++i)
    {
        // Clear() parks an idle counter at -INT_MAX. Decrementing it on every
        // call would step past INT_MIN on the second call, which is signed
        // overflow (undefined behaviour). Saturate at INT_MIN instead; Begin()
        // (<= 0) and Calc() (== 0 / == -1) see no difference.
        if (m_FrameCount[i] > INT_MIN)
        {
            --m_FrameCount[i];
        }
    }
#endif
}

// Returns the m_IsCalcComplete flag: it is set to true by Calc() once it has
// fetched results and set back to false by Clear().
bool GPUMetric::IsCalcComplete()
{
    return m_IsCalcComplete;
}

void GPUMetric::Calc(bool useDisplayList)
{
    (void)useDisplayList;
#ifndef _WIN32
    int calcFrame = useDisplayList ? !m_CurrentPerf : m_CurrentPerf;
    int delayFrame = useDisplayList ? 1 : 0;
    if(m_FrameCount[calcFrame] + delayFrame != 0)
    {
        return;
    }
    m_IsCalcComplete = true;

    GX2PerfData* data = static_cast<GX2PerfData*>(m_pData[calcFrame]);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_TIME                    , &m_Result.m_Time);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_TIME                    , &m_Result.m_Time);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_GPU_TIME                , &m_Result.m_GPUTime);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GPU_BUSY                , &m_Result.m_GPUBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_SHADER_BUSY             , &m_Result.m_ShaderBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_REUSED_INDICES_VS       , &m_Result.m_ReusedIndices);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_SHADER_BUSY_VS          , &m_Result.m_VS.m_Busy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_SHADER_BUSY_GS          , &m_Result.m_GS.m_Busy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_SHADER_BUSY_PS          , &m_Result.m_FS.m_Busy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_ALU_BUSY                , &m_Result.m_ALUBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_TEX_BUSY                , &m_Result.m_TexBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_VS_VERTICES_IN          , &m_Result.m_VSVerticesIn);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_VS_TEX_INST_COUNT       , &m_Result.m_VS.m_TexInstCount);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_VS_TEX_BUSY             , &m_Result.m_VS.m_TexBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_VS_ALU_INST_COUNT       , &m_Result.m_VS.m_ALUInstCount);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_VS_ALU_BUSY             , &m_Result.m_VS.m_ALUBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_VS_ALU_EFFICIENCY       , &m_Result.m_VS.m_ALUEfficiency);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_VS_ALU_TEX_RATIO        , &m_Result.m_VS.m_ALUTexRatio);

    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GS_TEX_INST_COUNT       , &m_Result.m_GS.m_TexInstCount);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GS_TEX_BUSY             , &m_Result.m_GS.m_TexBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GS_ALU_INST_COUNT       , &m_Result.m_GS.m_ALUInstCount);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GS_ALU_BUSY             , &m_Result.m_GS.m_ALUBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GS_ALU_EFFICIENCY       , &m_Result.m_GS.m_ALUEfficiency);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_GS_ALU_TEX_RATIO        , &m_Result.m_GS.m_ALUTexRatio);

    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PRIMITIVE_ASSEMBLY_BUSY , &m_Result.m_PABusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PRIMITIVES_IN           , &m_Result.m_PAIn);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PA_STALLED_ON_RASTERIZER, &m_Result.m_PAStalledOnRas);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_INTERP_BUSY             , &m_Result.m_InterpolatorBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PS_PIXELS_IN            , &m_Result.m_FSPixelsIn);

    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_TEX_INST_COUNT       , &m_Result.m_FS.m_TexInstCount);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_TEX_BUSY             , &m_Result.m_FS.m_TexBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_ALU_INST_COUNT       , &m_Result.m_FS.m_ALUInstCount);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_ALU_BUSY             , &m_Result.m_FS.m_ALUBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_ALU_EFFICIENCY       , &m_Result.m_FS.m_ALUEfficiency);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_ALU_TEX_RATIO        , &m_Result.m_FS.m_ALUTexRatio);

    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PS_PIXELS_OUT           , &m_Result.m_FSPixelsOut);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_PS_EXPORT_STALLS        , &m_Result.m_FSExportStall);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_TEX_UNIT_BUSY           , &m_Result.m_TexUnitBusy);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_TEXEL_FETCH_COUNT       , &m_Result.m_TexelFetch);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_TEX_CACHE_STALLED       , &m_Result.m_TexCacheStall);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_TEX_MISS_RATE           , &m_Result.m_TexMissRate);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_TEX_MEM_BYTES_READ      , &m_Result.m_TexReadMemByte);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_DEPTH_STENCIL_TEST_BUSY , &m_Result.m_DepthStencilTestBusy);

    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_HIZ_TRIVIAL_ACCEPT      , &m_Result.m_HiZTrivialAccept);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_HIZ_REJECT              , &m_Result.m_HiZReject);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PRE_Z_SAMPLES_PASSING   , &m_Result.m_PreZSamplePassing);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PRE_Z_SAMPLES_FAILING_S , &m_Result.m_PreZSampleFailingS);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PRE_Z_SAMPLES_FAILING_Z , &m_Result.m_PreZSampleFailingZ);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_POST_Z_SAMPLES_PASSING  , &m_Result.m_PostZSamplePassing);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_POST_Z_SAMPLES_FAILING_S, &m_Result.m_PostZSampleFailingS);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_POST_Z_SAMPLES_FAILING_Z, &m_Result.m_PostZSampleFailingZ);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_F32_Z_UNIT_STALLED          , &m_Result.m_ZUnitStalled);

    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PIXELS_AT_CB            , &m_Result.m_ColorPixelsWritten);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PIXELS_CB_MEM_WRITTEN   , &m_Result.m_ColorWriteByte);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_IA_VERTICES             , &m_Result.m_IAVertices);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_IA_PRIMITIVES           , &m_Result.m_IAPrimitive);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_VS_INVOCATIONS          , &m_Result.m_VSInvocations);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_GS_INVOCATIONS          , &m_Result.m_GSInvocations);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_GS_PRIMITIVES           , &m_Result.m_GSPrimitives);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_C_INVOCATIONS           , &m_Result.m_CInvocations);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_C_PRIMITIVES            , &m_Result.m_CPrimitives);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PS_INVOCATIONS          , &m_Result.m_FSInvocations);
    GetResultInternal(data, GX2_PERF_TYPE_GPU_METRIC, GX2_PERF_U64_PA_INPUT_PRIM           , &m_Result.m_FSInputPrimitive);
#endif
}

void GPUMetric::Print()
{
    nw::g3d::DebugPrint("GPU Metric\n");

    nw::g3d::DebugPrint( "Time(%%)   > GPU [%8.3f]/All [%8.3f]=Busy[%8.3f]\n",
        100.f * static_cast< f32 >( m_Result.m_GPUTime / ( 16666.666666f ) ),
        100.f * static_cast< f32 >( m_Result.m_Time    / ( 16666.666666f ) ),
        m_Result.m_GPUBusy
        );
    nw::g3d::DebugPrint( "Busy(%%)   > Shdr[%8.3f](ALU [%8.3f] Tex [%8.3f])Intp[%8.3f] PmAs[%8.3f] Tex [%8.3f] DpSt[%8.3f]\n",
        m_Result.m_ShaderBusy, m_Result.m_ALUBusy, m_Result.m_TexBusy, m_Result.m_InterpolatorBusy, m_Result.m_PABusy, m_Result.m_TexUnitBusy, m_Result.m_DepthStencilTestBusy
        );
    nw::g3d::DebugPrint( "Stall(%%)  > PARs[%8.3f] TexC[%8.3f] ZUnt[%8.3f] FSEp[%8.3f] \n",
        m_Result.m_PAStalledOnRas, m_Result.m_TexCacheStall, m_Result.m_ZUnitStalled, m_Result.m_FSExportStall
        );
    nw::g3d::DebugPrint( "VtxShader > Busy[%8.3f] Tex [%8d] TexB[%8.3f] ALU [%8d] ALUB[%8.3f] ALUE[%8.3f] ALUT[%8.3f]\n",
        m_Result.m_VS.m_Busy, static_cast< u32 >( m_Result.m_VS.m_TexInstCount ), m_Result.m_VS.m_TexBusy, static_cast< u32 >( m_Result.m_VS.m_ALUInstCount ), m_Result.m_VS.m_ALUBusy, m_Result.m_VS.m_ALUEfficiency, m_Result.m_VS.m_ALUTexRatio
        );
    nw::g3d::DebugPrint( "GeomShader> Busy[%8.3f] Tex [%8d] TexB[%8.3f] ALU [%8d] ALUB[%8.3f] ALUE[%8.3f] ALUT[%8.3f]\n",
        m_Result.m_GS.m_Busy, static_cast< u32 >( m_Result.m_GS.m_TexInstCount ), m_Result.m_GS.m_TexBusy, static_cast< u32 >( m_Result.m_GS.m_ALUInstCount ), m_Result.m_GS.m_ALUBusy, m_Result.m_GS.m_ALUEfficiency, m_Result.m_GS.m_ALUTexRatio
        );
    nw::g3d::DebugPrint( "FragShader> Busy[%8.3f] Tex [%8d] TexB[%8.3f] ALU [%8d] ALUB[%8.3f] ALUE[%8.3f] ALUT[%8.3f]\n",
        m_Result.m_FS.m_Busy, static_cast< u32 >( m_Result.m_FS.m_TexInstCount ), m_Result.m_FS.m_TexBusy, static_cast< u32 >( m_Result.m_FS.m_ALUInstCount ), m_Result.m_FS.m_ALUBusy, m_Result.m_FS.m_ALUEfficiency, m_Result.m_FS.m_ALUTexRatio
        );
    nw::g3d::DebugPrint( "Pipeline  > Vtx [%8d]=InVS[%8d]+Reus[%8d] InPA[%8d] InFS[%8d] TxFc[%8d] OutF[%8d]\n",
        static_cast< u32 >( m_Result.m_IAVertices ), static_cast< u32 >( m_Result.m_VSVerticesIn ), static_cast< u32 >( m_Result.m_ReusedIndices ),
        static_cast< u32 >( m_Result.m_PAIn ),
        static_cast< u32 >( m_Result.m_FSPixelsIn ), static_cast< u32 >( m_Result.m_TexelFetch ), static_cast< u32 >( m_Result.m_FSPixelsOut )
        );
    nw::g3d::DebugPrint( "Depth/Stcl> PreP[%8d] PreF[%8d] PstP[%8d] PstF[%8d] HiZA[%8.3f] HiZR[%8.3f]\n",
        static_cast< u32 >( m_Result.m_PreZSamplePassing ), static_cast< u32 >( m_Result.m_PreZSampleFailingZ ),
        static_cast< u32 >( m_Result.m_PostZSamplePassing ), static_cast< u32 >( m_Result.m_PostZSampleFailingZ ),
        m_Result.m_HiZTrivialAccept, m_Result.m_HiZReject
        );
    nw::g3d::DebugPrint( "Pipeline  > IAPr[%8d] GSPr[%8d] VSIv[%8d] PSIv[%8d] GSIv[%8d] CIv [%8d] CPrm[%8d]\n",
        static_cast< u32 >( m_Result.m_IAPrimitive ), static_cast< u32 >( m_Result.m_GSPrimitives ),
        static_cast< u32 >( m_Result.m_VSInvocations ), static_cast< u32 >( m_Result.m_FSInvocations ), static_cast< u32 >( m_Result.m_GSInvocations ),
        static_cast< u32 >( m_Result.m_CInvocations ),  static_cast< u32 >( m_Result.m_CPrimitives )
        );
    nw::g3d::DebugPrint( "Memory    > TxR[%9d](Miss[%8.3f])Pix[%9d] CbW[%9d]\n",
        static_cast< u32 >( m_Result.m_TexReadMemByte ), m_Result.m_TexMissRate, static_cast< u32 >( m_Result.m_ColorPixelsWritten ), static_cast< u32 >( m_Result.m_ColorWriteByte )
        );
}

void GPUMetric::Clear(int index)
{
    (void)index;
    memset(&m_Result, 0, sizeof(GPUMetricData));
    m_IsCalcComplete = false;
    m_FrameCount[index] = -INT_MAX;

#ifndef _WIN32
    GX2PerfData* data = static_cast<GX2PerfData*>(m_pData[index]);
    GX2PerfMetricsClear(data);
    GX2PerfTagEnable(data, 0, GX2_ENABLE);

    for(int i = GX2_PERF_FIRST; i <= GX2_PERF_LAST; ++i)
    {
        // Clearしたあとは必ずEnable
        GX2PerfMetric metric = static_cast<GX2PerfMetric>(i);
        bool gx2PerfEnabled = GX2PerfMetricEnable(data, GX2_PERF_TYPE_GPU_METRIC, metric);
        NW_G3D_ASSERTMSG(gx2PerfEnabled, "GX2PerfMetricEnable failed");
    }
#endif
}

}}} // namespace nw::g3d::demo
