﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#if defined(NNT_GFX_UTIL_ENABLE_LOP)

#include <nn/gfx.h>

#if !defined(NN_GFX_IS_TARGET_NVN)
#error This module can only be used for nvn hosts
#endif

#include <nn/nn_Assert.h>
#include <nn/nn_Log.h>
#include <nns/gfx/gfx_GraphicsFramework.h>

#include <nvn/nvn.h>
#include <nvn/nvn_FuncPtrInline.h>

#include "testGfxUtil_LopIntegration.h"

#include "gfxUtilGpuBenchmark_ResourceAllocator.h"

#include <nvperfapi_host.h>
#include <nvperfapi_target.h>
#include <nvperfapi_nvn_lopc.h>
#include <nvperfapi_metrics.h>
#include <nvperfapi_realtime_metrics.h>
#include <gm204_RealtimeMetricsRaw.h>
#include <gm20b_RealtimeMetricsRaw.h>

namespace profiler {

// CPU-side bookkeeping for reading back the perfmon buffer.
// StartSession() clears the whole struct to zero with memset.
struct ProfilerHwState
{
    size_t frameNumber;             // not referenced in the part of the file reviewed here
    size_t perfmonBufferOffset;     // read offset into the perfmon buffer; wraps to 0 when it reaches the buffer size
    size_t perfmonBytesValid;       // total number of unacknowledged perfmon bytes, pending in the buffer
    size_t perfmonBytesDecoding;    // number of bytes currently reported by GetNextPerfmonMemory()
};

// Chip families distinguished when selecting a metric table (see GetHostChipTypeFromName).
enum HostChipType
{
    HostChipType_GM20B,         // chip name "GM20B"
    HostChipType_GM20X,         // chip names "GM200", "GM204" and "GM206"
    HostChipType_Unsupported,   // any other chip name
};

// Complete state of one profiler instance: graphics handles, the PerfWorks images and
// buffers allocated through pResourceAllocator, session configuration, and the tables
// used to turn decoded counters into metric results.
struct ProfilerState
{
    // Handles supplied by Profiler::Initialize(); not owned by this struct.
    nn::gfx::Device* pDevice;
    nn::gfx::Queue* pQueue;
    nnt::gfx::util::ResourceAllocator* pResourceAllocator;

    // NOTE(review): frameDelay / frameCount and the nextSubmitFrame ring below are only
    // initialized in the visible part of this file; their consumers are outside this view.
    static const int frameDelay = 2;
    int frameCount = 0;
    HostChipType hostChipType;

    static const int NextSubmitFrameArraySize = frameDelay + 1;
    int nextSubmitFrameArray[NextSubmitFrameArraySize];    // every entry is set to -1 by Profiler::Initialize()
    int nextSubmitFrameArrayPushIndex;
    int nextSubmitFrameArrayPopIndex;

    // Config image produced by CreateConfigImage().
    size_t configImageSize = 0;
    uint8_t* pConfigImageMemory = nullptr;

    // Counter data prefix produced by CreateCounterDataPrefix().
    size_t counterDataImagePrefixSize = 0;
    uint8_t* pCounterDataImagePrefixMemory = nullptr;

    // Counter data image produced by InitializeCounterData().
    size_t counterDataImageSize = 0;
    uint8_t* pCounterDataImageMemory = nullptr;

    // Scratch buffer produced by CreateCounterDataScratchBuffer().
    size_t counterDataScratchBufferSize = 0;
    uint8_t* pCounterDataScratchBufferMemory = nullptr;

    // Trace arena: backing memory plus the memory pool created over it in CreateSession().
    size_t traceArenaSize = 0;
    uint8_t* pTraceArenaMemory = nullptr;
    nn::gfx::MemoryPool traceArenaMemoryPool;

    // Compute arena: backing memory plus the memory pool created over it in CreateSession().
    size_t computeArenaSize = 0;
    uint8_t* pComputeArenaMemory = nullptr;
    nn::gfx::MemoryPool computeArenaMemoryPool;

    // Perfmon buffer; CreateSession() leaves it null with size 0 when _WIN32 is defined.
    size_t perfmonBufferSize = 0;
    uint8_t* perfmonBufferMemory = nullptr;


    // Filled in by CreateSession() and handed to NVPA_NVNC_BeginSession() by StartSession().
    NVPA_NVNC_SessionOptions sessionOptions;
    // Initialized by StartSession().
    NVPA_NVNC_PassDescriptor passDescriptor;

    // Pass counts reported by NVPA_NVNC_Config_GetNumPasses() in StartSession().
    size_t numPipelinedPasses = 0;
    size_t numIsolatedPasses = 0;

    // Set this to the swap chain size; for instance, a double buffered application would use 2.
    static const int NumTraceBuffers = 4;

    // upper bound on number of ranges - too low and data will be dropped during collection
    static const int MaxNumRanges = 16;

    // If using a PushRangeDynamic for your range markers, the range descriptions must be stored
    // after each corresponding record. This provides storage for the dynamic strings in the buffer
    // If only using PushRangeStatic, this can be set to 0
    static const int MaxRangeNameLength = 64;

    // NOTE(review): not written or read in the visible part of this file.
    NVPA_Bool allPassesSubmitted;

    ProfilerHwState profilerHwState;

    // result related data
    // Metric descriptor table and raw metric id table for the detected chip (set by FillRequestArray()).
    const nv::metrics::MetricDesc* pPlatformMetricDescArray = nullptr;
    int platformMetricDescCount = 0;

    const uint64_t* pPlatformMetricIdArray = nullptr;
    int platformMetricIdCount = 0;

    // Metric requests built by FillRequestArray(); requestPlatformMetricIndexArray[i] is the
    // index into pPlatformMetricDescArray of the metric behind requestArray[i].
    static const int requestArrayMaxCount = 512;
    NVPA_RawMetricRequest requestArray[requestArrayMaxCount];
    int requestPlatformMetricIndexArray[requestArrayMaxCount];
    int requestArrayCount = 0;

    // Range description strings collected by FetchMetricValues().
    static const int rangeNameArrayMaxCount = 16;
    const char* rangeNameArray[rangeNameArrayMaxCount];
};

namespace {

// Instantiate one raw-metric descriptor table per supported chip family.
// NV_METRICS_CLASS is defined immediately before each NVPA_DEFINE_METRIC_DESCS expansion and
// undefined right after it, so each expansion sees its own RawMetrics class.
// Presumably each expansion defines the named array plus a matching "<name>_count" symbol;
// FillRequestArray() consumes g_MetricsGM204 / g_MetricsGM204_count and
// g_MetricsGM20B / g_MetricsGM20B_count.
#define NV_METRICS_CLASS nv::metrics::gm204::RawMetrics
NVPA_DEFINE_METRIC_DESCS(NV_GM204_RAW_METRIC_DESCS, g_MetricsGM204);
#undef NV_METRICS_CLASS

#define NV_METRICS_CLASS nv::metrics::gm20b::RawMetrics
NVPA_DEFINE_METRIC_DESCS(NV_GM20B_RAW_METRIC_DESCS, g_MetricsGM20B);
#undef NV_METRICS_CLASS


// Returns the chip name of the GPU the profiler runs on.
// With _WIN32 defined the name of device 0 is queried from PerfWorks; otherwise the
// fixed string "GM20B" is returned.
const char* GetHostChipName()
{
#if defined(_WIN32)
    NVPA_Status result;

    // At least one device must be present.
    size_t deviceCount;
    result = NVPA_GetDeviceCount(&deviceCount);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
    NN_ASSERT(deviceCount > 0);

    // Only the first device is considered.
    const size_t firstDeviceIndex = 0;
    const char* pNameOfDevice = nullptr;
    const char* pNameOfChip = nullptr;
    result = NVPA_Device_GetNames(firstDeviceIndex, &pNameOfDevice, &pNameOfChip);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    return pNameOfChip;
#else
    // the SDEV has a gm20b GPU
    return "GM20B";
#endif

}

// Maps a chip name string to the chip family used to pick a metric table.
// "GM20B" maps to HostChipType_GM20B; "GM200", "GM204" and "GM206" map to
// HostChipType_GM20X; every other name yields HostChipType_Unsupported.
// The comparison is exact and case-sensitive (strcmp).
HostChipType GetHostChipTypeFromName(const char* pHostChipName)
{
    if (strcmp(pHostChipName, "GM20B") == 0)
    {
        return HostChipType_GM20B;
    }

    static const char* const gm20xChipNames[] = { "GM200", "GM204", "GM206" };
    for (const char* pCandidateName : gm20xChipNames)
    {
        if (strcmp(pHostChipName, pCandidateName) == 0)
        {
            return HostChipType_GM20X;
        }
    }

    return HostChipType_Unsupported;
}

// Builds the PerfWorks config image for the metrics listed in requestArray.
//   pProfilerState - receives configImageSize / pConfigImageMemory; supplies the
//                    request list and the allocator.
//   pChipName      - chip name forwarded to the raw metrics config.
// The image memory is obtained from pResourceAllocator with 16-byte alignment and is
// later handed to the session as its configuration.
void CreateConfigImage(
    ProfilerState* pProfilerState,
    const char* pChipName)
{
    ProfilerState& state = *pProfilerState;
    NVPA_Status result;

    // Describe the config: profiler activity on the given chip.
    NVPA_RawMetricsConfigOptions options = { NVPA_RAW_METRICS_CONFIG_OPTIONS_STRUCT_SIZE };
    options.activityKind = NVPA_ACTIVITY_KIND_PROFILER;
    options.pChipName = pChipName;

    NVPA_RawMetricsConfig* pConfig = nullptr;
    result = NVPA_RawMetricsConfig_Create(&options, &pConfig);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // Register every requested metric inside a single pass group.
    NVPA_RawMetricsPassGroupOptions passGroupOptions = { NVPA_RAW_METRICS_PASS_GROUP_OPTIONS_STRUCT_SIZE };
    result = NVPA_RawMetricsConfig_BeginPassGroup(pConfig, &passGroupOptions);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
    result = NVPA_RawMetricsConfig_AddMetrics(pConfig, state.requestArray, state.requestArrayCount);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
    result = NVPA_RawMetricsConfig_EndPassGroup(pConfig);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    result = NVPA_RawMetricsConfig_GenerateConfigImage(pConfig);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // First query: no output buffer, only the required size is returned.
    state.configImageSize = 0;
    result = NVPA_RawMetricsConfig_GetConfigImage(pConfig, 0, nullptr, &state.configImageSize);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // Second query: copy the image into freshly allocated memory.
    void* pImageMemory = state.pResourceAllocator->AllocateMemory(state.configImageSize, 16);
    state.pConfigImageMemory = static_cast<uint8_t*>(pImageMemory);
    result = NVPA_RawMetricsConfig_GetConfigImage(
        pConfig, state.configImageSize, state.pConfigImageMemory, nullptr);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // The builder object is no longer needed once the image has been copied out.
    NVPA_RawMetricsConfig_Destroy(pConfig);
}

// Builds the counter data prefix for the metrics listed in requestArray.
//   pProfilerState - receives counterDataImagePrefixSize / pCounterDataImagePrefixMemory;
//                    supplies the request list and the allocator.
//   pChipName      - chip name forwarded to the counter data builder.
// The prefix is later used by InitializeCounterData() to create the full counter data image.
void CreateCounterDataPrefix(
    ProfilerState* pProfilerState,
    const char* pChipName)
{
    ProfilerState& state = *pProfilerState;
    NVPA_Status result;

    NVPA_CounterDataBuilderOptions builderOptions = { NVPA_COUNTER_DATA_BUILDER_OPTIONS_STRUCT_SIZE };
    builderOptions.pChipName = pChipName;

    NVPA_CounterDataBuilder* pBuilder = nullptr;
    result = NVPA_CounterDataBuilder_Create(&builderOptions, &pBuilder);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    result = NVPA_CounterDataBuilder_AddMetrics(pBuilder, state.requestArray, state.requestArrayCount);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // First query: no output buffer, only the required size is returned.
    state.counterDataImagePrefixSize = 0;
    result = NVPA_CounterDataBuilder_GetCounterDataPrefix(
        pBuilder, 0, nullptr, &state.counterDataImagePrefixSize);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    void* pPrefixMemory = state.pResourceAllocator->AllocateMemory(state.counterDataImagePrefixSize, 16);
    state.pCounterDataImagePrefixMemory = static_cast<uint8_t*>(pPrefixMemory);

    // Second query: copy the prefix and confirm the written size matches the first answer.
    size_t writtenSize = 0;
    result = NVPA_CounterDataBuilder_GetCounterDataPrefix(
        pBuilder, state.counterDataImagePrefixSize, state.pCounterDataImagePrefixMemory, &writtenSize);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
    NN_ASSERT(writtenSize == state.counterDataImagePrefixSize);

    NVPA_CounterDataBuilder_Destroy(pBuilder);
}

// Allocates and initializes the counter data image from the previously built prefix.
// Writes counterDataImageSize / pCounterDataImageMemory in *pProfilerState.
// Range limits come from ProfilerState::MaxNumRanges and ProfilerState::MaxRangeNameLength.
void InitializeCounterData(ProfilerState* pProfilerState)
{
    ProfilerState& state = *pProfilerState;
    NVPA_Status result;

    // Image layout parameters; ResetCounterDataImage() uses the same values.
    NVPA_NVNC_CounterDataImageOptions imageOptions = { NVPA_NVNC_COUNTER_DATA_IMAGE_OPTIONS_STRUCT_SIZE };
    imageOptions.pCounterDataPrefix = state.pCounterDataImagePrefixMemory;
    imageOptions.counterDataPrefixSize = state.counterDataImagePrefixSize;
    imageOptions.maxNumRanges = ProfilerState::MaxNumRanges;
    imageOptions.maxNumRangeTreeNodes = ProfilerState::MaxNumRanges;
    imageOptions.maxRangeNameLength = ProfilerState::MaxRangeNameLength;

    state.counterDataImageSize = 0;
    result = NVPA_NVNC_CalculateCounterDataImageSize(&imageOptions, &state.counterDataImageSize);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    void* pImageMemory = state.pResourceAllocator->AllocateMemory(state.counterDataImageSize, 16);
    state.pCounterDataImageMemory = static_cast<uint8_t*>(pImageMemory);

    result = NVPA_NVNC_InitializeCounterDataImage(
        &imageOptions, state.counterDataImageSize, state.pCounterDataImageMemory);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
}

// Re-initializes the existing counter data image in place, without reallocating it.
// The image options are rebuilt with the same values InitializeCounterData() uses, and the
// recomputed size is asserted to match the size of the already allocated image before
// NVPA_NVNC_InitializeCounterDataImage() is run over pCounterDataImageMemory again.
void ResetCounterDataImage(ProfilerState* pProfilerState)
{
    NVPA_Status status;

    // for shared configuration between iterations
    NVPA_NVNC_CounterDataImageOptions counterDataImageOptions = { NVPA_NVNC_COUNTER_DATA_IMAGE_OPTIONS_STRUCT_SIZE };
    counterDataImageOptions.pCounterDataPrefix = pProfilerState->pCounterDataImagePrefixMemory;
    counterDataImageOptions.counterDataPrefixSize = pProfilerState->counterDataImagePrefixSize;
    counterDataImageOptions.maxNumRanges = ProfilerState::MaxNumRanges;
    counterDataImageOptions.maxNumRangeTreeNodes = ProfilerState::MaxNumRanges;
    counterDataImageOptions.maxRangeNameLength = ProfilerState::MaxRangeNameLength;

    size_t counterDataImageSize = 0;
    status = NVPA_NVNC_CalculateCounterDataImageSize(&counterDataImageOptions, &counterDataImageSize);
    NN_ASSERT(status == NVPA_STATUS_SUCCESS);

    // The existing allocation is reused, so the required size must not have changed.
    NN_ASSERT(pProfilerState->counterDataImageSize == counterDataImageSize);

    status = NVPA_NVNC_InitializeCounterDataImage(
        &counterDataImageOptions,
        pProfilerState->counterDataImageSize,
        pProfilerState->pCounterDataImageMemory);
    NN_ASSERT(status == NVPA_STATUS_SUCCESS);
}

// Allocates and initializes the scratch buffer that accompanies the counter data image.
// Writes counterDataScratchBufferSize / pCounterDataScratchBufferMemory in *pProfilerState.
// Must run after the counter data image exists, because both PerfWorks calls take it as input.
void CreateCounterDataScratchBuffer(ProfilerState* pProfilerState)
{
    ProfilerState& state = *pProfilerState;
    NVPA_Status result;

    state.counterDataScratchBufferSize = 0;
    result = NVPA_NVNC_CalculateCounterDataScratchBufferSize(
        state.pCounterDataImageMemory, &state.counterDataScratchBufferSize);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    void* pScratchMemory = state.pResourceAllocator->AllocateMemory(state.counterDataScratchBufferSize, 16);
    state.pCounterDataScratchBufferMemory = static_cast<uint8_t*>(pScratchMemory);

    result = NVPA_NVNC_InitializeCounterDataScratchBuffer(
        state.pCounterDataImageMemory,
        state.counterDataScratchBufferSize,
        state.pCounterDataScratchBufferMemory);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
}

// Allocates the trace arena, compute arena and perfmon buffer, then fills in
// pProfilerState->sessionOptions so that StartSession() can pass it to NVPA_NVNC_BeginSession().
// This function does not begin the session itself.
// Requires pDevice, pResourceAllocator and the config image (pConfigImageMemory /
// configImageSize) to be set in *pProfilerState beforehand.
void CreateSession(ProfilerState* pProfilerState)
{
    // NVPA_NVN_TRACE_BUFFER_PAD_SIZE - PerfWorks defined pad size to avoid complicated boundary checks on the gpu
    // 2x - each range will write a start and end record
    const size_t traceBufferSize =
        NVPA_NVN_TRACE_BUFFER_PAD_SIZE +
        (ProfilerState::MaxNumRanges * (2 * NVPA_NVN_TRACE_RECORD_SIZE + ProfilerState::MaxRangeNameLength));

    // Trace arena: NumTraceBuffers trace buffers in one CPU-uncached / GPU-cached memory pool.
    {
        nn::gfx::MemoryPoolInfo traceArenaMemoryPoolInfo;
        traceArenaMemoryPoolInfo.SetDefault();
        traceArenaMemoryPoolInfo.SetMemoryPoolProperty(
            nn::gfx::MemoryPoolProperty_CpuUncached | nn::gfx::MemoryPoolProperty_GpuCached);

        size_t alignment = nn::gfx::MemoryPool::GetPoolMemoryAlignment(
            pProfilerState->pDevice, traceArenaMemoryPoolInfo);
        size_t granularity = nn::gfx::MemoryPool::GetPoolMemorySizeGranularity(
            pProfilerState->pDevice, traceArenaMemoryPoolInfo);

        // Round the arena size up to the pool size granularity reported by the device.
        pProfilerState->traceArenaSize = nn::util::align_up(ProfilerState::NumTraceBuffers * traceBufferSize, granularity);
        pProfilerState->pTraceArenaMemory =
            static_cast<uint8_t*>(pProfilerState->pResourceAllocator->AllocateMemory(pProfilerState->traceArenaSize, alignment));

        traceArenaMemoryPoolInfo.SetPoolMemory(pProfilerState->pTraceArenaMemory, pProfilerState->traceArenaSize);

        pProfilerState->traceArenaMemoryPool.Initialize(
            pProfilerState->pDevice, traceArenaMemoryPoolInfo);
    }

    // upper bound to the number of dispatches in the frame.
    const size_t maxDispatchesPerFrame = 8;
    const size_t computeBufferSize =
        NVPA_NVN_COMPUTE_BUFFER_PAD_SIZE + maxDispatchesPerFrame * NVPA_NVN_COMPUTE_RECORD_SIZE;

    // Compute arena: same construction as the trace arena, one compute buffer per trace buffer.
    {
        nn::gfx::MemoryPoolInfo computeArenaMemoryPoolInfo;
        computeArenaMemoryPoolInfo.SetDefault();
        computeArenaMemoryPoolInfo.SetMemoryPoolProperty(
            nn::gfx::MemoryPoolProperty_CpuUncached | nn::gfx::MemoryPoolProperty_GpuCached);

        size_t alignment = nn::gfx::MemoryPool::GetPoolMemoryAlignment(
            pProfilerState->pDevice, computeArenaMemoryPoolInfo);
        size_t granularity = nn::gfx::MemoryPool::GetPoolMemorySizeGranularity(
            pProfilerState->pDevice, computeArenaMemoryPoolInfo);

        pProfilerState->computeArenaSize = nn::util::align_up(ProfilerState::NumTraceBuffers * computeBufferSize, granularity);
        pProfilerState->pComputeArenaMemory =
            static_cast<uint8_t*>(pProfilerState->pResourceAllocator->AllocateMemory(pProfilerState->computeArenaSize, alignment));

        computeArenaMemoryPoolInfo.SetPoolMemory(pProfilerState->pComputeArenaMemory, pProfilerState->computeArenaSize);

        pProfilerState->computeArenaMemoryPool.Initialize(pProfilerState->pDevice, computeArenaMemoryPoolInfo);
    }


    // NOTE(review): numPerfmons is only declared when _WIN32 or __HOS__ is defined, while the
    // buffer allocation below uses a plain #else; any other target fails to compile here.
#if defined(_WIN32)
    // This is an upper bound for Maxwell architecture GPUs.
    const size_t numPerfmons = 100;
#elif defined(__HOS__)
    // The Tegra TX1 has 8 perfmon units.
    const size_t numPerfmons = 8;
#endif

    // 2x - mirrors the start/end record pairing used for the trace buffer size above.
    const size_t perfmonBufferMinSize = ProfilerState::MaxNumRanges * numPerfmons * (2 * NVPA_NVN_PERFMON_RECORD_SIZE);

#if defined(_WIN32)
    // No perfmon buffer is supplied by the application on this path.
    pProfilerState->perfmonBufferSize = 0;
    pProfilerState->perfmonBufferMemory = nullptr;
#else
    // the size must be aligned to NVN_MEMORY_POOL_STORAGE_ALIGNMENT;
    pProfilerState->perfmonBufferSize = nn::util::align_up(perfmonBufferMinSize, NVN_MEMORY_POOL_STORAGE_ALIGNMENT);
    pProfilerState->perfmonBufferMemory =
        reinterpret_cast<uint8_t*>(pProfilerState->pResourceAllocator->AllocateMemory(
            pProfilerState->perfmonBufferSize, NVN_MEMORY_POOL_STORAGE_ALIGNMENT));
#endif

    // Start from value-initialized options, then fill in every field this integration uses.
    pProfilerState->sessionOptions = NVPA_NVNC_SessionOptions();
    pProfilerState->sessionOptions.structSize = NVPA_NVNC_SESSION_OPTIONS_STRUCT_SIZE;

    // set activity kind so PerfWorks can set up gpu into the right mode
    pProfilerState->sessionOptions.activityKind = NVPA_ACTIVITY_KIND_PROFILER;

    // must set the configuration so Perfworks knows how to program the gpu for the requested counters
    pProfilerState->sessionOptions.pConfig = pProfilerState->pConfigImageMemory;
    pProfilerState->sessionOptions.configSize = pProfilerState->configImageSize;

    pProfilerState->sessionOptions.numTraceBuffers = ProfilerState::NumTraceBuffers;

    // Trace arena: CPU mapping, GPU address and the underlying NVN memory pool.
    NVNmemoryPool* pTraceArenaNvnMemoryPool =
        nn::gfx::AccessorToData(pProfilerState->traceArenaMemoryPool)->pNvnMemoryPool;
    pProfilerState->sessionOptions.traceBufferSize = traceBufferSize;
    pProfilerState->sessionOptions.pTraceArena =
        reinterpret_cast<uint8_t*>(nvnMemoryPoolMap(pTraceArenaNvnMemoryPool));
    pProfilerState->sessionOptions.traceArenaGpuAddress = nvnMemoryPoolGetBufferAddress(pTraceArenaNvnMemoryPool);
    pProfilerState->sessionOptions.pTraceArenaMemoryPool = pTraceArenaNvnMemoryPool;

    // Compute arena: CPU mapping, GPU address and the underlying NVN memory pool.
    NVNmemoryPool* pComputeArenaNvnMemoryPool =
        nn::gfx::AccessorToData(pProfilerState->computeArenaMemoryPool)->pNvnMemoryPool;
    pProfilerState->sessionOptions.computeBufferSize = computeBufferSize;
    pProfilerState->sessionOptions.pComputeArena =
        reinterpret_cast<uint8_t*>(nvnMemoryPoolMap(pComputeArenaNvnMemoryPool));
    pProfilerState->sessionOptions.computeArenaGpuAddress = nvnMemoryPoolGetBufferAddress(pComputeArenaNvnMemoryPool);
    pProfilerState->sessionOptions.pComputeArenaMemoryPool = pComputeArenaNvnMemoryPool;

    pProfilerState->sessionOptions.pPerfmonBuffer = pProfilerState->perfmonBufferMemory;
    pProfilerState->sessionOptions.perfmonBufferSize = pProfilerState->perfmonBufferSize;

    pProfilerState->sessionOptions.finishOnEndPass = false;

    // DecodeCountersLive() forwards these same nesting values to the decoder.
    pProfilerState->sessionOptions.minNestingLevel = 1;
    pProfilerState->sessionOptions.numNestingLevels = 3;
}

// Begins the PerfWorks session on the profiler's queue using the options prepared by
// CreateSession(), initializes the pass descriptor, queries the pass counts of the
// config image, and clears the perfmon read-back bookkeeping.
void StartSession(ProfilerState* pProfilerState)
{
    ProfilerState& state = *pProfilerState;
    NVPA_Status result;

    result = NVPA_NVNC_BeginSession(
        nn::gfx::AccessorToData(state.pQueue)->pNvnQueue,
        &state.sessionOptions);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // The pass descriptor is a small object that tracks the pass state machine.
    // The state machine advances on PassEnd and restarts by itself.
    state.passDescriptor = NVPA_NVNC_PassDescriptor();
    state.passDescriptor.structSize = NVPA_NVNC_PASS_DESCRIPTOR_STRUCT_SIZE;
    result = NVPA_NVNC_PassDescriptor_Initialize(
        nn::gfx::AccessorToData(state.pQueue)->pNvnQueue,
        &state.passDescriptor);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // Ask how many pipelined / isolated passes the config image needs.
    state.numPipelinedPasses = 0;
    state.numIsolatedPasses = 0;
    result = NVPA_NVNC_Config_GetNumPasses(
        state.pConfigImageMemory,
        &state.numPipelinedPasses,
        &state.numIsolatedPasses);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // Start with zeroed perfmon buffer offsets and byte counters.
    memset(&state.profilerHwState, 0, sizeof(state.profilerHwState));
}

// Ends the PerfWorks session that StartSession() began on the profiler's queue.
void EndSession(ProfilerState* pProfilerState)
{
    NVPA_Status result;

    result = NVPA_NVNC_EndSession(
        nn::gfx::AccessorToData(pProfilerState->pQueue)->pNvnQueue);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);
}


// Creates every PerfWorks data object the session needs, in dependency order.
// The order matters: the counter data image is built from the prefix, the scratch buffer
// is sized from the counter data image, and the session options reference the config image.
void InitializeProfilerData(
    ProfilerState* pProfilerState,
    const char* pChipName)
{
    // Config image for the metrics in requestArray.
    CreateConfigImage(pProfilerState, pChipName);

    // Counter data prefix for the same metric list.
    CreateCounterDataPrefix(pProfilerState, pChipName);

    // Counter data image (needs the prefix), then its scratch buffer (needs the image).
    InitializeCounterData(pProfilerState);
    CreateCounterDataScratchBuffer(pProfilerState);

    // Arenas, perfmon buffer and session options (needs the config image).
    CreateSession(pProfilerState);
}

// Releases everything InitializeProfilerData() allocated, in reverse order of creation.
// Both memory pools are finalized before their backing memory is returned to the
// allocator, and each pointer is reset to nullptr after it is freed.
void FinalizeProfilerData(ProfilerState* pProfilerState)
{
    ProfilerState& state = *pProfilerState;
    nnt::gfx::util::ResourceAllocator* const pAllocator = state.pResourceAllocator;

    // Returns one block to the allocator and clears the pointer that referred to it.
    auto releaseMemory = [pAllocator](uint8_t*& pMemory)
    {
        pAllocator->FreeMemory(pMemory);
        pMemory = nullptr;
    };

    // The perfmon buffer may never have been allocated (see CreateSession).
    if (state.perfmonBufferMemory != nullptr)
    {
        releaseMemory(state.perfmonBufferMemory);
    }

    state.computeArenaMemoryPool.Finalize(state.pDevice);
    releaseMemory(state.pComputeArenaMemory);

    state.traceArenaMemoryPool.Finalize(state.pDevice);
    releaseMemory(state.pTraceArenaMemory);

    releaseMemory(state.pCounterDataScratchBufferMemory);
    releaseMemory(state.pCounterDataImageMemory);
    releaseMemory(state.pCounterDataImagePrefixMemory);
    releaseMemory(state.pConfigImageMemory);
}

// Moves the perfmon read offset forward by perfmonBytesDecoded bytes.
// An offset that lands exactly on perfmonBufferSize wraps to 0; the resulting offset is
// asserted to lie inside the buffer. perfmonBytesDecoding is reset to 0 afterwards.
void AdvancePerfmonBufferOffset(ProfilerHwState* pProfilerHwState, size_t perfmonBytesDecoded, size_t perfmonBufferSize)
{
    size_t advancedOffset = pProfilerHwState->perfmonBufferOffset + perfmonBytesDecoded;
    if (advancedOffset == perfmonBufferSize)
    {
        // Reached the end of the buffer exactly: continue from the start.
        advancedOffset = 0;
    }
    pProfilerHwState->perfmonBufferOffset = advancedOffset;
    NN_ASSERT(pProfilerHwState->perfmonBufferOffset < perfmonBufferSize);

    // The bytes handed out previously have now been accounted for.
    pProfilerHwState->perfmonBytesDecoding = 0;
}

// Callback given to the counter decoder (see DecodeCountersLive) to obtain the next
// contiguous span of perfmon bytes.
//   userdata           - the ProfilerState registered with the decoder; must not be null.
//   ppPerfmonMemory    - receives a pointer to the span inside the perfmon buffer.
//   pPerfmonMemorySize - receives the span length in bytes.
// Always returns true.
// The span handed out on the previous call is consumed first, the bytes newly written by
// the hardware are added to the pending count, and the new span is limited to the bytes
// that remain before the end of the buffer.
NVPA_Bool GetNextPerfmonMemory(void* userdata, const uint8_t** ppPerfmonMemory, size_t* pPerfmonMemorySize)
{
    NVPA_Status result;

    NN_ASSERT(userdata != nullptr);
    ProfilerState* const pState = static_cast<ProfilerState*>(userdata);
    ProfilerHwState& hwState = pState->profilerHwState;
    const NVPA_NVNC_SessionOptions& session = pState->sessionOptions;

    // Step past whatever was reported by the previous call.
    AdvancePerfmonBufferOffset(&hwState, hwState.perfmonBytesDecoding, session.perfmonBufferSize);

    size_t newlyWrittenBytes = 0;
    result = NVPA_NVNC_QueryPerfmonHardwareBytesLastWritten(
        nn::gfx::AccessorToData(pState->pQueue)->pNvnQueue,
        &newlyWrittenBytes);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    hwState.perfmonBytesValid += newlyWrittenBytes;

    // Never report a span that runs past the end of the buffer.
    const size_t bytesUntilBufferEnd = session.perfmonBufferSize - hwState.perfmonBufferOffset;
    if (hwState.perfmonBytesValid < bytesUntilBufferEnd)
    {
        hwState.perfmonBytesDecoding = hwState.perfmonBytesValid;
    }
    else
    {
        hwState.perfmonBytesDecoding = bytesUntilBufferEnd;
    }

    *ppPerfmonMemory = session.pPerfmonBuffer + hwState.perfmonBufferOffset;
    *pPerfmonMemorySize = hwState.perfmonBytesDecoding;
    return true;
}

// Decodes the counters recorded for one trace buffer into the counter data image.
//   pProfilerState   - profiler state holding the images, arenas and session options.
//   traceBufferIndex - selects the trace buffer and the compute buffer inside their arenas.
// Perfmon bytes are pulled through GetNextPerfmonMemory(). After decoding, the consumed
// bytes are acknowledged to the driver and the local offset and pending count are updated.
void DecodeCountersLive(ProfilerState* pProfilerState, size_t traceBufferIndex)
{
    ProfilerState& state = *pProfilerState;
    const NVPA_NVNC_SessionOptions& session = state.sessionOptions;
    NVPA_Status result;

    NVPA_NVNC_DecodeCountersOptions options = {};
    options.structSize = NVPA_NVNC_DECODE_COUNTERS_OPTIONS_STRUCT_SIZE;

    // Images and scratch space created during initialization.
    options.pConfig = state.pConfigImageMemory;
    options.configSize = state.configImageSize;
    options.pCounterDataImage = state.pCounterDataImageMemory;
    options.counterDataImageSize = state.counterDataImageSize;
    options.pCounterDataScratchBuffer = state.pCounterDataScratchBufferMemory;
    options.counterDataScratchBufferSize = state.counterDataScratchBufferSize;

    // Same nesting window as the session.
    options.minNestingLevel = session.minNestingLevel;
    options.numNestingLevels = session.numNestingLevels;

    // Slice of each arena that belongs to the requested buffer index.
    options.pTraceBuffer = state.pTraceArenaMemory + (traceBufferIndex * session.traceBufferSize);
    options.traceBufferSize = session.traceBufferSize;
    options.pComputeBuffer = state.pComputeArenaMemory + (traceBufferIndex * session.computeBufferSize);
    options.computeBufferSize = session.computeBufferSize;

    // Perfmon data is streamed through the callback.
    options.pfnGetNextPerfmonMemory = &GetNextPerfmonMemory;
    options.getNextPerfmonMemoryUserData = pProfilerState;

    NVPA_NVNC_DecodeCountersState decodeResult = {};
    decodeResult.structSize = NVPA_NVNC_DECODE_COUNTERS_STATE_STRUCT_SIZE;

    result = NVPA_NVNC_DecodeCounters(&options, &decodeResult);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    // The decoder cannot have consumed more than was pending.
    NN_ASSERT(decodeResult.perfmonBytesConsumed <= state.profilerHwState.perfmonBytesValid);
    result = NVPA_NVNC_AcknowledgePerfmonHardwareBytesConsumed(
        nn::gfx::AccessorToData(state.pQueue)->pNvnQueue,
        decodeResult.perfmonBytesConsumed);
    NN_ASSERT(result == NVPA_STATUS_SUCCESS);

    AdvancePerfmonBufferOffset(
        &state.profilerHwState,
        decodeResult.perfmonBytesLastDecoded,
        session.perfmonBufferSize);

    state.profilerHwState.perfmonBytesValid -= decodeResult.perfmonBytesConsumed;
}


// Converts the decoded counter data image into per-range metric results.
//   HostGpuEvaluationContext - chip-specific evaluation context type; it must expose the
//                              `values` and `counts` members that the raw metrics are
//                              unpacked into, and is what metricEvalFn receives.
//   pProfilerState  - source of the counter data image, metric tables and request list;
//                     its rangeNameArray is filled with the range description strings.
//   pProfilerResult - receives one sectionResultArray entry per range and
//                     requestArrayCount metricResultArray values per range.
// NOTE(review): the capacities of sectionResultArray and metricResultArray are declared
// outside this file and are not checked here.
template<typename HostGpuEvaluationContext>
void FetchMetricValues(profiler::ProfilerState* pProfilerState, profiler::ProfilerResult* pProfilerResult)
{
    NVPA_Status status;

    // One evaluation context per value of the `isolated` flag (index 0 = false, 1 = true).
    const int isolatedStateCount = 2;
    HostGpuEvaluationContext evaluationContextArray[isolatedStateCount];
    memset(evaluationContextArray, 0xFF,  sizeof(evaluationContextArray));

    NVPA_NVNC_UnpackRawMetricsOptions unpackRawMetricsOptions = { NVPA_NVNC_UNPACK_RAW_METRICS_OPTIONS_STRUCT_SIZE };
    unpackRawMetricsOptions.pCounterDataImage = pProfilerState->pCounterDataImageMemory;
    unpackRawMetricsOptions.rangeIndex = 0; // overwritten for every range below
    unpackRawMetricsOptions.isolated = true; // overwritten for every unpack call below
    unpackRawMetricsOptions.numRawMetrics = pProfilerState->platformMetricIdCount;
    unpackRawMetricsOptions.pRawMetricIds = pProfilerState->pPlatformMetricIdArray;

    size_t numRanges = 0;
    status = NVPA_CounterData_GetNumRanges(pProfilerState->pCounterDataImageMemory, &numRanges);
    NN_ASSERT(status == NVPA_STATUS_SUCCESS);

    // Running write positions inside rangeNameArray and metricResultArray.
    int rangeNameArrayOffset = 0;
    int metricResultArrayOffset = 0;

    for (size_t rangeIndex = 0; rangeIndex < numRanges; ++rangeIndex)
    {
        // First query: how many description strings does this range have?
        size_t numDescriptions = 0;
        status = NVPA_CounterData_GetRangeDescriptions(
            pProfilerState->pCounterDataImageMemory, rangeIndex,
            0, nullptr, &numDescriptions);
        NN_ASSERT(status == NVPA_STATUS_SUCCESS);

        // The descriptions must fit in what is left of rangeNameArray; filling it exactly is fine.
        NN_ASSERT((static_cast<size_t>(rangeNameArrayOffset) + numDescriptions)
            <= static_cast<size_t>(ProfilerState::rangeNameArrayMaxCount));

        // Second query: fetch the strings into the remaining part of rangeNameArray.
        status = NVPA_CounterData_GetRangeDescriptions(
            pProfilerState->pCounterDataImageMemory, rangeIndex,
            ProfilerState::rangeNameArrayMaxCount - rangeNameArrayOffset,
            pProfilerState->rangeNameArray + rangeNameArrayOffset,
            nullptr);
        NN_ASSERT(status == NVPA_STATUS_SUCCESS);

        // Unpack the raw metrics of this range twice: non-isolated and isolated.
        for (int isolated = 0; isolated < isolatedStateCount; ++isolated)
        {
            HostGpuEvaluationContext* pEvaluationContext = &evaluationContextArray[isolated];
            unpackRawMetricsOptions.isolated = static_cast<NVPA_Bool>(isolated);
            unpackRawMetricsOptions.rangeIndex = rangeIndex;
            unpackRawMetricsOptions.pRawMetricValues = pEvaluationContext->values;
            unpackRawMetricsOptions.pHwUnitCounts = pEvaluationContext->counts;
            status = NVPA_NVNC_CounterData_UnpackRawMetrics(&unpackRawMetricsOptions);
            NN_ASSERT(status == NVPA_STATUS_SUCCESS);
        }

        // Evaluate every requested metric against the context matching its isolated flag.
        for (int metricRequestIndex = 0; metricRequestIndex < pProfilerState->requestArrayCount; ++metricRequestIndex)
        {
            const NVPA_RawMetricRequest* pRawMetricRequest = &pProfilerState->requestArray[metricRequestIndex];

            int platformMetricIndex = pProfilerState->requestPlatformMetricIndexArray[metricRequestIndex];
            NN_ASSERT(platformMetricIndex < pProfilerState->platformMetricDescCount);

            // The request and the descriptor it maps to must name the same metric.
            const nv::metrics::MetricDesc* pMetricDesc = &pProfilerState->pPlatformMetricDescArray[platformMetricIndex];
            NN_ASSERT(strcmp(pMetricDesc->pName, pRawMetricRequest->pMetricName) == 0);

            HostGpuEvaluationContext* pEvaluationContext = &evaluationContextArray[pRawMetricRequest->isolated];
            double result = pMetricDesc->metricEvalFn(pEvaluationContext);

            pProfilerResult->metricResultArray[metricResultArrayOffset + metricRequestIndex] = result;
        }

        // Record where this range's names and results live in the shared arrays.
        pProfilerResult->sectionResultArray[rangeIndex].rangeNameArrayOffset = rangeNameArrayOffset;
        pProfilerResult->sectionResultArray[rangeIndex].rangeNameCount = static_cast<int>(numDescriptions);
        pProfilerResult->sectionResultArray[rangeIndex].resultArrayOffset = metricResultArrayOffset;
        pProfilerResult->sectionResultArray[rangeIndex].resultCount = pProfilerState->requestArrayCount;

        metricResultArrayOffset += pProfilerState->requestArrayCount;
        rangeNameArrayOffset += static_cast<int>(numDescriptions);
    }
}

// Selects the metric tables for the given chip family and builds the metric request list.
//   hostChipType   - HostChipType_GM20B or HostChipType_GM20X; any other value reaches
//                    NN_UNEXPECTED_DEFAULT.
//   pProfilerState - receives the descriptor/id tables, requestArray,
//                    requestPlatformMetricIndexArray and requestArrayCount.
// Every platform metric whose name does not contain "_avg" is requested as isolated,
// without instances, until either the table ends or requestArrayMaxCount is reached.
void FillRequestArray(HostChipType hostChipType, ProfilerState* pProfilerState)
{
    ProfilerState& state = *pProfilerState;

    switch (hostChipType)
    {
    case HostChipType_GM20X:
        {
            state.pPlatformMetricDescArray = g_MetricsGM204;
            state.platformMetricDescCount = g_MetricsGM204_count;

            state.pPlatformMetricIdArray = nv::metrics::gm204::GetRawMetricIds();
            state.platformMetricIdCount = nv::metrics::gm204::RawMetricIdx::COUNT;
        }
        break;
    case HostChipType_GM20B:
        {
            state.pPlatformMetricDescArray = g_MetricsGM20B;
            state.platformMetricDescCount = g_MetricsGM20B_count;

            state.pPlatformMetricIdArray = nv::metrics::gm20b::GetRawMetricIds();
            state.platformMetricIdCount = nv::metrics::gm20b::RawMetricIdx::COUNT;
        }
        break;
    default:
        NN_UNEXPECTED_DEFAULT;
    }

    int requestCount = 0;
    for (int platformIndex = 0;
        (requestCount < ProfilerState::requestArrayMaxCount)
            && (platformIndex < state.platformMetricDescCount);
        ++platformIndex)
    {
        const nv::metrics::MetricDesc& metricDesc = state.pPlatformMetricDescArray[platformIndex];

        // Metrics with "_avg" in their name are not requested.
        if (strstr(metricDesc.pName, "_avg") != nullptr)
        {
            continue;
        }

        NVPA_RawMetricRequest& request = state.requestArray[requestCount];
        request.structSize = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
        request.pMetricName = metricDesc.pName;
        request.isolated = true;
        request.keepInstances = false;

        // Remember which descriptor this request came from for result evaluation.
        state.requestPlatformMetricIndexArray[requestCount] = platformIndex;

        ++requestCount;
    }

    state.requestArrayCount = requestCount;
}

} // anonymous namespace

// Performs the one-time PerfWorks library setup: host side, target side, then the
// NVN driver hook, in that order.
// NOTE(review): the results of these three calls are not inspected here — confirm whether
// they report a status that should be checked like the other NVPA calls in this file.
void InitializeLopLibrary()
{
    NVPA_InitializeHost();
    NVPA_InitializeTarget();
    NVPA_NVNC_LoadDriver();
}


// Opens a named debug group on the command buffer.
// A fragment-ordering barrier is recorded before the group is pushed.
//   pCommandBuffer - nn::gfx command buffer to record into.
//   groupName      - name passed to nvnCommandBufferPushDebugGroup.
void PushProfilerDebugGroup(nn::gfx::CommandBuffer* pCommandBuffer, const char* groupName)
{
    nn::gfx::CommandBuffer::DataType& commandBufferData = nn::gfx::AccessorToData(pCommandBuffer);
    NVNcommandBuffer* const pNvnCommandBuffer = commandBufferData.pNvnCommandBuffer;

    nvnCommandBufferBarrier(pNvnCommandBuffer, NVNbarrierBits::NVN_BARRIER_ORDER_FRAGMENTS_BIT);
    nvnCommandBufferPushDebugGroup(pNvnCommandBuffer, groupName);
}

// Closes the most recently pushed debug group on pCommandBuffer.
// A fragment-ordering barrier is recorded on the underlying NVN command buffer
// immediately before the group is popped.
void PopProfilerDebugGroup(nn::gfx::CommandBuffer* pCommandBuffer)
{
    nn::gfx::CommandBuffer::DataType& commandBufferData =
        nn::gfx::AccessorToData(pCommandBuffer);
    auto& nvnCommandBuffer = commandBufferData.pNvnCommandBuffer;

    nvnCommandBufferBarrier(nvnCommandBuffer, NVNbarrierBits::NVN_BARRIER_ORDER_FRAGMENTS_BIT);
    nvnCommandBufferPopDebugGroup(nvnCommandBuffer);
}

// Presumably meant to reset the options to their default values (going by the name).
// The body is currently empty, so calling it changes nothing.
void ProfilerOptions::SetDefaults()
{
}

// Creates the profiler state and starts a profiling session.
//
// pDevice, pQueue and pResourceAllocator are stored in the state for later use;
// pResourceAllocator also provides the memory for the state itself.
// profilerOptions is currently unused.
//
// Returns false, without creating any state, when the host chip is not supported
// or when the memory for the state cannot be allocated. Returns true otherwise.
bool Profiler::Initialize(
    nn::gfx::Device* pDevice, nn::gfx::Queue* pQueue,
    nnt::gfx::util::ResourceAllocator* pResourceAllocator,
    const ProfilerOptions& profilerOptions)
{
    NN_UNUSED(g_MetricsGM204);
    NN_UNUSED(g_MetricsGM204_count);
    NN_UNUSED(g_MetricsGM20B);
    NN_UNUSED(g_MetricsGM20B_count);
    NN_UNUSED(profilerOptions);

    const char* pChipName = GetHostChipName();
    HostChipType hostChipType = GetHostChipTypeFromName(pChipName);

    if (hostChipType == HostChipType_Unsupported)
    {
        NN_LOG("Chip %s is not supported\n", pChipName);
        return false;
    }

    // Check the allocation before constructing into it: placement new on a null
    // pointer is not valid.
    void* pStateMemory = pResourceAllocator->AllocateMemory(sizeof(ProfilerState), 16);
    if (pStateMemory == nullptr)
    {
        NN_LOG("Failed to allocate %d bytes for the profiler state\n",
            static_cast<int>(sizeof(ProfilerState)));
        return false;
    }

    m_pState = new (pStateMemory) ProfilerState();

    m_pState->pDevice = pDevice;
    m_pState->pQueue = pQueue;
    m_pState->pResourceAllocator = pResourceAllocator;
    m_pState->frameCount = 0;
    m_pState->hostChipType = hostChipType;

    // Start with an empty ring of pending result frames: every slot is marked
    // with -1 and the push and pop positions coincide.
    for (int i = 0; i < ProfilerState::NextSubmitFrameArraySize; ++i)
    {
        m_pState->nextSubmitFrameArray[i] = -1;
    }
    m_pState->nextSubmitFrameArrayPushIndex = 0;
    m_pState->nextSubmitFrameArrayPopIndex = 0;

    // generate list of metric requests
    FillRequestArray(hostChipType, m_pState);

    InitializeProfilerData(m_pState, pChipName);

    StartSession(m_pState);

    return true;
}

// Shuts the profiler down: ends the session, releases the profiler data, then
// destroys the ProfilerState and returns its memory to the allocator stored in it.
// m_pState is reset to nullptr afterwards, and a call made while m_pState is
// nullptr (for example a second Finalize()) does nothing.
// NOTE(review): confirm that m_pState is also nullptr before the first successful
// Initialize(), so that Finalize() after a failed Initialize() is covered as well.
void Profiler::Finalize()
{
    if (m_pState == nullptr)
    {
        return;
    }

    EndSession(m_pState);
    FinalizeProfilerData(m_pState);

    // Copy the allocator pointer out first: it is a member of the object being destroyed.
    nnt::gfx::util::ResourceAllocator* pResourceAllocator = m_pState->pResourceAllocator;
    m_pState->~ProfilerState();
    pResourceAllocator->FreeMemory(m_pState);
    m_pState = nullptr;
}

// Begins a profiling pass on the NVN queue behind pQueue, using the pass
// descriptor held in the profiler state. Asserts that the library call succeeds.
void Profiler::BeginPass(nn::gfx::Queue* pQueue)
{
    // Clear the flag that EndPass() hands to the library as its output.
    m_pState->allPassesSubmitted = false;

    const nn::gfx::Queue::DataType& queueData = nn::gfx::AccessorToData(pQueue);

    NVPA_Status status;
    status = NVPA_NVNC_BeginPass(queueData.pNvnQueue, &m_pState->passDescriptor);
    NN_ASSERT(status == NVPA_STATUS_SUCCESS);
}

// Ends the profiling pass on the NVN queue behind pQueue.
// The library writes its "all passes submitted" result into the profiler state,
// where Update() reads it. Asserts that the library call succeeds.
void Profiler::EndPass(nn::gfx::Queue* pQueue)
{
    const nn::gfx::Queue::DataType& queueData = nn::gfx::AccessorToData(pQueue);

    NVPA_Status status;
    status = NVPA_NVNC_EndPass(
        queueData.pNvnQueue,
        &m_pState->passDescriptor,
        &m_pState->allPassesSubmitted);
    NN_ASSERT(status == NVPA_STATUS_SUCCESS);
}

// Forwards to ResetCounterDataImage() with the profiler state.
// Update() calls this after it has fetched a new set of metric values.
void Profiler::ResetCounterValues()
{
    ResetCounterDataImage(m_pState);
}

// Advances the profiler by one frame.
//
// Each call:
//  1. increments the frame counter;
//  2. once the counter has reached frameDelay, decodes the trace buffer that sits
//     frameDelay slots behind the current trace buffer index;
//  3. if all passes have been submitted, queues "current frame + frameDelay" as the
//     frame on which the result is to be fetched;
//  4. if the oldest queued frame is the current one, fetches the metric values
//     into pProfilerResult and resets the counter values.
//
// Returns ProfilerUpdateResult_NewResultReady when step 4 fetched a result,
// ProfilerUpdateResult_MorePassRequired otherwise.
ProfilerUpdateResult Profiler::Update(ProfilerResult* pProfilerResult)
{
    ProfilerState& state = *m_pState;

    // The counter is incremented before it is read, so the first call sees frame 1.
    state.frameCount++;
    const int currentFrame = state.frameCount;

    if (currentFrame >= ProfilerState::frameDelay)
    {
        size_t currentTraceBuffer;
        NVPA_Status status;

        status = NVPA_NVNC_GetCurrentTraceBufferIndex(
            nn::gfx::AccessorToData(state.pQueue)->pNvnQueue,
            &currentTraceBuffer);
        NN_ASSERT(status == NVPA_STATUS_SUCCESS);

        // Step back by frameDelay slots; the buffer count is added first and the
        // sum is then wrapped back into the valid index range.
        const int traceBufferCount = ProfilerState::NumTraceBuffers;
        const int steppedBackIndex =
            (static_cast<int>(currentTraceBuffer) + traceBufferCount - ProfilerState::frameDelay);
        const int decodeIndex = steppedBackIndex % ProfilerState::NumTraceBuffers;
        DecodeCountersLive(&state, decodeIndex);
    }

    auto& pushIndex = state.nextSubmitFrameArrayPushIndex;
    auto& popIndex = state.nextSubmitFrameArrayPopIndex;

    if (state.allPassesSubmitted)
    {
        // Remember on which frame this result has to be collected.
        state.nextSubmitFrameArray[pushIndex] = currentFrame + ProfilerState::frameDelay;
        pushIndex = (pushIndex + 1) % ProfilerState::NextSubmitFrameArraySize;

        // The push position catching up with the pop position means the ring overflowed.
        NN_ASSERT(pushIndex != popIndex);
    }

    // A result is due only when the ring is not empty and its oldest entry is this frame.
    const bool isResultDue =
        (pushIndex != popIndex)
        && (state.nextSubmitFrameArray[popIndex] == currentFrame);

    if (!isResultDue)
    {
        return ProfilerUpdateResult_MorePassRequired;
    }

    popIndex = (popIndex + 1) % ProfilerState::NextSubmitFrameArraySize;

    switch (state.hostChipType)
    {
    case HostChipType_GM20B:
        FetchMetricValues<nv::metrics::gm20b::EvaluationContext>(&state, pProfilerResult);
        break;
    case HostChipType_GM20X:
        FetchMetricValues<nv::metrics::gm204::EvaluationContext>(&state, pProfilerResult);
        break;
    default:
        NN_UNEXPECTED_DEFAULT;
    }

    ResetCounterValues();

    return ProfilerUpdateResult_NewResultReady;
}

// Returns the number of metric requests stored in the profiler state
// (requestArrayCount), which is the exclusive upper bound for the index
// accepted by GetMetricName().
int Profiler::GetMetricCount() const
{
    return m_pState->requestArrayCount;
}

// Returns the name of the metric request at metricIndex.
// metricIndex must be in [0, GetMetricCount()); both bounds are asserted, since a
// negative index would otherwise read before the start of the request array.
// The returned pointer is the pMetricName stored in the request entry.
const char* Profiler::GetMetricName(int metricIndex) const
{
    NN_ASSERT(metricIndex >= 0);
    NN_ASSERT(metricIndex < m_pState->requestArrayCount);
    return m_pState->requestArray[metricIndex].pMetricName;
}


} // namespace profiler

#endif // defined(NNT_GFX_UTIL_ENABLE_LOP)
