﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <cstring>
#include <cstdio>
#include <new>

#include <nn/nn_Common.h>

NN_PRAGMA_PUSH_WARNINGS
#pragma GCC diagnostic ignored "-Wsign-conversion"
#include <nn/nn_Abort.h>
#include <nn/os/os_Event.h>
#include <nn/os/os_ThreadApi.h>
NN_PRAGMA_POP_WARNINGS

#include "profiler_PerfCounterGroups.h"
#include "profiler_PerfCounterThread.h"

#include "profiler_Comms.h"
#include "profiler_Defines.h"
#include "profiler_Logging.h"
#include "profiler_Memory.h"

namespace nn { namespace profiler { namespace pmu {

//-------------------------------------------------------------------
// Typedefs
//-------------------------------------------------------------------


// todo: Fill this with real interrupts
//nn::svc::Interrupt s_perfOverflowCoreInterrupts[] =
//{
//    nn::dd::CTR::INTERRUPT_PMIRQ_0,
//    nn::dd::CTR::INTERRUPT_PMIRQ_1
//#if NN_VERSION_MAJOR >= 10
//    , /* In SDK 10.0.0 we got access to performance counter overflow events for cores 2 and 3 */
//    nn::dd::CTR::INTERRUPT_PMIRQ_2,
//    nn::dd::CTR::INTERRUPT_PMIRQ_3
//#endif
//};

// PMU register accessors.
//
// Two implementations of the same macro set are provided:
//   - AArch32: registers are reached through p15 coprocessor instructions
//     (mrc / mcr, and mrrc for the 64-bit cycle counter).
//   - otherwise: registers are reached by name (mrs, or the
//     __builtin_arm_rsr / __builtin_arm_wsr compiler builtins).
//
// READ_* macros store the register value into the lvalue passed as `storage`.
// WRITE_* macros and SOFTWARE_INCREMENT take the value to write.
// For the *N macros, `n` is pasted into the instruction / register name with
// NN_MACRO_STRINGIZE, so it has to be a literal counter index rather than a
// runtime variable.
#ifdef NN_OS_CPU_ARM_AARCH32
// Reads event counter n; the value is narrowed to uint32_t.
#define READ_PMEVCNTRN(storage, n) do \
        { \
            uintptr_t tempreg; \
            asm volatile("mrc p15,0,%0,c14,c8," NN_MACRO_STRINGIZE(n) : "=r"(tempreg) : : "memory"); \
            (storage) = static_cast<uint32_t>(tempreg); \
        } while (NN_STATIC_CONDITION(false))

// Reads the cycle counter as two 32-bit halves and combines them into 64 bits.
#define READ_PMCCNTR(storage) do \
        { \
            uintptr_t hi, lo; \
            asm volatile("mrrc p15,0,%0,%1,c9" : "=r"(lo),"=r"(hi) : : "memory"); \
            (storage) = static_cast<uint64_t>(hi) << 32 | lo; \
        } while (NN_STATIC_CONDITION(false))

// Reads the PMU control register (PMCR).
#define READ_PMCR(storage) do \
        { \
            uintptr_t tempreg; \
            asm volatile("mrc p15,0,%0,c9,c12,0" : "=r"(tempreg) : : "memory"); \
            (storage) = static_cast<uint32_t>(tempreg); \
        } while (NN_STATIC_CONDITION(false))

// Reads the user-enable register (PMUSERENR).
#define READ_PMUSERENR(storage) do \
        { \
            uintptr_t tempreg; \
            asm volatile("mrc p15,0,%0,c9,c14,0" : "=r"(tempreg) : : "memory"); \
            (storage) = static_cast<uint32_t>(tempreg); \
        } while (NN_STATIC_CONDITION(false))

// Single-instruction register writes; each takes the value in a register operand.
#define WRITE_PMEVTYPERN(value, n) asm volatile("mcr p15,0,%0,c14,c12," NN_MACRO_STRINGIZE(n) : : "r"(value) : "memory")
#define WRITE_PMEVCNTRN(value, n) asm volatile("mcr p15,0,%0,c14,c8," NN_MACRO_STRINGIZE(n) : : "r"(value) : "memory")
#define WRITE_PMCNTENCLR(reg) asm volatile("mcr p15,0,%0,c9,c12,2" : : "r"(reg) : "memory")
#define WRITE_PMCNTENSET(reg) asm volatile("mcr p15,0,%0,c9,c12,1" : : "r"(reg) : "memory")
#define WRITE_PMCR(reg) asm volatile("mcr p15,0,%0,c9,c12,0" : : "r"(reg) : "memory")
#define WRITE_PMOVSCLR(reg) asm volatile("mcr p15,0,%0,c9,c12,3" : : "r"(reg) : "memory")
#define SOFTWARE_INCREMENT(reg) asm volatile("mcr p15,0,%0,c9,c12,4": : "r"(reg) : "memory")
#else
// Reads pmevcntr<n>_el0; the value is narrowed to uint32_t.
#define READ_PMEVCNTRN(storage, n) do \
    { \
        uintptr_t tempreg; \
        asm volatile("mrs %0, pmevcntr" NN_MACRO_STRINGIZE(n) "_el0" : "=r"(tempreg) : : "memory"); \
        (storage) = static_cast<uint32_t>(tempreg); \
    } while (NN_STATIC_CONDITION(false))
// Reads pmccntr_el0 (cycle counter) at full register width.
#define READ_PMCCNTR(storage) do \
    { \
        uintptr_t tempreg; \
        asm volatile("mrs %0, pmccntr_el0" : "=r"(tempreg) : : "memory"); \
        (storage) = tempreg; \
    } while (NN_STATIC_CONDITION(false))
// The remaining accessors go through the compiler's named system-register builtins.
#define READ_PMCR(storage) (storage) = __builtin_arm_rsr("pmcr_el0")
#define READ_PMUSERENR(storage) (storage) = __builtin_arm_rsr("pmuserenr_el0")
#define WRITE_PMEVTYPERN(value, n) __builtin_arm_wsr("pmevtyper" NN_MACRO_STRINGIZE(n) "_el0", value)
#define WRITE_PMEVCNTRN(value, n) __builtin_arm_wsr("pmevcntr" NN_MACRO_STRINGIZE(n) "_el0", value)
#define WRITE_PMCNTENCLR(reg) __builtin_arm_wsr("pmcntenclr_el0", reg)
#define WRITE_PMCNTENSET(reg) __builtin_arm_wsr("pmcntenset_el0", reg)
#define WRITE_PMCR(reg) __builtin_arm_wsr("pmcr_el0", reg)
#define WRITE_PMOVSCLR(reg) __builtin_arm_wsr("pmovsclr_el0", reg)
#define SOFTWARE_INCREMENT(reg) __builtin_arm_wsr("pmswinc_el0", reg)
#endif


namespace
{
    // Bit masks for the value read back by READ_PMUSERENR.
    enum PMUSERENR
    {
        // Bit 0: tested by ProcessHasPmuAccess() to decide whether the PMU is usable.
        PMUSERENR_ENbit = 0x1,
    };

    // Bit masks for the PMU control register value passed to WRITE_PMCR /
    // returned by READ_PMCR. Listed from the least significant bit upward.
    enum PmcrBitFlags : uint32_t
    {
        Pmcr_Enable                     = 0x01, // bit 0
        Pmcr_EventCounterReset          = 0x02, // bit 1
        Pmcr_CycleCounterReset          = 0x04, // bit 2
        Pmcr_ClockDivider               = 0x08, // bit 3
        Pmcr_EnableExport               = 0x10, // bit 4
        Pmcr_DisableCycleWhenProhibited = 0x20, // bit 5
        Pmcr_LongCycle                  = 0x40, // bit 6

        // Baseline written on every PMCR update; note that it leaves
        // Pmcr_Enable clear, so writing it alone stops the counters.
        DefaultPmcr = Pmcr_DisableCycleWhenProhibited | Pmcr_LongCycle,
    };

    // Requested PMU configuration for one core.
    struct PerfCounterLayout
    {
        // Event-type values for the six event counters; written to the
        // event-type registers 0..5 by CommandTask_SetPerfCounters.
        uint32_t counters[6];
        // Value loaded into event counter 0 by CommandTask_ResetInterrupt.
        uint32_t wait;
    };

    // Per-core bookkeeping block; one instance is allocated per core in Initialize().
    struct PerformanceCounterDetails
    {
        // Declared volatile, so every access to the layout is a real memory access.
        volatile PerfCounterLayout perfLayout;
        // TODO: Perf counter overflow
        //nn::os::EventType counterOverflow;
    };

    // One details pointer per supported core, indexed by core number.
    // Populated by Initialize() and released by Finalize().
    PerformanceCounterDetails* s_counterDetails[SupportedCoreCount];

    // Predefined counter groups, handed out by index through
    // GetPerformanceCounterGroup(). Each entry lists six PerfCounter values,
    // presumably one per hardware event counter slot (PerfCounterGroupDefinition
    // is declared elsewhere -- confirm its layout there).
    // The position of an entry in this table is its group index, so entries
    // must not be reordered without updating whoever supplies the index.
    PerfCounterGroupDefinition gPerformanceCounterGroups[] =
    {
        /* Performance Counters Disabled */
        {
            PerfCounter_Disabled,
            PerfCounter_Disabled,
            PerfCounter_Disabled,
            PerfCounter_Disabled,
            PerfCounter_Disabled,
            PerfCounter_Disabled,
        },
        /* Instructions Executed */
        {
            PerfCounter_ExecutedInstructionRetired,
            PerfCounter_ExecutedInstructionSpec,
            PerfCounter_ExecutedLoadOrStoreSpec,
            PerfCounter_ExecutedDataProcessingSpec,
            PerfCounter_ExecutedAsimdSpec,
            PerfCounter_ExecutedVfpSpec,
        },
        /* L1 I-Cache */
        {
            PerfCounter_L1InstructionCacheAccess,
            PerfCounter_L1InstructionCacheRefill,
            PerfCounter_L1InstructionTlbRefill,
            PerfCounter_Disabled,
            PerfCounter_ExecutedInstructionSpec,
            PerfCounter_ExecutedInstructionRetired,
        },
        /* L1 D-Cache */
        {
            PerfCounter_L1DataCacheAccess,
            PerfCounter_L1DataCacheRead,
            PerfCounter_L1DataCacheWrite,
            PerfCounter_L1DataCacheWriteBack,
            PerfCounter_L1DataCacheRefill,
            PerfCounter_ExecutedInstructionRetired,
        },
        /* L2 Cache */
        {
            PerfCounter_L2DataCacheAccess,
            PerfCounter_L2DataCacheRead,
            PerfCounter_L2DataCacheWrite,
            PerfCounter_L2DataCacheWriteBack,
            PerfCounter_L2DataCacheRefill,
            PerfCounter_ExecutedInstructionRetired,
        },
        /* Atomics */
        {
            PerfCounter_ExecutedExclusiveLoadSpec,
            PerfCounter_ExecutedExclusiveStorePassSpec,
            PerfCounter_ExecutedExclusiveStoreFailSpec,
            PerfCounter_ExecutedLoadAcquireSpec,
            PerfCounter_ExecutedStoreReleaseSpec,
            PerfCounter_ExecutedLoadOrStoreSpec,
        },
        /* Branches */
        {
            PerfCounter_BranchMispredicted,
            PerfCounter_BranchPredicted,
            PerfCounter_ExecutedProgramCounterWriteSpec,
            PerfCounter_ExecutedBranchImmediateSpec,
            PerfCounter_ExecutedBranchReturnSpec,
            PerfCounter_ExecutedBranchIndirectSpec,
        },
        /* System Calls */
        {
            PerfCounter_ExceptionTakenSvc,
            PerfCounter_CpuCycles_EL1,
            PerfCounter_ExceptionTaken,
            PerfCounter_ExceptionTakenFiq,
            PerfCounter_ExceptionTakenIrq,
            PerfCounter_ExecutedInstructionRetired,
        },
        /* Unaligned Data Access */
        {
            PerfCounter_UnalignedAccessReadSpec,
            PerfCounter_UnalignedAccessWriteSpec,
            PerfCounter_UnalignedAccessSpec,
            PerfCounter_ExecutedLoadOrStoreSpec,
            PerfCounter_Disabled,
            PerfCounter_ExecutedInstructionRetired,
        },
    };

    // Reports whether the EN bit is set in the user-enable register (PMUSERENR).
    inline bool ProcessHasPmuAccess()
    {
        uintptr_t userEnable;
        READ_PMUSERENR(userEnable);

        const bool enBitSet = (userEnable & PMUSERENR_ENbit) != 0;
        return enBitSet;
    }


    // Stops the PMU by writing DefaultPmcr (which has the enable bit clear) to PMCR.
    // Returns true when the enable bit was set in PMCR before this call.
    inline bool DisableCounters()
    {
        uint32_t previousPmcr;
        READ_PMCR(previousPmcr);

        uint32_t disabledPmcr = DefaultPmcr;
        WRITE_PMCR(disabledPmcr);

        const bool wasEnabled = (previousPmcr & Pmcr_Enable) != 0;
        return wasEnabled;
    }


    // Turns counting on without touching the current counter values:
    // sets the count-enable bits, then writes PMCR with the enable bit.
    inline void EnableCounters()
    {
        // Bit 31 plus bits 0..5, i.e. 0x8000003f.
        uint32_t enableMask = (1u << 31) | 0x3Fu;
        WRITE_PMCNTENSET(enableMask);

        uint32_t runningPmcr = DefaultPmcr | Pmcr_Enable;
        WRITE_PMCR(runningPmcr);
    }


    // Restarts counting from zero: sets the count-enable bits, clears every
    // overflow flag, then writes PMCR with both reset bits and the enable bit.
    inline void EnableAndClearCounters()
    {
        // Bit 31 plus bits 0..5, i.e. 0x8000003f.
        uint32_t enableMask = (1u << 31) | 0x3Fu;
        WRITE_PMCNTENSET(enableMask);

        uint32_t allOverflowFlags = 0xFFFFFFFFu;
        WRITE_PMOVSCLR(allOverflowFlags);

        uint32_t restartPmcr = DefaultPmcr
                             | Pmcr_CycleCounterReset
                             | Pmcr_EventCounterReset
                             | Pmcr_Enable;
        WRITE_PMCR(restartPmcr);
    }


    // Clears every overflow flag and resets the cycle and event counters.
    // Note: the PMCR value written here does not include Pmcr_Enable, so the
    // counters are left disabled when this returns.
    inline void ClearCounters()
    {
        uint32_t allOverflowFlags = 0xFFFFFFFFu;
        WRITE_PMOVSCLR(allOverflowFlags);

        uint32_t resetPmcr = DefaultPmcr
                           | Pmcr_CycleCounterReset
                           | Pmcr_EventCounterReset;
        WRITE_PMCR(resetPmcr);
    }


    // Forward declarations of the register-level helpers defined below.
    // CommandTask_SetInterrupt calls CommandTask_ResetInterrupt before the
    // latter is defined, so the declarations are required.
    void CommandTask_SetPerfCounters(PerformanceCounterDetails* details);
    bool CommandTask_ReadPerfCounters(PerformanceCounters& data, bool wasSampleThread);
    void CommandTask_SetInterrupt(PerformanceCounterDetails* details);
    void CommandTask_ResetInterrupt(PerformanceCounterDetails* details);


    // Programs the six event-type registers from details->perfLayout.counters
    // and restarts counting from zero. Counters are stopped while the event
    // types are being changed.
    inline void CommandTask_SetPerfCounters(PerformanceCounterDetails* details)
    {
        DisableCounters();

        volatile PerfCounterLayout& layout = details->perfLayout;

        for (int slot = 0; slot < PerformanceCounterCount; ++slot)
        {
            DEBUG_LOG("Setting counter %d: %x\n", slot, layout.counters[slot]);
        }

        // The register index is stringized into the instruction by the macro,
        // so each slot has to be written with a literal index (no loop).
        const uint32_t event0 = layout.counters[0];
        WRITE_PMEVTYPERN(event0, 0);

        const uint32_t event1 = layout.counters[1];
        WRITE_PMEVTYPERN(event1, 1);

        const uint32_t event2 = layout.counters[2];
        WRITE_PMEVTYPERN(event2, 2);

        const uint32_t event3 = layout.counters[3];
        WRITE_PMEVTYPERN(event3, 3);

        const uint32_t event4 = layout.counters[4];
        WRITE_PMEVTYPERN(event4, 4);

        const uint32_t event5 = layout.counters[5];
        WRITE_PMEVTYPERN(event5, 5);

        EnableAndClearCounters();
    }



    // Stops the PMU, copies the six event counters into data.counters, and
    // decides whether the cycle count is reported.
    //
    // When the counters were running on entry, or the caller is not the
    // sample thread, data.cycles receives the cycle counter, counting is
    // restarted from zero, and true is returned. Otherwise data.cycles is set
    // to 0, the counters stay disabled, and false is returned.
    inline bool CommandTask_ReadPerfCounters(PerformanceCounters& data, bool wasSampleThread)
    {
        const bool wereRunning = DisableCounters();

        uint32_t event0;
        uint32_t event1;
        uint32_t event2;
        uint32_t event3;
        uint32_t event4;
        uint32_t event5;
        uint64_t cycleCount;

        READ_PMEVCNTRN(event0, 0);
        READ_PMEVCNTRN(event1, 1);
        READ_PMEVCNTRN(event2, 2);
        READ_PMEVCNTRN(event3, 3);
        READ_PMEVCNTRN(event4, 4);
        READ_PMEVCNTRN(event5, 5);
        READ_PMCCNTR(cycleCount);

        // Event counters are always reported, whichever path is taken below.
        data.counters[0] = event0;
        data.counters[1] = event1;
        data.counters[2] = event2;
        data.counters[3] = event3;
        data.counters[4] = event4;
        data.counters[5] = event5;

        const bool reportCycles = wereRunning || !wasSampleThread;
        if (!reportCycles)
        {
            data.cycles = 0;
            return false;
        }

        data.cycles = cycleCount;
        EnableAndClearCounters();
        return true;
    }



    // Arms the overflow setup described by `details`. The interrupt itself is
    // not configured yet (see the todo below); for now this only reloads the
    // counter through CommandTask_ResetInterrupt.
    inline void CommandTask_SetInterrupt(PerformanceCounterDetails* details)
    {
        // todo: Actually set PMC interrupt
        //nn::dbg::SetPMCEvent(
        //    nn::dbg::PERFORMANCE_COUNTER_NAME_CORE_0,
        //    (nn::dbg::PerformanceCounterEvent)details->perfLayout.counters[0]);

        CommandTask_ResetInterrupt(details);
    }



    // Zeroes all counters and overflow flags, preloads event counter 0 with
    // details->perfLayout.wait, and turns counting back on.
    inline void CommandTask_ResetInterrupt(PerformanceCounterDetails* details)
    {
        // Reset everything first (this also leaves the counters disabled).
        ClearCounters();
        //nn::os::ClearEvent(&details->counterOverflow);

        // Then load the one counter we care about with its start value.
        const uint32_t preload = details->perfLayout.wait;
        WRITE_PMEVCNTRN(preload, 0);

        EnableCounters();
    }
}


//-------------------------------------------------------------------
// Globally defined functions
//-------------------------------------------------------------------
// Returns true when ProcessHasPmuAccess() reports that the PMU can be used.
bool IsAvailable()
{
    const bool hasAccess = ProcessHasPmuAccess();
    return hasAccess;
}


void Initialize()
{
    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        DEBUG_LOG("Intializing perfcounter info on core %d\n", core);
        PerformanceCounterDetails* details = Memory::GetInstance()->Allocate<PerformanceCounterDetails>();
        new (details) PerformanceCounterDetails;
        s_counterDetails[core] = details;
        memset(details, 0, sizeof(PerformanceCounterDetails));
        //nn::os::InitializeEvent(&details->counterOverflow, false, nn::os::EventClearMode_AutoClear);
    }
}



// Releases the per-core PerformanceCounterDetails blocks created by Initialize().
//
// Each slot in s_counterDetails is cleared before its block is freed, so the
// table never holds a pointer to released memory. Slots that are already
// empty are skipped, which makes a repeated Finalize() (or a Finalize()
// without a prior Initialize()) harmless instead of a double free.
void Finalize()
{
    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        DEBUG_LOG("Finalizing perfcounter info on core %d\n", core);
        PerformanceCounterDetails* details = s_counterDetails[core];
        if (details == nullptr)
        {
            continue;
        }

        //nn::os::FinalizeEvent(&details->counterOverflow);

        // Drop the table entry first so nothing can pick up a dangling pointer.
        s_counterDetails[core] = nullptr;
        Memory::GetInstance()->Free(details);
    }
}




// Stores the requested event selection for the calling core and programs the
// PMU with it.
//
// counters: PerformanceCounterCount packed 16-bit selectors. For values above
//           PerfCounter_Disabled, bits 9..15 are moved up to bits 25..31 and
//           bits 0..8 are kept in place; other values are stored unchanged.
void SetCounters(uint16_t counters[PerformanceCounterCount])
{
    NN_SDK_ASSERT(ProcessHasPmuAccess());

    int core = nn::os::GetCurrentCoreNumber();
    PerformanceCounterDetails* details = s_counterDetails[core];

    for (int slot = 0; slot < PerformanceCounterCount; ++slot)
    {
        const uint32_t packed = counters[slot];
        const uint32_t expanded = (packed > PerfCounter_Disabled)
            ? (((packed & 0xFE00) << 16) | (packed & 0x1FF))
            : packed;
        details->perfLayout.counters[slot] = expanded;
    }

    CommandTask_SetPerfCounters(details);
    DEBUG_LOG("SetCounters on core %d completed\n", core);
}



// Reads the PMU counters into `data` via CommandTask_ReadPerfCounters and
// returns its result. `core` is accepted but not used: the registers are read
// by whichever core executes this call.
bool ReadCounters(PerformanceCounters& data, uint32_t core, bool wasSamplerThread)
{
    NN_UNUSED(core);
    NN_SDK_ASSERT(ProcessHasPmuAccess());

    const bool captured = CommandTask_ReadPerfCounters(data, wasSamplerThread);
    return captured;
}



// Stops counting. The previous enable state reported by DisableCounters()
// is intentionally not used here.
void StopCounters()
{
    NN_SDK_ASSERT(ProcessHasPmuAccess());

    DisableCounters();
}



// Returns the overflow event for `core`. Overflow events are not implemented
// yet, so this currently always returns nullptr; the intended lookup is kept
// below for reference.
nn::os::EventType* GetInterrupt(uint32_t core)
{
    NN_UNUSED(core);

    //PerformanceCounterDetails* details = s_counterDetails[core];
    //return &details->counterOverflow;
    nn::os::EventType* const overflowEvent = nullptr;
    return overflowEvent;
}



// Records an overflow configuration in the details block of `core` and applies it.
//
// type:     event selector; values above PerfCounter_Disabled have bits 9..15
//           moved up to bits 25..31 with bits 0..8 kept, others are stored as-is.
// waitTime: value preloaded into event counter 0.
// core:     index into s_counterDetails.
// NOTE(review): the register writes are performed by the calling core --
// presumably the caller is expected to be running on `core`; confirm.
void SetInterrupt(
    PerfCounter type,
    uint32_t waitTime,
    uint32_t core)
{
    NN_SDK_ASSERT(ProcessHasPmuAccess());

    // todo: Bind event to interrupt when counter switches from Disabled
    // todo: Unbind event from interrupt when counter switches to Disabled

    PerformanceCounterDetails* details = s_counterDetails[core];
    DEBUG_LOG("SetInterrupt on core %d\n", core);

    const uint32_t packed = type;
    const uint32_t expanded = (packed > PerfCounter_Disabled)
        ? (((packed & 0xFE00) << 16) | (packed & 0x1FF))
        : packed;

    details->perfLayout.counters[0] = expanded;
    details->perfLayout.wait = waitTime;

    CommandTask_SetInterrupt(details);
}



void ResetInterrupt(uint32_t core)
{
    NN_SDK_ASSERT(ProcessHasPmuAccess());

    PerformanceCounterDetails* details = s_counterDetails[core];
    DEBUG_LOG("ResetInterrupt on core %d\n", core);
    CommandTask_ResetInterrupt(details);
}


// Writes an all-ones mask to the software-increment register.
void SoftwareIncrement()
{
    NN_SDK_ASSERT(ProcessHasPmuAccess());

    uint32_t everyCounter = 0xFFFFFFFFu;
    SOFTWARE_INCREMENT(everyCounter);
}


// Returns the predefined counter group at `index` in gPerformanceCounterGroups.
//
// index: zero-based position in the table.
// Returns a pointer to the table entry, or nullptr when `index` is negative
// or not smaller than the number of entries. Previously an out-of-range index
// produced a pointer outside the table.
PerfCounterGroupDefinition* GetPerformanceCounterGroup(int index)
{
    const int groupCount =
        static_cast<int>(sizeof(gPerformanceCounterGroups) / sizeof(gPerformanceCounterGroups[0]));

    if (index < 0 || index >= groupCount)
    {
        return nullptr;
    }

    return &(gPerformanceCounterGroups[index]);
}

} // pmu
} // profiler
} // nn
