﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <cstdio>
#include <atomic>
#include <new>

#include <nn/nn_Common.h>

NN_PRAGMA_PUSH_WARNINGS
#pragma GCC diagnostic ignored "-Wsign-conversion"
#include <nn/nn_TimeSpan.h>
#include <nn/nn_Version.h>
#include <nn/os.h>
#include <nn/util/util_BitPack.h>
#include <nn/svc/svc_Base.h>
#include <nn/svc/svc_Dmnt.h>
#include <nn/svc/svc_Result.h>
#include <nn/svc/svc_Synchronization.h>
NN_PRAGMA_POP_WARNINGS

#include <nn/profiler/profiler_Api.h>

#include "pmu/profiler_PerfCounterGroups.h"
#include "pmu/profiler_PerfCounterThread.h"
#include "profiler_Comms.h"
#include "profiler_CommsIpc.h"
#include "profiler_DataStream.h"
#include "profiler_Defines.h"
#include "profiler_HeaderWriter.h"
#include "profiler_IpcEvent.h"
#include "profiler_LibPrivate.h"
#include "profiler_Logging.h"
#include "profiler_Memory.h"
#include "profiler_RecordMethods.h"
#include "profiler_ResultPrivate.h"
#include "profiler_SamplingThread.h"
#include "profiler_TargetApplication.h"
#include "profiler_ThreadPriorities.h"
#include "profiler_Time.h"
#include "profiler_Timing.h"
#include "profiler_Workarea.h"
#include "profiler_WriteToBuffer.h"

namespace nn { namespace profiler {


namespace /*anonymous*/
{
    // Size, in bytes, of the stack given to each sampler thread and each guard
    // thread created by InitializeSamplingThreads().
    const size_t SampleThreadStackSize = 8 * 1024;
    // Number of slots in SamplingThread::recordFuncs. Unused slots are filled with
    // RecordDefaultFuncPtr by SamplingThread_FillRecordFuncBuffer().
    const size_t RecordFunctionsBufferSize = 8;

    // Indices into SamplingThread::timers (only present when
    // NN_PROFILER_TIMING_ENABLE is defined).
    enum SamplingThreadTimer : int
    {
        // Begun at the end of SamplingThread_WaitForSampling() and ended at its start.
        SamplingThreadTimer_Total,
        // Brackets the wait for the next sample inside SamplingThread_WaitForSampling().
        SamplingThreadTimer_Sleep,
        // Brackets the handling of one trigger in SamplingThread_ProfilingLoop().
        SamplingThreadTimer_Record,
        // Brackets the record function loop in SamplingThread_RunRecordFuncList().
        SamplingThreadTimer_RecordLoop,
        // Started by SamplingThread_RecordStackStart(), stopped by SamplingThread_RecordStackStop().
        SamplingThreadTimer_RecordStack,

        // Number of timers; used to size and iterate the timers array.
        SamplingThreadTimer_Count,
    };

    // Bit flags stored in SamplingThread::triggerFlags. SamplingThread_ProfilingLoop()
    // fetches and clears them with exchange(0) and dispatches on each bit.
    enum TriggerFlags : uint32_t
    {
        // Record a full sample of SamplingThread::threadToSample.
        TriggerFlags_TakeSample = (1u << 0),
        // Record a PayloadEvents_NoActiveThread event.
        TriggerFlags_SampleNoThread = (1u << 1),
        // Record a PayloadEvents_UnknownActiveThread event.
        TriggerFlags_SampleUnknownThread = (1u << 2),
        // Record a PayloadEvents_ProfilerActiveThread event.
        TriggerFlags_SampleSamplerThread = (1u << 3),

        // Leave the profiling loop; checked before any of the sample flags.
        TriggerFlags_StopProfiling = (1u << 31),
    };

    // Per-core sampler state. One instance is allocated for every core selected by
    // the target application's core mask (see InitializeSamplingThreads()).
    struct SamplingThread
    {
        // Stack and thread object of the sampler thread (runs SamplingThread_ThreadFunc).
        uint8_t *stackMemory;
        nn::os::ThreadType thread;

        // Stack and thread object of the guard thread (runs SamplingThread_ThreadGuardFunc).
        uint8_t *stackMemoryGuard;
        nn::os::ThreadType threadGuard;

        // Set by FinalizeSamplingThreads(); polled by the sampler and guard threads.
        volatile bool shouldExit;
        // Cleared when the profiling loop starts and set when it ends; the guard
        // thread spins while this is false.
        volatile bool shouldGuardExit;
        // True between the end of setup in SamplingThread_Active() and the end of
        // its profiling loop.
        volatile bool isProfiling;
        // Selects SamplingThread_Active() vs SamplingThread_Inactive() for the next run.
        volatile bool isActive;
        // Set by StartProfilingSamplingThreads() on the first active core.
        bool isControlThread;

        // Handshake between the profiling loop and the guard thread: the loop
        // signals guardStartEvent, the guard signals guardExitEvent once it has
        // left its spin loop.
        nn::os::EventType guardStartEvent;
        nn::os::EventType guardExitEvent;

        // Record functions run for every sample (see SamplingThread_SetupRecordFuncList()).
        ProfilerRecordFuncPtr recordFuncs[RecordFunctionsBufferSize];

        // Pending TriggerFlags bits for this core.
        std::atomic<uint32_t> triggerFlags;
        // Auto-clear event initialized per core and cleared in SamplingThread_Active().
        nn::os::EventType trigger;

        // Thread id to record for the current sample, or one of the
        // DebugEvent*ThreadId sentinels (set in SamplingThread_WaitForSampling()).
        nn::os::ThreadId threadToSample;
        nn::Bit32 threadFlags;

#ifdef NN_PROFILER_TIMING_ENABLE
        // Self-timing data, indexed by SamplingThreadTimer.
        nn::profiler::Timer timers[SamplingThreadTimer_Count];
#endif
    };
    // Per-core sampler state, indexed by core number; nullptr for cores that have
    // no sampler thread.
    SamplingThread* sSamplingThreads[SupportedCoreCount];

    // State shared by all sampler threads. Zeroed by InitializeSamplingThreads().
    struct Globals
    {
        // Manual-clear event the sampler threads block on between profiling runs.
        nn::os::EventType startProfilingEvent;
        nn::os::TimerEventType sampleTimerEvent;
        // Awaited by every sampler thread at the end of a run; sized to the number
        // of sampler threads that were created.
        nn::os::BarrierType threadBarrier;
        // Awaited by SamplingThread_Active() once its setup succeeded or failed;
        // re-initialized by StartProfilingSamplingThreads().
        nn::os::BarrierType samplerThreadStartedBarrier;
        // Signaled by the profiling loop when sampleWait drops to zero or below,
        // and once more when the loop ends.
        nn::os::LightEventType sampleTaken;
        // Signaled by each sampler thread after it clears its bit in workBufferOpen.
        nn::os::LightEventType workBufferClosed;
        // Decremented by the profiling loop after each handled trigger.
        std::atomic<int32_t> sampleWait;
        // Bit mask with one bit per core whose work buffer is currently open.
        std::atomic<uint32_t> workBufferOpen;
        std::atomic<uint32_t> coreCount;
        bool isInstrumented;
        bool useProfilerProcess;
        volatile bool stopSampling;
    };
    Globals globals;


    // ------------------------------------------------------
    // Local function declarations

    // Thread entry points and the per-core sampling loop.
    void SamplingThread_ThreadFunc(void* arg);
    void SamplingThread_ThreadGuardFunc(void* arg);
    void SamplingThread_ProfilingLoop();
    void SamplingThread_WaitForSampling(WorkArea* ws, SamplingThread* coreSettings);

    // Writers for the individual event kinds handled by the sampling loop.
    void SamplingThread_RecordNoThread(WorkArea* ws, uint32_t core);
    void SamplingThread_RecordUnknownThread(WorkArea* ws, uint32_t core);
    void SamplingThread_RecordProfilerThread(WorkArea* ws, uint32_t core);
    void SamplingThread_RecordTimedBasicEvent(WorkArea* ws, uint32_t core, uint32_t event);
    void SamplingThread_RecordSample(WorkArea* ws, uint32_t core, nn::os::ThreadId thread);

    // Construction and execution of the per-sample record function list.
    void SamplingThread_FillRecordFuncBuffer(ProfilerRecordFuncPtr* funcs);
    void SamplingThread_SetupRecordFuncList(
        WorkArea* ws,
        SamplingThread* threadDetails,
        bool* hasPerfCounters);
    nn::Result SamplingThread_RunRecordFuncList(ProfilerRecordFuncPtr* funcs, WorkArea* ws);

    // Writes the per-core header that precedes the sample data.
    void SetupCoreHeader(uint32_t core, SettingsFromThePcGui* settings);

    // Bodies executed by SamplingThread_ThreadFunc depending on SamplingThread::isActive.
    void SamplingThread_Active(uint32_t core, WorkArea* ws, SamplingThread* coreSettings);
    void SamplingThread_Inactive(uint32_t core, WorkArea* ws, SamplingThread* coreSettings);
    // ------------------------------------------------------


    // Runs one complete profiling session on an active core: obtains the sample
    // buffer, configures the record function list and performance counters,
    // writes the core header, runs the sampling loop and finally closes the buffer.
    //   core         - core number of the calling sampler thread
    //   ws           - work area of this core's sample buffer
    //   coreSettings - per-core sampler state
    void SamplingThread_Active(uint32_t core, WorkArea* ws, SamplingThread* coreSettings)
    {
        bool success = false;
        bool hasPerfCounters = false;

        // Mark this core's buffer as open; the bit is cleared again by
        // SamplingThread_ThreadFunc() after this function returns.
        globals.workBufferOpen |= (1 << core);

        success = ObtainAndSetupSampleBuffer(SampleBufferIndex(core));
        if (!success)
        {
            ERROR_LOG("Could not obtain memory for core %d\n", core);
            coreSettings->isActive = false;
            // The barrier must still be reached on failure so the other
            // participants of samplerThreadStartedBarrier are released.
            nn::os::AwaitBarrier(&globals.samplerThreadStartedBarrier);
            //--globals.allCoreSyncCount;
            return;
        }

        nn::os::ClearEvent(&coreSettings->trigger);
        SamplingThread_SetupRecordFuncList(ws, coreSettings, &hasPerfCounters);

        {
            SettingsFromThePcGui *settings = ws->settings;
            uint32_t flags = settings->flags;
            if ((flags & SettingsFromThePcGui::SampleByPerfCounter))
            {
                // Sampling driven by a performance counter interrupt; only set up
                // when the first counter slot is actually enabled.
                if (settings->perf_counters[0] != pmu::PerfCounter_Disabled)
                {
                    DEBUG_LOG("Setting up sample by perf on core %d\n", core);
                    ws->fields.SetBit(WorkArea::SampleUsingPerfCounters, true);
                    ws->fields.SetBit(WorkArea::UseInfiniteTime, true);
                    pmu::SetInterrupt(
                        static_cast<pmu::PerfCounter>(settings->perf_counters[0]),
                        settings->requested_time_between_samples_in_nanoseconds,
                        core);
                }
            }

            if (flags & SettingsFromThePcGui::UseInstrumented)
            {
                ws->fields.SetBit(WorkArea::UseInfiniteTime, true);
            }

            ws->record_cores = static_cast<uint8_t>(settings->coreMask);

            DUMP_CURRENT_LINE();

            ws->fields.SetBit(WorkArea::IsUsingPerfCounters, hasPerfCounters);
        }

        DUMP_CURRENT_LINE();
        SetupCoreHeader(core, ws->settings);

        coreSettings->isProfiling = true;
        //--globals.allCoreSyncCount;
        // Setup is complete; rendezvous with the other barrier participants
        // before sampling starts.
        nn::os::AwaitBarrier(&globals.samplerThreadStartedBarrier);

        DUMP_CURRENT_LINE();

        // Set up timers
        for (int timerIdx = 0; timerIdx < SamplingThreadTimer_Count; ++timerIdx)
        {
            NN_PROFILER_TIMING_CLEAR(&coreSettings->timers[timerIdx]);
        }

        DUMP_CURRENT_LINE();
        // Blocks until the loop receives TriggerFlags_StopProfiling.
        SamplingThread_ProfilingLoop();
        DUMP_CURRENT_LINE();

    #if defined(NN_PROFILER_TIMING_ENABLE)
        // Report the average of every self-timing counter for this core.
        for (int timerIdx = 0; timerIdx < SamplingThreadTimer_Count; ++timerIdx)
        {
            nn::TimeSpan ts = nn::os::ConvertToTimeSpan(nn::os::Tick((int64_t)coreSettings->timers[timerIdx].GetAverageTime()));
            NN_UNUSED(ts);
            FORCE_LOG("Timer %d on core %d: %lluns\n", timerIdx, core, ts.GetNanoSeconds());
        }
    #endif

        DEBUG_LOG("Finished profiling on core %d\n", core);
        ws->record_cores = 0;
        coreSettings->isProfiling = false;
        coreSettings->isActive = false;

        // Stop performance counters
        if (ws->fields.GetBit(WorkArea::SampleUsingPerfCounters))
        {
            pmu::StopCounters();
            // Unbind from the interrupt overflow now that profiling has ceased
            pmu::SetInterrupt(pmu::PerfCounter_Disabled, 0, core);
        }

        // Close the buffer
        {
            // Reserve 16 bytes for the terminating record written below
            // (event id, timestamp, generic-event id).
            uint8_t *writePtr = FETCH_ADD(ws->curPtr, 16);
            writePtr = WriteToBuffer(writePtr, static_cast<uint32_t>(PayloadEvents_Base));
            writePtr = WriteToBuffer(writePtr, GetCurrentTime());
            writePtr = WriteToBuffer(writePtr, static_cast<uint32_t>(PayloadGenericEvents_End));

            FinalizeCoreWorkArea(SampleBufferIndex(core));
        }
    }


    void SamplingThread_Inactive(uint32_t core, WorkArea* ws, SamplingThread* coreSettings)
    {
        NN_UNUSED(ws);
        NN_UNUSED(coreSettings);

        if (core == 0)
        {
            nn::svc::LastThreadContext context;
            nn::Bit64 tmpTid;

            while (!coreSettings->shouldExit)
            {
                auto debugHandle = TargetApplication::GetCurrent()->GetDebugHandle();
                auto waitTime = TargetApplication::GetCurrent()->GetWaitTime();

                if (waitTime == nn::svc::WAIT_INFINITE)
                {
                    break;
                }

                nn::svc::GetDebugFutureThreadInfo(&context, &tmpTid, debugHandle, waitTime);
            }
        }
    }


    void SamplingThread_ThreadGuardFunc(void *arg)
    {
        SamplingThread* coreSettings = reinterpret_cast<SamplingThread*>(arg);

        while (!coreSettings->shouldExit)
        {
            nn::os::WaitEvent(&coreSettings->guardStartEvent);

            while (!coreSettings->shouldGuardExit)
            {
                // spin
            }

            nn::os::SignalEvent(&coreSettings->guardExitEvent);
        }
    }


    // Pauses the guard thread that belongs to coreSettings.
    // The result of the SVC call is not checked.
    void SamplingThread_SuspendGuardThread(SamplingThread* coreSettings)
    {
        nn::os::ThreadType& guard = coreSettings->threadGuard;
        svc::Handle guardHandle(guard._handle);
        nn::svc::SetThreadActivity(guardHandle, svc::ThreadActivity_Paused);
    }


    // Makes the guard thread that belongs to coreSettings runnable again.
    // The result of the SVC call is not checked.
    void SamplingThread_ResumeGuardThread(SamplingThread* coreSettings)
    {
        nn::os::ThreadType& guard = coreSettings->threadGuard;
        svc::Handle guardHandle(guard._handle);
        nn::svc::SetThreadActivity(guardHandle, svc::ThreadActivity_Runnable);
    }


    // Entry point of the per-core sampler thread.
    // Each iteration waits for startProfilingEvent, runs either the active or the
    // inactive body, marks this core's work buffer as closed and then meets the
    // other sampler threads at threadBarrier before waiting for the next start.
    //   arg - the SamplingThread describing this core
    void SamplingThread_ThreadFunc(void *arg)
    {
        uint32_t core = static_cast<uint32_t>(nn::os::GetCurrentCoreNumber());

        SamplingThread* coreSettings = reinterpret_cast<SamplingThread*>(arg);
        WorkArea* ws = GetWorkAreaForCore(SampleBufferIndex(core));

        coreSettings->isProfiling = false;

        DumpThreadInformation();

        while (!coreSettings->shouldExit)
        {
            nn::os::WaitEvent(&globals.startProfilingEvent);

            // FinalizeSamplingThreads() signals the same event to wake the thread for shutdown.
            if (coreSettings->shouldExit) { break; } // Exit!

            if (coreSettings->isActive)
            {
                SamplingThread_Active(core, ws, coreSettings);
            }
            else
            {
                SamplingThread_Inactive(core, ws, coreSettings);
            }

            // This core is done with its buffer; publish that and notify any waiter.
            globals.workBufferOpen &= ~(1 << core);
            nn::os::SignalLightEvent(&globals.workBufferClosed);

            // If core 0, wait for all cores closed
            if (core == 0)
            {
                //
                // If some core calls GetDebugFutureThreadInfo(-1) after core 0 exits, the core waits forever.
                // To prevent this, core 0 should call GetDebugFutureThreadInfo(timeout).
                //
                while (globals.workBufferOpen != 0)
                {
                    nn::svc::LastThreadContext context;
                    nn::Bit64 tmpTid;

                    auto debugHandle = TargetApplication::GetCurrent()->GetDebugHandle();
                    nn::svc::GetDebugFutureThreadInfo(&context, &tmpTid, debugHandle, 1000 * 1000); // 1ms
                }
            }

            // All sampler threads rendezvous here before the start event is
            // cleared for the next run.
            nn::os::AwaitBarrier(&globals.threadBarrier);
            nn::os::ClearEvent(&globals.startProfilingEvent);
        }
    }



    // Per-core sampling loop, run on the sampler thread of the current core.
    //
    // Starts the guard thread handshake, then repeatedly waits for the next
    // sample (SamplingThread_WaitForSampling), fetches and clears the pending
    // TriggerFlags and records the requested events. The loop ends when
    // TriggerFlags_StopProfiling is observed. On exit the guard thread is released,
    // the temporary stack storage is freed and sampleTaken is signaled one last
    // time so that a waiter cannot block indefinitely.
    NN_NOINLINE void SamplingThread_ProfilingLoop()
    {
        uint32_t core = static_cast<uint32_t>(nn::os::GetCurrentCoreNumber());
        DUMP_CURRENT_LINE();

        SamplingThread* coreSettings = sSamplingThreads[core];
        WorkArea* ws = GetWorkAreaForCore(SampleBufferIndex(core));

        coreSettings->triggerFlags = 0;

        bool shouldExit = false;
        coreSettings->shouldGuardExit = false;

        // setup temp memory for recording stack
        ws->tempStack = reinterpret_cast<uintptr_t>(Memory::GetInstance()->Allocate(ReadStackStorageSize));

        if (ws->fields.GetBit(WorkArea::IsUsingPerfCounters))
        {
            // The values read here are discarded.
            pmu::PerformanceCounters data;
            nn::profiler::pmu::ReadCounters(data, core, false);
        }

        // Pause the guard before signaling it; it is resumed from within
        // SamplingThread_WaitForSampling().
        SamplingThread_SuspendGuardThread(coreSettings);
        nn::os::SignalEvent(&coreSettings->guardStartEvent);

        while (!shouldExit)
        {
            DUMP_CURRENT_LINE();
            SamplingThread_WaitForSampling(ws, coreSettings);

            // Atomically take ownership of all pending requests.
            uint32_t flags = coreSettings->triggerFlags.exchange(0);
            NN_SDK_ASSERT(flags != 0, "Received trigger even though no flags were set\n");

            VERBOSE_LOG("Sample requested on core %d, triggerFlags = %08x\n", core, flags);

            // Stop should always be handled first
            if (flags & TriggerFlags_StopProfiling)
            {
                shouldExit = true;
                continue;
            }

            NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_Record]);
            if (flags & TriggerFlags_TakeSample)
            {
                SamplingThread_RecordSample(ws, core, coreSettings->threadToSample);
            }

            if (flags & TriggerFlags_SampleNoThread)
            {
                SamplingThread_RecordNoThread(ws, core);
            }

            if (flags & TriggerFlags_SampleUnknownThread)
            {
                SamplingThread_RecordUnknownThread(ws, core);
            }

            if (flags & TriggerFlags_SampleSamplerThread)
            {
                SamplingThread_RecordProfilerThread(ws, core);
            }

            // Decrement and test in a single atomic operation. Reading the counter
            // again after the decrement would let two cores both observe a value
            // <= 0 and signal the auto-clear event twice.
            if (--globals.sampleWait <= 0)
            {
                nn::os::SignalLightEvent(&globals.sampleTaken);
            }
            NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_Record]);
        }

        // guard thread exits spin loop
        coreSettings->shouldGuardExit = true;
        SamplingThread_ResumeGuardThread(coreSettings);

        // Wait until the guard thread has left its spin loop
        nn::os::WaitEvent(&coreSettings->guardExitEvent);

        Memory::GetInstance()->Free(reinterpret_cast<void*>(ws->tempStack));

        // Set to the negative of the tracked core count to try to ensure that the Wait cannot lock indefinitely
        // using INT_MIN could cause an underflow
        globals.sampleWait = -static_cast<int32_t>(SupportedCoreCount);
        nn::os::SignalLightEvent(&globals.sampleTaken);
    }



    // Blocks until the next sample is due on this core and classifies it.
    //
    // The guard thread is paused for the duration of the GetDebugFutureThreadInfo
    // call and resumed right afterwards. The outcome is stored in
    // coreSettings->threadToSample (a thread id or one of the DebugEvent*ThreadId
    // sentinels), the register context is copied into ws->context for a real
    // thread, and the matching TriggerFlags bit is OR-ed into triggerFlags.
    //   ws           - work area of the calling core
    //   coreSettings - per-core sampler state of the calling core
    NN_NOINLINE void SamplingThread_WaitForSampling(WorkArea* ws, SamplingThread* coreSettings)
    {
        DUMP_CURRENT_LINE();
        NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_Total]);
        NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_Sleep]);

#if 0
        //
        // If leaf mode does not need stack depth, we can skip suspend/resume the guard threads.
        //
        if ((ws->settings->flags & SettingsFromThePcGui::UseSimple) == 0)
        {
            SamplingThread_SuspendGuardThread(coreSettings);
        }
#else
        SamplingThread_SuspendGuardThread(coreSettings);
#endif

        nn::svc::LastThreadContext context;
        nn::Bit64 tmpTid;
        auto debugHandle = TargetApplication::GetCurrent()->GetDebugHandle();
        auto waitTime = TargetApplication::GetCurrent()->GetWaitTime();
        if (ws->core_number != 0)
        {
            // Only core 0 waits with a timeout; every other core waits indefinitely.
            waitTime = nn::svc::WAIT_INFINITE;
        }
        else if (waitTime == nn::svc::WAIT_INFINITE)
        {
            waitTime = 0; // The wait time is only infinite if profiling is inactive.
                          // Let profiling stop.
        }

        //
        // core 0 sets timer, and core 1 and 2 wait for the same timer.
        //
        auto result = nn::svc::GetDebugFutureThreadInfo(&context, &tmpTid, debugHandle, waitTime);

        // Guard threads should be resumed immediately to prevent application threads from running
#if 0
        if ((ws->settings->flags & SettingsFromThePcGui::UseSimple) == 0)
        {
            SamplingThread_ResumeGuardThread(coreSettings);
        }
#else
        SamplingThread_ResumeGuardThread(coreSettings);
#endif

        if (result <= nn::svc::ResultUnknownThread())
        {
            // Other process
            coreSettings->threadToSample = DebugEventOtherProcessThreadId;
        }
        else if (result <= nn::svc::ResultNoThread())
        {
            // Idle
            coreSettings->threadToSample = DebugEventSystemSleepThreadId;
        }
        else
        {
            // Sampling target: keep the registers needed to record the sample
            ws->context.fp = context.fp;
            ws->context.lr = context.lr;
            ws->context.sp = context.sp;
            ws->context.pc = context.pc;
            coreSettings->threadToSample = tmpTid;
        }

        NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_Sleep]);
        NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_Total]);

        // Translate the classification into the trigger flag consumed by
        // SamplingThread_ProfilingLoop().
        nn::os::ThreadId tid = coreSettings->threadToSample;
        switch (tid)
        {
            case DebugEventSystemSleepThreadId:
                coreSettings->triggerFlags |= TriggerFlags_SampleNoThread;
                break;
            case DebugEventOtherProcessThreadId:
                coreSettings->triggerFlags |= TriggerFlags_SampleUnknownThread;
                break;
            default:
                coreSettings->triggerFlags |= TriggerFlags_TakeSample;
                break;
        }
    }



    // Records a timed PayloadEvents_NoActiveThread event for the given core.
    void SamplingThread_RecordNoThread(WorkArea* ws, uint32_t core)
    {
        DUMP_CURRENT_LINE();
        const uint32_t eventId = PayloadEvents_NoActiveThread;
        SamplingThread_RecordTimedBasicEvent(ws, core, eventId);
    }



    // Records a timed PayloadEvents_UnknownActiveThread event for the given core.
    void SamplingThread_RecordUnknownThread(WorkArea* ws, uint32_t core)
    {
        DUMP_CURRENT_LINE();
        const uint32_t eventId = PayloadEvents_UnknownActiveThread;
        SamplingThread_RecordTimedBasicEvent(ws, core, eventId);
    }



    // Records a timed PayloadEvents_ProfilerActiveThread event for the given core.
    void SamplingThread_RecordProfilerThread(WorkArea* ws, uint32_t core)
    {
        DUMP_CURRENT_LINE();
        const uint32_t eventId = PayloadEvents_ProfilerActiveThread;
        SamplingThread_RecordTimedBasicEvent(ws, core, eventId);
    }



    // Records one full sample of `thread` into the work area of `core`.
    // If the sample buffer cannot accept more data, profiling is stopped and
    // nothing is recorded.
    //   ws     - work area of the core being sampled
    //   core   - core number; selects the sample buffer and the record function list
    //   thread - id of the thread being sampled
    void SamplingThread_RecordSample(WorkArea* ws, uint32_t core, nn::os::ThreadId thread)
    {
        DUMP_CURRENT_LINE();

        if (!CheckAndExpandBuffersIfNeeded(SampleBufferIndex(core)))
        {
            StopProfilingSamplingThreads();
            return;
        }

        DUMP_CURRENT_LINE();

        ws->thread_to_profile = thread;
        SamplingThread_RunRecordFuncList(sSamplingThreads[core]->recordFuncs, ws);

        // When sampling is driven by a counter interrupt, reset it for the next sample.
        const bool interruptDriven = ws->fields.GetBit(WorkArea::SampleUsingPerfCounters);
        if (interruptDriven)
        {
            nn::profiler::pmu::ResetInterrupt(core);
        }
    }



    // Writes a basic event record (event id followed by the current time) into
    // the work area of `core`, optionally followed by performance counter data.
    // If the sample buffer cannot accept more data, profiling is stopped and
    // nothing is written.
    //   ws    - work area to write into
    //   core  - core number; selects the sample buffer
    //   event - event id stored at the start of the record
    NN_NOINLINE void SamplingThread_RecordTimedBasicEvent(WorkArea* ws, uint32_t core, uint32_t event)
    {
        if (!CheckAndExpandBuffersIfNeeded(SampleBufferIndex(core)))
        {
            StopProfilingSamplingThreads();
            return;
        }

        DUMP_CURRENT_LINE();

        // Reserve 12 bytes up front for the event id and the timestamp.
        uint8_t *cursor = FETCH_ADD(ws->curPtr, 12);
        cursor = WriteToBuffer(cursor, event);
        cursor = WriteToBuffer(cursor, GetCurrentTime());

        if (ws->fields.GetBit(WorkArea::IsUsingPerfCounters))
        {
            RecordPerfCounters(ws);
        }

        // When sampling is driven by a counter interrupt, reset it for the next sample.
        if (ws->fields.GetBit(WorkArea::SampleUsingPerfCounters))
        {
            nn::profiler::pmu::ResetInterrupt(core);
        }
    }



    // Resets every slot of a record function buffer to RecordDefaultFuncPtr.
    //   funcs - array of RecordFunctionsBufferSize entries
    void SamplingThread_FillRecordFuncBuffer(ProfilerRecordFuncPtr* funcs)
    {
        ProfilerRecordFuncPtr* const bufferEnd = funcs + RecordFunctionsBufferSize;
        for (ProfilerRecordFuncPtr* slot = funcs; slot != bufferEnd; ++slot)
        {
            *slot = &RecordDefaultFuncPtr;
        }
    }



    // Record function placed before the stack recorder: starts the RecordStack
    // timer of the work area's core and always reports success.
    nn::Result SamplingThread_RecordStackStart(WorkArea* ws)
    {
        NN_PROFILER_TIMING_BEGIN(&sSamplingThreads[ws->core_number]->timers[SamplingThreadTimer_RecordStack]);
        // Keeps the parameter referenced in case the timing macro does not use it.
        NN_UNUSED(ws);
        return nn::ResultSuccess();
    }



    // Record function placed after the stack recorder: stops the RecordStack
    // timer of the work area's core and always reports success.
    nn::Result SamplingThread_RecordStackStop(WorkArea* ws)
    {
        NN_PROFILER_TIMING_END(&sSamplingThreads[ws->core_number]->timers[SamplingThreadTimer_RecordStack]);
        // Keeps the parameter referenced in case the timing macro does not use it.
        NN_UNUSED(ws);
        return nn::ResultSuccess();
    }



    //
    // SetupRecordFuncList
    //  Builds the list of record functions that is run for every sample on one core.
    //  The longest list that can currently be produced has 6 entries (sample
    //  header, performance counters, program counter and the three stack entries).
    //  The buffer holds RecordFunctionsBufferSize (8) entries; if the list ever
    //  grows beyond that we have a problem.
    //
    //   ws              - work area whose settings select the functions; several
    //                     of its field bits are updated as a side effect
    //   threadDetails   - per-core state receiving the function list
    //   hasPerfCounters - out: true when performance counters are recorded with
    //                     each sample, false otherwise (always written)
    //
    void SamplingThread_SetupRecordFuncList(
        WorkArea* ws,
        SamplingThread* threadDetails,
        bool* hasPerfCounters)
    {
        int32_t idx = 0;
        SettingsFromThePcGui* settings = ws->settings;

        // Give the out-parameter a defined value on every path, including the
        // sample-by-perf-counter path below that never records counters.
        *hasPerfCounters = false;

        SamplingThread_FillRecordFuncBuffer(threadDetails->recordFuncs);

        INFO_LOG("\n\n____ Record Functions:\n");

        // All paths start by recording a common header
        INFO_LOG("\tRecord Sample Header\n");
        threadDetails->recordFuncs[idx++] = &RecordSampleHeader;

        // Record performance counters if needed
        if ((settings->flags & SettingsFromThePcGui::SampleByPerfCounter) == 0)
        {
            bool setupRecordPerfCounters = false;
            if (settings->perf_counter_cycle != 0)
            {
                ws->fields.SetBit(WorkArea::RecordPerformanceCounterCycles, true);
                setupRecordPerfCounters = true;
            }

            if (settings->perf_counters[0] != pmu::PerfCounter_Disabled)
            {
                ws->fields.SetBit(WorkArea::RecordPerformanceCounters, true);
                setupRecordPerfCounters = true;
            }

            *hasPerfCounters = setupRecordPerfCounters;
            if (setupRecordPerfCounters)
            {
                nn::profiler::pmu::SetCounters(settings->perf_counters);

                INFO_LOG("\tRecord Performance Counters\n");
                threadDetails->recordFuncs[idx++] = &RecordPerfCounters;
            }
        }

        // All paths record the current program counter
        INFO_LOG("\tRecord Program Counter\n");
        threadDetails->recordFuncs[idx++] = &RecordPC;
        ws->fields.SetBit(WorkArea::Is64Bit, TargetApplication::GetCurrent()->Is64Bit());

        // If requested, record the stack
        if ((settings->flags & SettingsFromThePcGui::UseSimple) == 0)
        {
            INFO_LOG("\tRecord Callstack\n");
            threadDetails->recordFuncs[idx++] = &SamplingThread_RecordStackStart;
            threadDetails->recordFuncs[idx++] = &RecordStack;
            threadDetails->recordFuncs[idx++] = &SamplingThread_RecordStackStop;
        }
        else if (settings->flags & SettingsFromThePcGui::RecordSimpleStackDepths)
        {
            INFO_LOG("\tRecord Stack Depth\n");
            threadDetails->recordFuncs[idx++] = &SamplingThread_RecordStackStart;
            threadDetails->recordFuncs[idx++] = &RecordStackDepth;
            threadDetails->recordFuncs[idx++] = &SamplingThread_RecordStackStop;
        }

        NN_SDK_ASSERT(static_cast<size_t>(idx) <= RecordFunctionsBufferSize);

        INFO_LOG("\n\n");
    }



    // Invokes the record functions in order until one reports failure or all
    // RecordFunctionsBufferSize slots have run.
    //   funcs - array of RecordFunctionsBufferSize record functions
    //   ws    - work area passed to each function
    // Returns the last function's result; a result included in
    // ResultSampleLoopComplete is converted to success.
    nn::Result SamplingThread_RunRecordFuncList(ProfilerRecordFuncPtr* funcs, WorkArea* ws)
    {
        nn::Result status = nn::ResultSuccess();

        NN_PROFILER_TIMING_BEGIN(&sSamplingThreads[ws->core_number]->timers[SamplingThreadTimer_RecordLoop]);

        size_t slot = 0;
        while (slot < RecordFunctionsBufferSize)
        {
            status = funcs[slot](ws);
            if (status.IsFailure())
            {
                break;
            }
            ++slot;
        }

        if (nn::profiler::ResultSampleLoopComplete::Includes(status))
        {
            status = nn::ResultSuccess();
        }

        NN_PROFILER_TIMING_END(&sSamplingThreads[ws->core_number]->timers[SamplingThreadTimer_RecordLoop]);

        return status;
    }



    void SetupCoreHeader(uint32_t core, SettingsFromThePcGui* settings)
    {
        // Now fill the header into the buffer
        Header* header = GetCoreHeader(core);
        header->Initialize();
        header->WriteControlValueOnly(HeaderSpecialValues_CoreHeaderBegin);

        // Section code for Dlls
        // Neb_Note: This is inserted here, but managed by the PC side.
        // ref: ProfilerInput.cs@832 in SaveProfileToFile()
        header->Write(HeaderSpecialValues_Dlls, static_cast<uint32_t>(0) /*false*/);

        header->Write(HeaderSpecialValues_CoreNumber, core);

        header->Write(HeaderSpecialValues_Flags, settings->flags);

        header->Write(
            HeaderSpecialValues_PerformanceCounterSlots,
            settings->perf_counters,
            pmu::PerformanceCounterCount,
            settings->perf_counter_cycle);

        header->Write(HeaderSpecialValues_BaseTime, GetCurrentTime());

        header->Write(
            HeaderSpecialValues_RequestedTimeBetweenSamples,
            settings->requested_time_between_samples_in_nanoseconds);

        header->WriteControlValueOnly(HeaderSpecialValues_HeaderEnd);
    }

} // anonymous


// Initializes the work areas and the shared sampler state, then creates and
// starts one sampler thread and one guard thread for every core selected by the
// target application's core mask.
//
// Returns ResultNotInitialized when the sample buffers have not been set up.
// Any allocation or thread creation failure is logged and aborts.
nn::Result InitializeSamplingThreads()
{
    nn::Result result = nn::ResultSuccess();
    uint32_t core = 0;

    InitializeWorkAreas();

    memset(&globals, 0, sizeof(Globals));

    if (!SampleBuffers::GetInstance()->IsInitialized())
    {
        return nn::profiler::ResultNotInitialized();
    }

    nn::os::InitializeEvent(
        &globals.startProfilingEvent,
        false,
        nn::os::EventClearMode_ManualClear);
    globals.stopSampling = false;

    nn::os::InitializeTimerEvent(&globals.sampleTimerEvent, nn::os::EventClearMode_ManualClear);
    nn::os::InitializeLightEvent(&globals.workBufferClosed, false, nn::os::EventClearMode_AutoClear);
    nn::os::InitializeLightEvent(&globals.sampleTaken, false, nn::os::EventClearMode_AutoClear);

    uint32_t coreMask = TargetApplication::GetCurrent()->GetCoreMask();
    int actualCoreCount = 0;

    for (core = 0; core < SupportedCoreCount; ++core)
    {
        if ((coreMask & (1u << core)) == 0)
        {
            sSamplingThreads[core] = nullptr;
            continue;
        }

        SamplingThread *st = Memory::GetInstance()->Allocate<SamplingThread>();
        if (st == nullptr)
        {
            ERROR_LOG("Could not allocate sampling thread state for core %u\n", core);
            NN_ABORT();
        }
        new (st) SamplingThread;

        sSamplingThreads[core] = st;

        // NOTE(review): this zeroes the object after it has been constructed;
        // kept as in the original so the initial state is unchanged.
        memset(st, 0, sizeof(*st));

        // Initialize every event before the threads exist so that no thread can
        // ever observe an uninitialized event.
        nn::os::InitializeEvent(&st->guardStartEvent, false, nn::os::EventClearMode_AutoClear);
        nn::os::InitializeEvent(&st->guardExitEvent, false, nn::os::EventClearMode_AutoClear);
        nn::os::InitializeEvent(&st->trigger, false, nn::os::EventClearMode_AutoClear);

        st->stackMemory = reinterpret_cast<uint8_t*>(
            Memory::GetInstance()->Allocate(SampleThreadStackSize, nn::os::GuardedStackAlignment));
        st->stackMemoryGuard = reinterpret_cast<uint8_t*>(
            Memory::GetInstance()->Allocate(SampleThreadStackSize, nn::os::GuardedStackAlignment));
        if (st->stackMemory == nullptr || st->stackMemoryGuard == nullptr)
        {
            ERROR_LOG("Could not allocate sampling thread stacks for core %u\n", core);
            NN_ABORT();
        }

        result = nn::os::CreateThread(
            &st->thread,
            &SamplingThread_ThreadFunc,
            st,
            st->stackMemory,
            SampleThreadStackSize,
            ThreadPriority_Sampler,
            static_cast<int32_t>(core));

        if (result.IsFailure()) { break; }

        result = nn::os::CreateThread(
            &st->threadGuard,
            &SamplingThread_ThreadGuardFunc,
            st,
            st->stackMemoryGuard,
            SampleThreadStackSize,
            ThreadPriority_SamplerGuard,
            static_cast<int32_t>(core));

        if (result.IsFailure()) { break; }

        ++actualCoreCount;

        {
            // Set the (different) thread names for each core's sampler thread
            char threadName[nn::os::ThreadNameLengthMax];
            snprintf(threadName, sizeof(threadName), "[profiler] Sampler %u", core);
            nn::os::SetThreadName(&st->thread, threadName);
        }

        {
            // Set the (different) thread names for each core's guard thread
            char threadName[nn::os::ThreadNameLengthMax];
            snprintf(threadName, sizeof(threadName), "[profiler] Guard %u", core);
            nn::os::SetThreadName(&st->threadGuard, threadName);
        }

        nn::os::StartThread(&st->thread);
        nn::os::StartThread(&st->threadGuard);
    }

    if (result.IsFailure())
    {
        ERROR_LOG("Error initializing sampling threads\n");
        DumpResultInformation(LOG_AS_ERROR, result);
        NN_ABORT();
    }

    // Every created sampler thread meets at this barrier at the end of a run.
    nn::os::InitializeBarrier(&globals.threadBarrier, actualCoreCount);

    return result;
}



// Stops profiling, asks every sampler and guard thread to exit, joins and
// destroys them, releases their per-core resources and finalizes the shared
// synchronization objects and the work areas.
//
// Always returns success.
nn::Result FinalizeSamplingThreads()
{
    nn::Result result = nn::ResultSuccess();

    StopProfilingSamplingThreads();

    uint32_t coreMask = TargetApplication::GetCurrent()->GetCoreMask();

    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        SamplingThread* st = sSamplingThreads[core];
        if ((coreMask & (1u << core)) == 0 || st == nullptr) { continue; }

        st->shouldExit = true;
        st->isActive = false;

        // The guard thread blocks on guardStartEvent between profiling runs.
        // Wake it with shouldGuardExit already set so that it skips its spin
        // loop, re-checks shouldExit and terminates; otherwise the WaitThread
        // below would never return.
        st->shouldGuardExit = true;
        nn::os::SignalEvent(&st->guardStartEvent);
    }

    // Kick the sampling threads so that they can exit
    nn::os::SignalEvent(&globals.startProfilingEvent);

    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        SamplingThread* st = sSamplingThreads[core];
        if ((coreMask & (1u << core)) == 0 || st == nullptr) { continue; }

        sSamplingThreads[core] = nullptr;

        nn::os::WaitThread(&st->thread);
        nn::os::WaitThread(&st->threadGuard);
        nn::os::DestroyThread(&st->thread);
        nn::os::DestroyThread(&st->threadGuard);

        nn::os::FinalizeEvent(&st->trigger);
        nn::os::FinalizeEvent(&st->guardStartEvent);
        nn::os::FinalizeEvent(&st->guardExitEvent);

        // Both stacks were allocated in InitializeSamplingThreads(); release both.
        Memory::GetInstance()->Free(st->stackMemory);
        Memory::GetInstance()->Free(st->stackMemoryGuard);
        Memory::GetInstance()->Free(st);
    }

    nn::os::FinalizeBarrier(&globals.threadBarrier);
    nn::os::FinalizeTimerEvent(&globals.sampleTimerEvent);
    nn::os::FinalizeLightEvent(&globals.sampleTaken);
    nn::os::FinalizeLightEvent(&globals.workBufferClosed);
    nn::os::FinalizeEvent(&globals.startProfilingEvent);

    FinalizeWorkAreas();

    return result;
}



bool AreSamplingThreadsInitialized()
{
    return (globals.threadBarrier._state != globals.threadBarrier.State_NotInitialized);
}



// Resets the instrumentation work area for a new session, applies the performance
// counter settings, enables recording and, for out-of-process profiling,
// queues an "instrumentation buffer start" IPC event.
//   settings - profiling settings; the pointer is stored in the work area
void PrepareInstrumentationBuffer(SettingsFromThePcGui *settings)
{
    InitializeCoreWorkArea(SampleBufferIndex_Instrumentation);

    WorkArea* instrumentationArea = GetWorkAreaForCore(SampleBufferIndex_Instrumentation);
    instrumentationArea->settings = settings;

    const bool countersEnabled = (settings->perf_counters[0] != pmu::PerfCounter_Disabled);
    instrumentationArea->fields.SetBit(WorkArea::IsUsingPerfCounters, countersEnabled);
    instrumentationArea->fields.SetBit(WorkArea::RecordPerformanceCounters, countersEnabled);

    // Set this last to ensure that we don't attempt to enable recording without all settings set
    instrumentationArea->record_cores = static_cast<uint8_t>(settings->coreMask) | (1 << SampleBufferIndex_Instrumentation);

    if (!settings->IsOutOfProcess())
    {
        return;
    }

    IpcEventInfo startNotice;
    startNotice.event = IpcEvent_InstrumentationBuffer;
    startNotice.info.instrumentationBuffer.type = InstrumentationBufferType_Start;

    GetIpcEventQueue()->Push(&startNotice);
}



void CloseInstrumentationBuffer()
{
    WorkArea* ws = GetWorkAreaForCore(SampleBufferIndex_Instrumentation);
    if (ws->record_cores & (1 << SampleBufferIndex_Instrumentation))
    {
        DUMP_CURRENT_LINE();
        // Try to only do this once, while it shouldn't cause issues, it may be slow
        ws->record_cores &= ~(1 << SampleBufferIndex_Instrumentation);
        FinalizeCoreWorkArea(SampleBufferIndex_Instrumentation);
    }
    if (ws->settings->IsOutOfProcess())
    {
        IpcEventInfo info;
        info.event = IpcEvent_InstrumentationBuffer;
        info.info.instrumentationBuffer.type = InstrumentationBufferType_Stop;

        auto queue = GetIpcEventQueue();
        queue->Push(&info);
    }
}



// Configures every core for a new profiling session, releases the sampling threads and
// requests the target application to start profiling.
//
// settings: stored in each core's work area; coreMask, flags and
//           requested_time_between_samples_in_nanoseconds are read here.
//
// Order of operations:
//   1. Per core: initialize the work area, mark the sampling thread active/inactive
//      according to coreMask, and pick the first active core that has a sampling thread
//      as the control thread.
//   2. Prepare the instrumentation buffer.
//   3. Re-create samplerThreadStartedBarrier for (active core count + 1) participants.
//   4. Reset the shared state in globals, signal startProfilingEvent and wait on the barrier.
//   5. Set the profiler status to ProfilerStatus_Profiling and call StartProfiling on the
//      target application.
void StartProfilingSamplingThreads(SettingsFromThePcGui *settings)
{
    bool haveControlThread = false;
    uint32_t coreCount = 0;
    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        // Every supported core gets a fresh work area, whether or not it is in the mask
        InitializeCoreWorkArea(SampleBufferIndex(core));
        WorkArea *ws = GetWorkAreaForCore(SampleBufferIndex(core));
        ws->settings = settings;

        bool coreIsActive = ((settings->coreMask & (1 << core)) != 0);

        auto st = sSamplingThreads[core];
        if (st != nullptr)
        {
            st->isActive = coreIsActive;
            st->isControlThread = false;
            // Only the first active core with a sampling thread becomes the control thread
            if (coreIsActive && !haveControlThread)
            {
                st->isControlThread = true;
                haveControlThread = true;
            }
        }
        // NOTE(review): a core enabled in the mask is counted even when it has no sampling
        // thread (st == nullptr). The count sizes the barrier below, so such a core would
        // presumably leave the barrier one participant short - confirm this cannot happen.
        if (coreIsActive) { ++coreCount; }
    }
    // coreCount is unsigned, so the lower bound check is always true; only the upper bound
    // is an effective check.
    NN_SDK_ASSERT(0 <= coreCount && coreCount <= SupportedCoreCount);
    NN_STATIC_ASSERT(SupportedCoreCount < INT32_MAX);

    PrepareInstrumentationBuffer(settings);

    // The barrier left over from a previous session is finalized here, just before it is
    // re-initialized with the new participant count.
    // TODO: Would prefer to do this later, but leads to an error as the thread count != 0
    if (globals.samplerThreadStartedBarrier._state == nn::os::BarrierType::State_Initialized)
    {
        nn::os::FinalizeBarrier(&globals.samplerThreadStartedBarrier);
    }
    // One participant per active core plus one more; presumably the extra one is this
    // thread, which awaits the barrier below - confirm against the sampling thread code.
    nn::os::InitializeBarrier(&globals.samplerThreadStartedBarrier, static_cast<int>(coreCount + 1));

    // Reset the state shared through globals before the start event is signaled
    globals.stopSampling = false;
    globals.sampleWait = 0;
    globals.coreCount = coreCount;
    //globals.allCoreSyncCount = coreCount;
    // Set only when the target reports more than one core and an SDK version of at least 3.0.0.0
    globals.useProfilerProcess =
        TargetApplication::GetCurrent()->GetCoreCount() > 1 &&
        TargetApplication::GetCurrent()->GetSdkVersion() >= NN_SDK_VERSION_NUMBER(3, 0, 0, 0);
    globals.isInstrumented = ((settings->flags & SettingsFromThePcGui::UseInstrumented) != 0);

    nn::os::ClearLightEvent(&globals.sampleTaken);
    nn::os::SignalEvent(&globals.startProfilingEvent);

    // Blocks until all barrier participants have arrived
    nn::os::AwaitBarrier(&globals.samplerThreadStartedBarrier);

    // TODO: Would prefer to do this here, but leads to an error as the thread count != 0
    //nn::os::FinalizeBarrier(&globals.samplerThreadStartedBarrier);

    INFO_LOG("Requesting Target Application to start sampling.\n");

    SetProfilerStatus(ProfilerStatus_Profiling);
    TargetApplication::GetCurrent()->StartProfiling(
        settings->requested_time_between_samples_in_nanoseconds);
}



void StopProfilingSamplingThreads()
{
    if (GetProfilerStatus() == ProfilerStatus_Active)
    {
        SetProfilerStatus(ProfilerStatus_Transferring);
    }

    CloseInstrumentationBuffer();

    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        SamplingTriggerStop(core);
    }

    globals.stopSampling = true;

    TargetApplication::GetCurrent()->StopProfiling();

    nn::os::SignalEvent(GetStopEvent());
}



void SamplingTriggerSample(uint32_t core, nn::os::ThreadId thread)
{
    DUMP_CURRENT_LINE();
    SamplingThread *coreSettings = sSamplingThreads[core];
    NN_SDK_ASSERT_NOT_NULL(coreSettings);
    if (coreSettings != nullptr)
    {
        NN_SDK_ASSERT(coreSettings->isActive, " on core %d", core);
        NN_SDK_ASSERT(coreSettings->isProfiling, " on core %d", core);

        coreSettings->threadToSample = thread;
        //coreSettings->triggerFlags |= TriggerFlags_TakeSample;

        ++globals.sampleWait;
        nn::os::SignalEvent(&coreSettings->trigger);
    }
}



void SamplingTriggerStop(uint32_t core)
{
    DUMP_CURRENT_LINE();
    SamplingThread *coreSettings = sSamplingThreads[core];
    if (coreSettings != nullptr)
    {
        coreSettings->triggerFlags |= TriggerFlags_StopProfiling;
        nn::os::SignalEvent(&coreSettings->trigger);
    }
}



// Blocks until globals.workBufferOpen reaches zero.
//
// Each wait on globals.workBufferClosed is limited to one millisecond, so the counter is
// re-checked at least that often even when the event is not signaled.
void WaitCoresClosed()
{
    NN_SDK_ASSERT(nn::os::GetThreadPriority(nn::os::GetCurrentThread()) >= ThreadPriority_Sampler);

    const auto pollInterval = nn::TimeSpan::FromMilliSeconds(1);
    for (;;)
    {
        if (globals.workBufferOpen == 0)
        {
            break;
        }
        nn::os::TimedWaitLightEvent(&globals.workBufferClosed, pollInterval);
    }
}



// Blocks the caller until globals.startProfilingEvent is signaled
// (StartProfilingSamplingThreads signals it).
void WaitProfilingStarted()
{
    auto* const startEvent = &globals.startProfilingEvent;
    nn::os::WaitEvent(startEvent);
}



// Blocks the caller until globals.sampleTaken is signaled.
//
// History: an earlier variant asserted that the caller's priority was at least
// ThreadPriority_Sampler and kept waiting while globals.sampleWait > 0 (with either a
// 10 microsecond timed wait or a plain wait per iteration). That variant is disabled.
// TODO: Are we softlocking due to a disconnect between the event here and the loop counter?
void WaitSamples()
{
    auto* const sampleTakenEvent = &globals.sampleTaken;
    nn::os::WaitLightEvent(sampleTakenEvent);
}



// Logs diagnostic state of the sampling threads: currently the value of
// globals.sampleWait, printed in hexadecimal.
void DumpSamplingThreadInfo()
{
    FORCE_LOG(" Sampling Thread\n");

    const auto sampleWaitValue = globals.sampleWait.load();
    FORCE_LOG("  Sample Wait: 0x%x\n", sampleWaitValue);
}



// Records a sample right away, choosing the record routine from the thread id.
//
// ws:     work area passed through to the record routine.
// core:   core index passed through to the record routine.
// thread: DebugEventSystemSleepThreadId is recorded as "no thread",
//         DebugEventOtherProcessThreadId as "unknown thread"; any other id is recorded
//         as a regular sample of that thread.
void SamplingThread_RecordImmediate(WorkArea* ws, uint32_t core, nn::os::ThreadId thread)
{
    if (thread == DebugEventSystemSleepThreadId)
    {
        SamplingThread_RecordNoThread(ws, core);
        return;
    }

    if (thread == DebugEventOtherProcessThreadId)
    {
        SamplingThread_RecordUnknownThread(ws, core);
        return;
    }

    SamplingThread_RecordSample(ws, core, thread);
}



} // profiler
} // nn
