﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <cstdio>
#include <atomic>
#include <new>

#include <nn/nn_Common.h>
#include <nn/nn_TimeSpan.h>
#include <nn/nn_Version.h>
#include <nn/os/os_Barrier.h>
#include <nn/os/os_Event.h>
#include <nn/os/os_SdkThread.h>
#include <nn/os/os_SdkThreadInfo.h>
#include <nn/os/os_SystemEvent.h>
#include <nn/os/os_Thread.h>
#include <nn/os/os_Tick.h>
#include <nn/os/os_TimerEvent.h>
#include <nn/util/util_BitPack.h>
#include <nn/profiler/profiler_Api.h>

#include "pmu/profiler_PerfCounterGroups.h"
#include "pmu/profiler_PerfCounterThread.h"
#include "profiler_CodeRewriting.h"
#include "profiler_Comms.h"
#include "profiler_DataStream.h"
#include "profiler_Defines.h"
#include "profiler_HeaderWriter.h"
#include "profiler_Hipc.h"
#include "profiler_LibPrivate.h"
#include "profiler_Logging.h"
#include "profiler_Memory.h"
#include "profiler_RecordMethods.h"
#include "profiler_ResultPrivate.h"
#include "profiler_SamplingThread.h"
#include "profiler_TargetApplication.h"
#include "profiler_Time.h"
#include "profiler_Timing.h"
#include "profiler_Workarea.h"
#include "profiler_WriteToBuffer.h"


// Selects the sampling method used by this file. With only this macro defined,
// SamplingThread_WaitForSampling() waits on the per-core trigger event with a timeout
// instead of using the external profiler process or a shared timer event.
#define NN_PROFILER_USE_INDIVIDUAL_SLEEP

// Compile-time check: at least one of the sampling-method macros must be defined.
#if !defined(NN_PROFILER_USE_EXTERNAL_PROCESS)
#if !defined(NN_PROFILER_USE_ONE_SHOT_TIMER)
#if !defined(NN_PROFILER_USE_PERIODIC_TIMER)
#if !defined(NN_PROFILER_USE_INDIVIDUAL_SLEEP)
#error "A sampling method must be defined!"
#endif
#endif
#endif
#endif


namespace nn { namespace profiler {


namespace /*anonymous*/
{
    // Size in bytes of the stack allocated for each sampler thread in InitializeSamplingThreads().
    const size_t SampleThreadStackSize = 16 * 1024;
    // Number of slots in SamplingThread::recordFuncs; every slot is always populated and executed.
    const size_t RecordFunctionsBufferSize = 8;

    // Indices into SamplingThread::timers (that array only exists when
    // NN_PROFILER_TIMING_ENABLE is defined).
    enum SamplingThreadTimer : int
    {
        // Ended on entry to SamplingThread_WaitForSampling() and restarted once the wait is over.
        SamplingThreadTimer_Total,
        // Wraps the GetLastThreadInfo() call and the handling of its result.
        SamplingThreadTimer_GetLastThreadInfo,
        // Wraps the wait performed in SamplingThread_WaitForSampling().
        SamplingThreadTimer_Sleep,
        // Wraps the trigger-flag dispatch in SamplingThread_ProfilingLoop().
        SamplingThreadTimer_Record,
        // Wraps the record-function loop in SamplingThread_RunRecordFuncList().
        SamplingThreadTimer_RecordLoop,

        // Number of timers; used to size and iterate SamplingThread::timers.
        SamplingThreadTimer_Count,
    };

    // Bit flags accumulated in SamplingThread::triggerFlags and consumed (exchanged with 0)
    // by SamplingThread_ProfilingLoop().
    enum TriggerFlags : uint32_t
    {
        // Record a sample of SamplingThread::threadToSample.
        TriggerFlags_TakeSample = (1u << 0),
        // GetLastThreadInfo() reported ResultNoThread; record a "no active thread" event.
        TriggerFlags_SampleNoThread = (1u << 1),
        // GetLastThreadInfo() reported ResultUnknownThread; record an "unknown thread" event.
        TriggerFlags_SampleUnknownThread = (1u << 2),
        // The last thread was the sampler thread itself; record a "profiler thread" event.
        TriggerFlags_SampleSamplerThread = (1u << 3),

        // Leave the profiling loop; checked before any of the sample flags.
        TriggerFlags_StopProfiling = (1u << 31),
    };

    // Per-core state of one sampler thread. Instances are allocated, zero-filled and
    // started in InitializeSamplingThreads() and released in FinalizeSamplingThreads().
    struct SamplingThread
    {
        uint8_t *stackMemory;       // Stack buffer handed to nn::os::CreateThread()
        nn::os::ThreadType thread;  // The sampler thread object itself

        volatile bool shouldExit;   // Set by FinalizeSamplingThreads() to end the thread function
        volatile bool isProfiling;  // True while SamplingThread_Active() is between setup and teardown
        volatile bool isActive;     // Whether this core is selected by the current settings' coreMask
        bool isControlThread;       // True for the first active core; it drives the shared timer event

        // Record functions run for every sample; filled by SamplingThread_SetupRecordFuncList().
        ProfilerRecordFuncPtr recordFuncs[RecordFunctionsBufferSize];

        std::atomic<uint32_t> triggerFlags; // Pending TriggerFlags bits
        nn::os::EventType trigger;          // Auto-clear event signaled when triggerFlags is updated externally

        nn::os::ThreadType* threadToSample; // Thread to record for the next TakeSample request
        nn::Bit32 threadFlags;              // Flags returned by the last successful GetLastThreadInfo()

#ifdef NN_PROFILER_TIMING_ENABLE
        // Self-timing of the sampler, indexed by SamplingThreadTimer.
        nn::profiler::Timer timers[SamplingThreadTimer_Count];
#endif
    };
    // One entry per supported core; nullptr for cores without a sampler thread.
    SamplingThread* sSamplingThreads[SupportedCoreCount];

    // State shared between all sampler threads and the functions that control them.
    // The single instance is zero-filled in InitializeSamplingThreads().
    struct Globals
    {
        // Manual-clear event the sampler threads wait on; signaled to start a run or to let them exit.
        nn::os::EventType startProfilingEvent;
        // Timer event used by the one-shot / periodic timer sampling methods.
        nn::os::TimerEventType sampleTimerEvent;
        // Awaited by every sampler thread at the end of each pass of its thread function.
        nn::os::BarrierType threadBarrier;
        // Re-created for each run in StartProfilingSamplingThreads(); releases the caller
        // once the active sampler threads have finished their setup.
        nn::os::BarrierType samplerThreadStartedBarrier;
        // Incremented by SamplingTriggerSample(), decremented after the sample is recorded.
        std::atomic<int32_t> sampleWait;
        // Bit mask with one bit per core whose work buffer is currently open.
        std::atomic<uint32_t> workBufferOpen;
        // Arrival counter used by SynchronizeCores().
        std::atomic<uint32_t> allCoreSyncCount;
        // Number of active cores counted by StartProfilingSamplingThreads() for the current run.
        std::atomic<uint32_t> coreCount;
        // True when the settings request instrumented profiling (UseInstrumented flag).
        bool isInstrumented;
        // True when the target has more than one core and was built with SDK 3.0.0.0 or later.
        bool useProfilerProcess;
        // Set by StopProfilingSamplingThreads(); breaks the spin/wait loops of the samplers.
        volatile bool stopSampling;
    };
    Globals globals;

    // Spin barrier across the target application's cores, built on globals.allCoreSyncCount.
    //
    // Each caller registers its arrival by incrementing the counter, spins until every core
    // has arrived, and then leaves one at a time; the last core to leave resets the counter
    // so that the next synchronization can begin. Both spin loops are abandoned as soon as
    // globals.stopSampling becomes true.
    void SynchronizeCores() NN_NOEXCEPT
    {
        const uint32_t coreCount = static_cast<uint32_t>(TargetApplication::GetCoreCount());

        // Make sure the previous synchronization has finished.
        // (globals.allCoreSyncCount is an atomic variable.)
        while (!(globals.allCoreSyncCount < coreCount) && !globals.stopSampling) { }

        // Increment the count, and ...
        const uint32_t count = globals.allCoreSyncCount++;

        // ... wait until it equals our previous count plus the number of cores.
        while (globals.allCoreSyncCount != count + coreCount && !globals.stopSampling) { }

        if (globals.allCoreSyncCount == coreCount * 2 - 1)
        {
            // We are the last core to leave: reset the count to zero to end this
            // synchronization, which allows the next synchronization to run.
            globals.allCoreSyncCount = 0;
        }
        else
        {
            // We are not the last core to leave: increment the count so that the
            // next waiting core is released.
            ++globals.allCoreSyncCount;
        }
    }

} // anonymous



// Forward declarations for the sampler-thread helpers defined later in this file.

// Thread entry point and the per-run sampling loop.
static void SamplingThread_ThreadFunc(void* arg);
static void SamplingThread_ProfilingLoop();
static void SamplingThread_WaitForSampling(WorkArea* ws, SamplingThread* coreSettings);

// Recorders for samples that carry no thread context.
void SamplingThread_RecordNoThread(WorkArea* ws, uint32_t core);
void SamplingThread_RecordUnknownThread(WorkArea* ws, uint32_t core);
void SamplingThread_RecordProfilerThread(WorkArea* ws, uint32_t core);
static void SamplingThread_RecordTimedBasicEvent(WorkArea* ws, uint32_t core, uint32_t event);

// Construction and execution of the per-core list of record functions.
static void SamplingThread_FillRecordFuncBuffer(ProfilerRecordFuncPtr* funcs);
static void SamplingThread_SetupRecordFuncList(
    WorkArea* ws,
    SamplingThread* threadDetails,
    bool* hasPerfCounters);
static nn::Result SamplingThread_RunRecordFuncList(ProfilerRecordFuncPtr* funcs, WorkArea* ws);

// Writes the per-core header at the start of a core's sample buffer.
static void SetupCoreHeader(uint32_t core, SettingsFromThePcGui* settings);


// Initializes the work areas and the shared sampler state, then creates and starts one
// sampler thread for every core present in the target application's core mask.
//
// Cores that are not in the mask get a nullptr entry in sSamplingThreads. A failure to
// create a thread is logged and then aborts via NN_ABORT(), so in practice this function
// only returns on success.
//
// Returns the result of the last nn::os::CreateThread() call (success if no thread was needed).
nn::Result InitializeSamplingThreads()
{
    nn::Result result = nn::ResultSuccess();

    InitializeWorkAreas();

    memset(&globals, 0, sizeof(Globals));

    nn::os::InitializeEvent(
        &globals.startProfilingEvent,
        false,
        nn::os::EventClearMode_ManualClear);
    globals.stopSampling = false;

    nn::os::InitializeTimerEvent(&globals.sampleTimerEvent, nn::os::EventClearMode_ManualClear);

    const uint32_t coreMask = TargetApplication::GetCoreMask();
    int actualCoreCount = 0;

    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        // Unsigned shift so the mask test never involves a signed intermediate.
        if ((coreMask & (1u << core)) == 0)
        {
            sSamplingThreads[core] = nullptr;
            continue;
        }

        SamplingThread *st = Memory::GetInstance()->Allocate<SamplingThread>();
        new (st) SamplingThread;

        sSamplingThreads[core] = st;

        // NOTE(review): zero-filling after construction relies on every member of
        // SamplingThread tolerating an all-zero representation - confirm for the nn::os types.
        memset(st, 0, sizeof(*st));

        st->stackMemory = reinterpret_cast<uint8_t*>(
            Memory::GetInstance()->Allocate(SampleThreadStackSize, nn::os::GuardedStackAlignment));

        result = nn::os::CreateThread(
            &st->thread,
            &SamplingThread_ThreadFunc,
            st,
            st->stackMemory,
            SampleThreadStackSize,
            TargetApplication::GetMaximumThreadPriority(),
            static_cast<int32_t>(core));

        if (result.IsFailure()) { break; }

        ++actualCoreCount;

        {
            // Set the (different) thread names for each core's sampler thread
            char threadName[nn::os::ThreadNameLengthMax];
            // core is unsigned, so it must be formatted with %u rather than %d.
            snprintf(threadName, sizeof(threadName), "NX CPU Profiler: Sampler %u", core);
            nn::os::SetThreadName(&st->thread, threadName);
        }

        // The trigger event must be fully initialized before the thread that uses it can run.
        nn::os::InitializeEvent(&st->trigger, false, nn::os::EventClearMode_AutoClear);

        nn::os::StartThread(&st->thread);
    }

    if (result.IsFailure())
    {
        ERROR_LOG("Error initializing sampling threads\n");
        DumpResultInformation(LOG_AS_ERROR, result);
        NN_ABORT();
    }

    // Every started sampler thread awaits this barrier at the end of each pass of its
    // thread function, so the participant count is the number of threads created above.
    nn::os::InitializeBarrier(&globals.threadBarrier, actualCoreCount);

    return result;
}



// Stops any profiling in progress, asks every sampler thread to exit, joins and destroys
// the threads, and releases the resources created by InitializeSamplingThreads().
//
// Entries of sSamplingThreads that are nullptr are skipped, matching the null checks in
// StartProfilingSamplingThreads() and SamplingTriggerStop().
//
// Always returns nn::ResultSuccess().
nn::Result FinalizeSamplingThreads()
{
    nn::Result result = nn::ResultSuccess();

    SendBasicIpcMessage(ProfilerIpcMessage_StopProfiling, 0);
    StopProfilingSamplingThreads();

    const uint32_t coreMask = TargetApplication::GetCoreMask();

    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        if ((coreMask & (1u << core)) == 0) { continue; }

        SamplingThread* st = sSamplingThreads[core];
        if (st == nullptr) { continue; }

        st->shouldExit = true;
        st->isActive = false;
    }

    // Kick the sampling threads so that they can exit
    nn::os::SignalEvent(&globals.startProfilingEvent);

    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        if ((coreMask & (1u << core)) == 0) { continue; }

        SamplingThread* st = sSamplingThreads[core];
        if (st == nullptr) { continue; }

        // Clear the slot first so that the trigger functions no longer reach this thread.
        sSamplingThreads[core] = nullptr;

        nn::os::WaitThread(&st->thread);
        nn::os::DestroyThread(&st->thread);

        nn::os::FinalizeEvent(&st->trigger);

        Memory::GetInstance()->Free(st->stackMemory);
        Memory::GetInstance()->Free(st);
    }

    nn::os::FinalizeBarrier(&globals.threadBarrier);
    nn::os::FinalizeTimerEvent(&globals.sampleTimerEvent);
    nn::os::FinalizeEvent(&globals.startProfilingEvent);

    FinalizeWorkAreas();

    return result;
}



// Prepares the work area that backs the instrumentation buffer for a new run.
//
//  settings - profiling settings for the run; stored in the work area
void PrepareInstrumentationBuffer(SettingsFromThePcGui *settings)
{
    DUMP_CURRENT_LINE();

    InitializeCoreWorkArea(SampleBufferIndex_Instrumentation);
    WorkArea* const instrumentationArea = GetWorkAreaForCore(SampleBufferIndex_Instrumentation);
    instrumentationArea->settings = settings;

    // Performance counters count as enabled when the first counter slot is in use.
    const bool countersEnabled = (settings->perf_counters[0] != pmu::PerfCounter_Disabled);
    instrumentationArea->fields.SetBit(WorkArea::IsUsingPerfCounters, countersEnabled);
    instrumentationArea->fields.SetBit(WorkArea::RecordPerformanceCounters, countersEnabled);

    // record_cores is written last: it is what enables recording, so every other
    // setting has to be in place before it becomes non-zero.
    const auto instrumentationBit = (1 << SampleBufferIndex_Instrumentation);
    instrumentationArea->record_cores = settings->coreMask | instrumentationBit;
}



void CloseInstrumentationBuffer()
{
    DUMP_CURRENT_LINE();
    WorkArea* ws = GetWorkAreaForCore(SampleBufferIndex_Instrumentation);
    if (ws->record_cores & (1 << SampleBufferIndex_Instrumentation))
    {
        DUMP_CURRENT_LINE();
        // Try to only do this once, while it shouldn't cause issues, it may be slow
        ws->record_cores &= ~(1 << SampleBufferIndex_Instrumentation);
        FinalizeCoreWorkArea(SampleBufferIndex_Instrumentation);
    }
}



// Starts a profiling run on the cores selected by settings->coreMask.
//
// Resets every core's work area, marks the sampler threads active or inactive, picks the
// first active sampler as the control thread, prepares the instrumentation buffer and the
// shared state, signals the start event, and blocks until every active sampler thread has
// finished its setup.
//
// Only cores that actually have a sampler thread are counted as participants. Counting a
// requested core without a thread would make the start barrier wait for a participant
// that can never arrive.
//
//  settings - profiling settings received for this run
void StartProfilingSamplingThreads(SettingsFromThePcGui *settings)
{
    bool haveControlThread = false;
    uint32_t coreCount = 0;
    for (uint32_t core = 0; core < SupportedCoreCount; ++core)
    {
        const SampleBufferIndex index = static_cast<SampleBufferIndex>(core);
        InitializeCoreWorkArea(index);
        WorkArea *ws = GetWorkAreaForCore(index);
        ws->settings = settings;

        const bool coreIsActive((settings->coreMask & (1 << core)) != 0);

        auto st = sSamplingThreads[core];
        if (st == nullptr)
        {
            // No sampler thread exists for this core, so it cannot take part in the run.
            continue;
        }

        st->isActive = coreIsActive;
        st->isControlThread = false;
        if (coreIsActive)
        {
            if (!haveControlThread)
            {
                st->isControlThread = true;
                haveControlThread = true;
            }
            ++coreCount;
        }
    }
    // coreCount is unsigned, so only the upper bound needs checking before the int cast below.
    NN_SDK_ASSERT(coreCount < INT32_MAX);

    PrepareInstrumentationBuffer(settings);

    // TODO: Would prefer to do this later, but leads to an error as the thread count != 0
    if (globals.samplerThreadStartedBarrier._state == nn::os::BarrierType::State_Initialized)
    {
        nn::os::FinalizeBarrier(&globals.samplerThreadStartedBarrier);
    }
    // Participants: every active sampler thread plus the caller of this function.
    nn::os::InitializeBarrier(&globals.samplerThreadStartedBarrier, static_cast<int>(coreCount + 1));

    globals.stopSampling = false;
    globals.sampleWait = 0;
    globals.coreCount = coreCount;
    globals.allCoreSyncCount = 0;
    globals.useProfilerProcess =
        TargetApplication::GetCoreCount() > 1 &&
        TargetApplication::GetCurrent()->GetSdkVersion() >= NN_SDK_VERSION_NUMBER(3, 0, 0, 0);
    globals.isInstrumented = ((settings->flags & SettingsFromThePcGui::UseInstrumented) != 0);

#if defined(NN_PROFILER_USE_EXTERNAL_PROCESS)
    if (globals.useProfilerProcess)
    {
        StartSignalingProfilerEvent(
            settings->requested_time_between_samples_in_nanoseconds,
            static_cast<int>(globals.coreCount));
    }
#endif

    // Wake the sampler threads ...
    nn::os::SignalEvent(&globals.startProfilingEvent);

    // ... and wait until all active ones are ready to sample.
    nn::os::AwaitBarrier(&globals.samplerThreadStartedBarrier);

    // TODO: Would prefer to do this here, but leads to an error as the thread count != 0
    //nn::os::FinalizeBarrier(&globals.samplerThreadStartedBarrier);
}



// Ends the current profiling run: closes the instrumentation buffer, posts a stop request
// to every core's sampler thread, and raises the global stop flag.
void StopProfilingSamplingThreads()
{
    CloseInstrumentationBuffer();

    // SamplingTriggerStop() ignores cores that have no sampler thread.
    uint32_t core = 0;
    while (core < SupportedCoreCount)
    {
        SamplingTriggerStop(core);
        ++core;
    }

    globals.stopSampling = true;

    // Disabled in the original implementation:
    //nn::os::SignalEvent(GetStopEvent(false));

#if defined(NN_PROFILER_USE_EXTERNAL_PROCESS)
    if (globals.useProfilerProcess)
    {
        StopSignalingProfilerEvent();
    }
#endif
}



void SamplingTriggerSample(uint32_t core, nn::os::ThreadType* thread)
{
    DUMP_CURRENT_LINE();
    SamplingThread *coreSettings = sSamplingThreads[core];
    if (coreSettings != nullptr)
    {
        coreSettings->threadToSample = thread;
        coreSettings->triggerFlags |= TriggerFlags_TakeSample;

        ++globals.sampleWait;
        nn::os::SignalEvent(&coreSettings->trigger);
    }
}



void SamplingTriggerStop(uint32_t core)
{
    DUMP_CURRENT_LINE();
    SamplingThread *coreSettings = sSamplingThreads[core];
    if (coreSettings != nullptr)
    {
        coreSettings->triggerFlags |= TriggerFlags_StopProfiling;
        nn::os::SignalEvent(&coreSettings->trigger);
    }
}



// Blocks, yielding the CPU between checks, until no core has its work buffer open.
void WaitCoresClosed()
{
    for (;;)
    {
        if (globals.workBufferOpen.load() == 0)
        {
            break;
        }
        nn::os::YieldThread();
    }
}



void WaitProfilingStarted()
{
    nn::os::WaitEvent(&globals.startProfilingEvent);
}



namespace /*anonymous*/ {


// Forward declarations: SamplingThread_Active() falls back to SamplingThread_Inactive()
// when it cannot obtain a sample buffer, so both must be visible before their definitions.
void SamplingThread_Active(uint32_t core, WorkArea* ws, SamplingThread* coreSettings);
void SamplingThread_Inactive();


// Runs one complete profiling session on an active core: obtains the sample buffer,
// configures the record functions and the sampling mode, writes the core header, runs
// the profiling loop until it is stopped, and finally closes the buffer.
//
//  core         - index of the core this sampler thread runs on
//  ws           - work area belonging to that core
//  coreSettings - SamplingThread state belonging to that core
void SamplingThread_Active(uint32_t core, WorkArea* ws, SamplingThread* coreSettings)
{
    bool success = false;
    bool hasPerfCounters = false;

    // Mark this core's buffer as open; the bit is cleared again in SamplingThread_ThreadFunc().
    globals.workBufferOpen |= (1 << core);

    success = ObtainAndSetupSampleBuffer(SampleBufferIndex(core));
    if (!success)
    {
        ERROR_LOG("Could not obtain memory for core %d\n", core);
        coreSettings->isActive = false;
        // Still take part in the start barrier so that StartProfilingSamplingThreads() is released,
        // then behave like an inactive core for the rest of the run.
        nn::os::AwaitBarrier(&globals.samplerThreadStartedBarrier);
        SamplingThread_Inactive();
        return;
    }

    // Drop any trigger left over from before this run.
    nn::os::ClearEvent(&coreSettings->trigger);
    SamplingThread_SetupRecordFuncList(ws, coreSettings, &hasPerfCounters);

    {
        SettingsFromThePcGui *settings = ws->settings;
        uint32_t flags = settings->flags;
        if ((flags & SettingsFromThePcGui::SampleByPerfCounter))
        {
            // Sampling by performance counter only takes effect when the first counter slot is in use.
            if (settings->perf_counters[0] != pmu::PerfCounter_Disabled)
            {
                DEBUG_LOG("Setting up sample by perf on core %d\n", core);
                ws->fields.SetBit(WorkArea::SampleUsingPerfCounters, true);
                ws->fields.SetBit(WorkArea::UseInfiniteTime, true);
                // In this mode the "time between samples" setting is passed to the PMU interrupt setup.
                pmu::SetInterrupt(
                    static_cast<pmu::PerfCounter>(settings->perf_counters[0]),
                    settings->requested_time_between_samples_in_nanoseconds,
                    core);
            }
        }

        if (flags & SettingsFromThePcGui::UseInstrumented)
        {
            ws->fields.SetBit(WorkArea::UseInfiniteTime, true);
        }

        DUMP_CURRENT_LINE();

        ws->fields.SetBit(WorkArea::IsUsingPerfCounters, hasPerfCounters);

        if (!ws->fields.GetBit(WorkArea::SampleUsingPerfCounters))
        {
            // If we are sampling by performance counter the counter will fill in this field
            // to indicate that the core should be recorded after breaking.
            // This allows us to only record cores that have overflowed.
            ws->record_cores = static_cast<uint8_t>(settings->coreMask);
        }
    }

    DUMP_CURRENT_LINE();
    SetupCoreHeader(core, ws->settings);

    // Setup is complete: report readiness and rendezvous with StartProfilingSamplingThreads().
    coreSettings->isProfiling = true;
    nn::os::AwaitBarrier(&globals.samplerThreadStartedBarrier);

    DUMP_CURRENT_LINE();

    // Set up timers
    for (int timerIdx = 0; timerIdx < SamplingThreadTimer_Count; ++timerIdx)
    {
        NN_PROFILER_TIMING_CLEAR(&coreSettings->timers[timerIdx]);
    }

#if defined(NN_PROFILER_USE_PERIODIC_TIMER)
    // Only the control thread starts the shared periodic timer.
    if (coreSettings->isControlThread)
    {
        nn::TimeSpan interval = nn::TimeSpan::FromNanoSeconds(ws->settings->requested_time_between_samples_in_nanoseconds);
        nn::os::StartPeriodicTimerEvent(&globals.sampleTimerEvent, interval, interval);
    }
#endif

    DUMP_CURRENT_LINE();
    // Returns once a stop request has been received.
    SamplingThread_ProfilingLoop();
    DUMP_CURRENT_LINE();

#if defined(NN_PROFILER_USE_PERIODIC_TIMER)
    if (coreSettings->isControlThread)
    {
        nn::os::StopTimerEvent(&globals.sampleTimerEvent);
    }
#endif

#ifdef NN_PROFILER_TIMING_ENABLE
    // Report the average of each self-timing timer for this core.
    for (int timerIdx = 0; timerIdx < SamplingThreadTimer_Count; ++timerIdx)
    {
        nn::TimeSpan ts = nn::os::ConvertToTimeSpan(nn::os::Tick((int64_t)coreSettings->timers[timerIdx].GetAverageTime()));
        FORCE_LOG("Timer %d on core %d: %lluns\n", timerIdx, core, ts.GetNanoSeconds());
    }
#endif

    DEBUG_LOG("Finished profiling on core %d\n", core);
    coreSettings->isProfiling = false;
    coreSettings->isActive = false;

    // Stop performance counters
    if (ws->fields.GetBit(WorkArea::SampleUsingPerfCounters))
    {
        pmu::StopCounters();
        // Unbind from the interrupt overflow now that profiling has ceased
        pmu::SetInterrupt(pmu::PerfCounter_Disabled, 0, core);
    }

    // Close the buffer
    {
        // Reserve 16 bytes for the closing record: base event id, current time, end marker.
        uint8_t *writePtr = ws->curPtr.fetch_add(16);
        writePtr = WriteToBuffer(writePtr, static_cast<uint32_t>(PayloadEvents_Base));
        writePtr = WriteToBuffer(writePtr, GetCurrentTime());
        writePtr = WriteToBuffer(writePtr, static_cast<uint32_t>(PayloadGenericEvents_End));

        FinalizeCoreWorkArea(SampleBufferIndex(core));
    }
}



// Keeps a core that is not being profiled in step with the sampling cores until
// globals.stopSampling is set. Which wait is used between synchronization points depends
// on the sampling-method macros this file was built with.
void SamplingThread_Inactive()
{
    // We need to lock the core to prevent thread migration from occurring
    // Otherwise, threads may escape onto the core where no profiling is occurring
    while (!globals.stopSampling)
    {
        SynchronizeCores();
#if defined(NN_PROFILER_USE_ONE_SHOT_TIMER) || defined(NN_PROFILER_USE_PERIODIC_TIMER)
        nn::os::ClearTimerEvent(&globals.sampleTimerEvent);
#endif
        // SynchronizeCores() also returns when sampling is stopped, so re-check before blocking.
        if (globals.stopSampling) { return; }
#if defined(NN_PROFILER_USE_EXTERNAL_PROCESS)
        nn::os::WaitSystemEvent(&g_ProfilerProcessEvent);
        nn::os::ClearSystemEvent(&g_ProfilerProcessEvent);
#else
        // NOTE(review): when only NN_PROFILER_USE_INDIVIDUAL_SLEEP is defined, nothing visible
        // in this file starts sampleTimerEvent - confirm this wait can complete in that build.
        nn::os::WaitTimerEvent(&globals.sampleTimerEvent);
        SynchronizeCores();
#endif
    }
}


} // anonymous




static void SamplingThread_ThreadFunc(void *arg)
{
    uint32_t core = static_cast<uint32_t>(nn::os::GetCurrentCoreNumber());

    SamplingThread* coreSettings = reinterpret_cast<SamplingThread*>(arg);
    WorkArea* ws = GetWorkAreaForCore(SampleBufferIndex(core));

    coreSettings->isProfiling = false;

    DumpThreadInformation();

    while (!coreSettings->shouldExit)
    {
        nn::os::WaitEvent(&globals.startProfilingEvent);

        if (coreSettings->shouldExit) { break; } // Exit!

        if (coreSettings->isActive)
        {
            SamplingThread_Active(core, ws, coreSettings);
        }
#if !defined(NN_PROFILER_USE_INDIVIDUAL_SLEEP)
        else if (globals.useProfilerProcess)
        {
            SamplingThread_Inactive();
        }
#endif

        globals.workBufferOpen &= ~(1 << core);

        nn::os::AwaitBarrier(&globals.threadBarrier);
        nn::os::ClearEvent(&globals.startProfilingEvent);
    }
}



// Main sampling loop of an active sampler thread.
//
// Repeatedly waits for the next sampling point, atomically takes the pending trigger
// flags, and records whatever those flags request. The loop ends when
// TriggerFlags_StopProfiling is seen; a stop request takes precedence over any sample
// flags that arrived together with it.
static NN_NOINLINE void SamplingThread_ProfilingLoop()
{
    uint32_t core = static_cast<uint32_t>(nn::os::GetCurrentCoreNumber());
    DUMP_CURRENT_LINE();

    SamplingThread* coreSettings = sSamplingThreads[core];
    WorkArea* ws = GetWorkAreaForCore(SampleBufferIndex(core));

    // Start the run without any stale requests.
    coreSettings->triggerFlags = 0;

    bool shouldExit = false;

    // Force the sampling thread into the list of threads
    TargetApplication::GetCurrent()->RegisterThread(nn::os::GetCurrentThread());

    if (ws->fields.GetBit(WorkArea::IsUsingPerfCounters))
    {
        // Initial read of the counters; the values themselves are discarded.
        pmu::PerformanceCounters data;
        nn::profiler::pmu::ReadCounters(data, core, false);
    }

    while (!shouldExit)
    {
        DUMP_CURRENT_LINE();
        SamplingThread_WaitForSampling(ws, coreSettings);

        // Take ownership of all pending requests in one atomic step.
        uint32_t flags = coreSettings->triggerFlags.exchange(0);
        if (flags == 0)
        {
            ERROR_LOG("Received trigger even though no flags were set\n");
            continue;
        }

        VERBOSE_LOG("Sample requested on core %d, triggerFlags = %08x\n", core, flags);

        // Stop should always be handled first
        if (flags & TriggerFlags_StopProfiling)
        {
            shouldExit = true;
            continue;
        }

        NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_Record]);
        if (flags & TriggerFlags_TakeSample)
        {
            SamplingThread_RecordSample(ws, core, coreSettings->threadToSample);
            --globals.sampleWait;
        }

        if (flags & TriggerFlags_SampleNoThread)
        {
            SamplingThread_RecordNoThread(ws, core);
        }

        if (flags & TriggerFlags_SampleUnknownThread)
        {
            SamplingThread_RecordUnknownThread(ws, core);
        }

        if (flags & TriggerFlags_SampleSamplerThread)
        {
            SamplingThread_RecordProfilerThread(ws, core);
        }
        NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_Record]);
    }

    // Set to the negative of the tracked core count to try to ensure that the Wait cannot lock indefinitely
    // using INT_MIN could cause an underflow
    globals.sampleWait = -static_cast<int32_t>(SupportedCoreCount);
}



// Blocks until the next sampling point and then decides what should be recorded.
//
// How the wait is performed depends on the run mode and the build configuration:
//  - instrumented runs only wait on the per-core trigger event and return immediately;
//  - runs using the profiler process wait on the external event, the shared timer event,
//    or the per-core trigger with a timeout, depending on the sampling-method macros;
//  - all other runs wait on the per-core trigger with the requested sampling interval
//    as timeout.
//
// After the wait (except for the early-return paths) the last thread that ran on this
// core is queried, its context is copied into the work area, and the matching trigger
// flag is set for SamplingThread_ProfilingLoop() to act on.
//
//  ws           - work area of the current core
//  coreSettings - SamplingThread state of the current core
static NN_NOINLINE void SamplingThread_WaitForSampling(WorkArea* ws, SamplingThread* coreSettings)
{
    DUMP_CURRENT_LINE();
    NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_Total]);
    NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_Sleep]);

    // todo: SampleByPerf: nn::os::WaitEvent(&coreSettings->trigger);

    if (globals.isInstrumented)
    {
        // Instrumented runs are driven purely by external triggers.
        nn::os::WaitEvent(&coreSettings->trigger);
        return;
    }
    else if (globals.useProfilerProcess)
    {
#if defined(NN_PROFILER_USE_EXTERNAL_PROCESS)
        SynchronizeCores();
        if (globals.stopSampling) { return; }
        nn::os::WaitSystemEvent(&g_ProfilerProcessEvent);
        nn::os::ClearSystemEvent(&g_ProfilerProcessEvent);

#elif defined(NN_PROFILER_USE_ONE_SHOT_TIMER) || defined(NN_PROFILER_USE_PERIODIC_TIMER)
        SynchronizeCores();
        nn::os::ClearTimerEvent(&globals.sampleTimerEvent);
        if (globals.stopSampling) { return; }
#if defined (NN_PROFILER_USE_ONE_SHOT_TIMER)
        // Only the control thread re-arms the shared one-shot timer.
        if (coreSettings->isControlThread)
        {
            nn::os::StartOneShotTimerEvent(
                &globals.sampleTimerEvent,
                nn::TimeSpan::FromNanoSeconds(ws->settings->requested_time_between_samples_in_nanoseconds));
        }
#endif
        nn::os::WaitTimerEvent(&globals.sampleTimerEvent);
        SynchronizeCores();

#elif defined(NN_PROFILER_USE_INDIVIDUAL_SLEEP)
        nn::os::TimedWaitEvent(
            &coreSettings->trigger,
            nn::TimeSpan::FromNanoSeconds(ws->settings->requested_time_between_samples_in_nanoseconds));
#endif
    }
    else
    {
        nn::os::TimedWaitEvent(
            &coreSettings->trigger,
            nn::TimeSpan::FromNanoSeconds(ws->settings->requested_time_between_samples_in_nanoseconds));
    }

    NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_Sleep]);
    NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_Total]);

    // The outputs are initialized because `flag` is inspected below before the result is
    // checked; without this a failing query could leave it indeterminate.
    nn::os::SdkLastThreadContext context;
    nn::os::ThreadType*          thread = nullptr;
    nn::Bit32                    flag = 0;

    DUMP_CURRENT_LINE();
    NN_PROFILER_TIMING_BEGIN(&coreSettings->timers[SamplingThreadTimer_GetLastThreadInfo]);
    nn::Result result = nn::os::GetLastThreadInfo(&thread, &context, &flag);
    ws->fields.SetBit(WorkArea::ThreadInSystemCall, (flag & nn::os::LastThreadInfoFlag_ThreadInSystemCall) != 0);
    if (result.IsSuccess())
    {
        DUMP_CURRENT_LINE();

        coreSettings->threadToSample = thread;
        coreSettings->threadFlags = flag;

        // Copy each saved element in the context individually.
        // This keeps use independent of the context layouts.
        ws->context.fp = context.fp;
        ws->context.lr = context.lr;
        ws->context.sp = context.sp;
        ws->context.pc = context.pc;

        if (thread == nn::os::GetCurrentThread())
        {
            // If we hit our own thread, we want to ensure that the sample location
            // makes sense. During testing we have seen the last thread come through
            // in a location where there was a much deeper callstack than where we currently
            // are. This led to crashing as we attempted to walk the callstack.
            // So for now, we disable callstack walking by setting the FP to 0.
            ws->context.fp = 0;
            coreSettings->triggerFlags |= TriggerFlags_SampleSamplerThread;
        }
        else
        {
            coreSettings->triggerFlags |= TriggerFlags_TakeSample;
        }
    }
    else
    {
        DUMP_CURRENT_LINE();
        if (result <= nn::os::ResultNoThread())
        {
            coreSettings->triggerFlags |= TriggerFlags_SampleNoThread;
            coreSettings->threadToSample = nullptr;
        }
        else if (result <= nn::os::ResultUnknownThread())
        {
            coreSettings->triggerFlags |= TriggerFlags_SampleUnknownThread;
            coreSettings->threadToSample = nullptr;
        }
        else
        {
            ERROR_LOG("Received an unexpected result from GetLastThreadInfo()\n");
            DumpResultInformation(LOG_AS_ERROR, result);
        }
    }
    NN_PROFILER_TIMING_END(&coreSettings->timers[SamplingThreadTimer_GetLastThreadInfo]);
}



// Records one sample of `thread` into the buffer of the given core by running that core's
// record-function list. When no buffer space can be made available, a stop-profiling
// message is sent instead.
//
//  ws     - work area of the core
//  core   - index of the core being sampled
//  thread - thread whose state is recorded
void SamplingThread_RecordSample(WorkArea* ws, uint32_t core, nn::os::ThreadType* thread)
{
    DUMP_CURRENT_LINE();

    const bool bufferAvailable = CheckAndExpandBuffersIfNeeded(SampleBufferIndex(core));
    if (!bufferAvailable)
    {
        SendBasicIpcMessage(ProfilerIpcMessage_StopProfiling, 0);
        //StopProfilingSamplingThreads();
        return;
    }

    DUMP_CURRENT_LINE();

    ws->thread_to_profile = thread;
    SamplingThread_RunRecordFuncList(sSamplingThreads[core]->recordFuncs, ws);

    // When sampling is driven by a performance counter, re-arm its interrupt.
    const bool sampledByCounter = ws->fields.GetBit(WorkArea::SampleUsingPerfCounters);
    if (sampledByCounter)
    {
        nn::profiler::pmu::ResetInterrupt(core);
    }
}



// Records a timed "no active thread" event for the given core.
void SamplingThread_RecordNoThread(WorkArea* ws, uint32_t core)
{
    DUMP_CURRENT_LINE();

    const uint32_t eventId = PayloadEvents_NoActiveThread;
    SamplingThread_RecordTimedBasicEvent(ws, core, eventId);
}



// Records a timed "unknown active thread" event for the given core.
void SamplingThread_RecordUnknownThread(WorkArea* ws, uint32_t core)
{
    DUMP_CURRENT_LINE();

    const uint32_t eventId = PayloadEvents_UnknownActiveThread;
    SamplingThread_RecordTimedBasicEvent(ws, core, eventId);
}



// Records a timed "profiler thread active" event for the given core.
void SamplingThread_RecordProfilerThread(WorkArea* ws, uint32_t core)
{
    DUMP_CURRENT_LINE();

    const uint32_t eventId = PayloadEvents_ProfilerActiveThread;
    SamplingThread_RecordTimedBasicEvent(ws, core, eventId);
}



// Writes a basic event record (event id followed by the current time) into the buffer of
// the given core, followed by the performance counters when they are in use. When no
// buffer space can be made available, a stop-profiling message is sent instead.
//
//  ws    - work area of the core
//  core  - index of the core the event belongs to
//  event - event id written as the first field of the record
static NN_NOINLINE void SamplingThread_RecordTimedBasicEvent(WorkArea* ws, uint32_t core, uint32_t event)
{
    const bool bufferAvailable = CheckAndExpandBuffersIfNeeded(SampleBufferIndex(core));
    if (!bufferAvailable)
    {
        SendBasicIpcMessage(ProfilerIpcMessage_StopProfiling, 0);
        //StopProfilingSamplingThreads();
        return;
    }

    DUMP_CURRENT_LINE();

    // Reserve 12 bytes in the buffer for the event id and the timestamp.
    uint8_t *cursor = ws->curPtr.fetch_add(12);
    cursor = WriteToBuffer(cursor, event);
    cursor = WriteToBuffer(cursor, GetCurrentTime());

    const bool countersInUse = ws->fields.GetBit(WorkArea::IsUsingPerfCounters);
    if (countersInUse)
    {
        RecordPerfCounters(ws);
    }

    // When sampling is driven by a performance counter, re-arm its interrupt.
    const bool sampledByCounter = ws->fields.GetBit(WorkArea::SampleUsingPerfCounters);
    if (sampledByCounter)
    {
        nn::profiler::pmu::ResetInterrupt(core);
    }
}



// Resets every slot of a record-function buffer to the default record function.
//
//  funcs - array of RecordFunctionsBufferSize function pointers
static void SamplingThread_FillRecordFuncBuffer(ProfilerRecordFuncPtr* funcs)
{
    ProfilerRecordFuncPtr* const bufferEnd = funcs + RecordFunctionsBufferSize;
    for (ProfilerRecordFuncPtr* slot = funcs; slot != bufferEnd; ++slot)
    {
        *slot = &RecordDefaultFuncPtr;
    }
}



//
// SetupRecordFuncList
//  Builds the ordered list of record functions that is executed for every sample on a
//  core: sample header, optional performance counters, program counter, and optionally
//  the callstack or the stack depth. Unused slots keep the default record function.
//
//  The list currently needs at most 4 slots; it must never need more than
//  RecordFunctionsBufferSize (8).
//
//  ws              - work area of the core; supplies the settings and receives the
//                    performance-counter field bits
//  threadDetails   - sampler state whose recordFuncs array is filled
//  hasPerfCounters - written only when sampling is not driven by a performance counter:
//                    true if performance counters will be recorded with each sample
//
static void SamplingThread_SetupRecordFuncList(
    WorkArea* ws,
    SamplingThread* threadDetails,
    bool* hasPerfCounters)
{
    SettingsFromThePcGui* const config = ws->settings;
    ProfilerRecordFuncPtr* const slots = threadDetails->recordFuncs;
    int32_t used = 0;

    SamplingThread_FillRecordFuncBuffer(slots);

    INFO_LOG("\n\n____ Record Functions:\n");

    // Every sample begins with the common header.
    INFO_LOG("\tRecord Sample Header\n");
    slots[used++] = &RecordSampleHeader;

    // Performance counters are recorded per sample only when they are not
    // themselves the sampling source.
    const bool sampleByCounter = ((config->flags & SettingsFromThePcGui::SampleByPerfCounter) != 0);
    if (!sampleByCounter)
    {
        const bool recordCycles = (config->perf_counter_cycle != 0);
        const bool recordCounters = (config->perf_counters[0] != pmu::PerfCounter_Disabled);

        if (recordCycles)
        {
            ws->fields.SetBit(WorkArea::RecordPerformanceCounterCycles, true);
        }
        if (recordCounters)
        {
            ws->fields.SetBit(WorkArea::RecordPerformanceCounters, true);
        }

        const bool recordAny = (recordCycles || recordCounters);
        *hasPerfCounters = recordAny;
        if (recordAny)
        {
            nn::profiler::pmu::SetCounters(config->perf_counters);

            INFO_LOG("\tRecord Performance Counters\n");
            slots[used++] = &RecordPerfCounters;
        }
    }

    // The program counter is always recorded.
    INFO_LOG("\tRecord Program Counter\n");
    slots[used++] = &RecordPC;

    // Full callstack unless simple mode is requested; simple mode may still ask for the depth.
    const bool simpleMode = ((config->flags & SettingsFromThePcGui::UseSimple) != 0);
    if (!simpleMode)
    {
        INFO_LOG("\tRecord Callstack\n");
        slots[used++] = &RecordStack;
    }
    else if (config->flags & SettingsFromThePcGui::RecordSimpleStackDepths)
    {
        INFO_LOG("\tRecord Stack Depth\n");
        slots[used++] = &RecordStackDepth;
    }

    NN_SDK_ASSERT(static_cast<size_t>(used) <= RecordFunctionsBufferSize);

    INFO_LOG("\n\n");
}



// Runs the record functions of a list in order until one reports a failure or all
// RecordFunctionsBufferSize slots have been executed.
//
//  funcs - array of RecordFunctionsBufferSize record functions
//  ws    - work area passed to every record function
//
// Returns success when the last result falls within ResultSampleLoopComplete;
// otherwise the result of the last function that was run.
static nn::Result SamplingThread_RunRecordFuncList(ProfilerRecordFuncPtr* funcs, WorkArea* ws)
{
    nn::Result status = nn::ResultSuccess();

    NN_PROFILER_TIMING_BEGIN(&sSamplingThreads[ws->core_number]->timers[SamplingThreadTimer_RecordLoop]);

    ProfilerRecordFuncPtr* const listEnd = funcs + RecordFunctionsBufferSize;
    for (ProfilerRecordFuncPtr* entry = funcs; entry != listEnd; ++entry)
    {
        status = (*entry)(ws);
        if (status.IsFailure())
        {
            break;
        }
    }

    // A "sample loop complete" result is not reported to the caller as a failure.
    if (status <= ResultSampleLoopComplete())
    {
        status = nn::ResultSuccess();
    }

    NN_PROFILER_TIMING_END(&sSamplingThreads[ws->core_number]->timers[SamplingThreadTimer_RecordLoop]);

    return status;
}



static void SetupCoreHeader(uint32_t core, SettingsFromThePcGui* settings)
{
    // Now fill the header into the buffer
    Header* header = GetCoreHeader(core);
    header->Initialize();
    header->WriteControlValueOnly(HeaderSpecialValues_CoreHeaderBegin);

    // Section code for Dlls
    // Neb_Note: This is inserted here, but managed by the PC side.
    // ref: ProfilerInput.cs@832 in SaveProfileToFile()
    header->Write(HeaderSpecialValues_Dlls, static_cast<uint32_t>(0) /*false*/);

    header->Write(HeaderSpecialValues_CoreNumber, core);

    header->Write(HeaderSpecialValues_Flags, settings->flags);

    header->Write(
        HeaderSpecialValues_PerformanceCounterSlots,
        settings->perf_counters,
        pmu::PerformanceCounterCount,
        settings->perf_counter_cycle);

    header->Write(HeaderSpecialValues_BaseTime, GetCurrentTime());

    header->Write(
        HeaderSpecialValues_RequestedTimeBetweenSamples,
        settings->requested_time_between_samples_in_nanoseconds);

    header->WriteControlValueOnly(HeaderSpecialValues_HeaderEnd);
}



} // profiler
} // nn
