﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <nn/nn_Common.h>
#include <nn/TargetConfigs/build_Compiler.h>
#include <nn/TargetConfigs/build_Cpu.h>
#include <nn/nn_BitTypes.h>

#include "../../kern_Platform.h"
#include "kern_KCPU.h"
#include "kern_KPageTable.h"
#include "../../kern_InterruptNameSelect.h"
#include "../../kern_Kernel.h"
#include "../../kern_KThreadQueue.h"
#include "../../kern_KLightMutex.h"
#include "../../kern_KScopedSchedulingLock.h"
#include "../ARM64/kern_RegisterAccess.h"
#include "../../kern_InterlockedSelect.h"
#include "../../kern_KResourceLimit.h"
#include "kern_SystemControl.h"
#include "../../kern_Utility.h"

// Issues an ISB instruction. The argument is accepted but not used by the expansion.
// The "memory" clobber also makes this a compiler-level memory barrier.
#define __isb(x)    do { asm volatile("isb":::"memory"); } while (false)
// Issues a YIELD hint for spin-wait loops. The "memory" clobber forces the compiler
// to re-read memory after each call, so plain variables polled in a loop are reloaded.
#define __yield()   do { asm volatile("yield":::"memory"); } while (false)

namespace nn { namespace kern {
namespace ARMv8A {

    namespace
    {
        // Scope guard that disables core migration of the current thread on
        // construction and re-enables it on destruction.
        //
        // The guard is non-copyable: a copy would run the destructor twice and
        // call EnableCoreMigration() more often than DisableCoreMigration().
        class KDisableCoreMigration
        {
        public:
            KDisableCoreMigration()
            {
                GetCurrentThread().DisableCoreMigration();
            }
            ~KDisableCoreMigration()
            {
                GetCurrentThread().EnableCoreMigration();
            }

            KDisableCoreMigration(const KDisableCoreMigration&) = delete;
            KDisableCoreMigration& operator=(const KDisableCoreMigration&) = delete;
        };

        // Interrupt handler that performs no work and schedules no follow-up task.
        // KCPU::Initialize1() binds it to INTR_CORE_ID_TERMINATE on each core.
        class KTerminateHandler : public KInterruptHandler
        {
        public:
            // The interrupt number is irrelevant here; always returns nullptr.
            virtual KInterruptTask* OnInterrupt(int32_t /* interruptRequestNo */)
            {
                return nullptr;
            }
        };

        // Coordinates cache maintenance that has to run on every core.
        //
        // A requester publishes an operation in m_Process and sets one bit per core
        // in m_TargetCores. Each core then runs ProcessRequest(), either from the
        // INTR_CORE_ID_CACHE interrupt handler or from a per-core helper thread
        // created by Initialize(), and clears its own bit. The request is complete
        // once m_TargetCores reaches zero.
        class KCacheICCHelper : public KInterruptHandler
        {
        public:
            // Operation every core is asked to perform (see ProcessRequest()).
            enum Process
            {
                IDLE,
                INST_INVALIDATE,
                DATA_STORE_ALL,
                DATA_FLUSH_ALL
            };

        private:
            KLightMutex                 m_Mutex;        // Serializes Request() callers.
            KLightMutex                 m_SyncMutex;    // Protects the m_TargetCores / m_SyncCv hand-shake.
            KLightConditionvariable     m_SyncCv;       // Signals "work posted" and "all cores done".
            InterlockedVariable<Bit64>  m_TargetCores;  // Bit N set: core N has not processed the request yet.
            volatile Process            m_Process;      // Operation currently requested; IDLE when none.
        public:
            KCacheICCHelper() : m_TargetCores(0), m_Process(IDLE) {}

            // Creates and starts the helper thread for coreNo. Aborts if the thread
            // resource limit cannot be reserved or thread creation fails.
            void Initialize(int32_t coreNo)
            {
                NN_KERN_ABORT_UNLESS(Kernel::GetSystemResourceLimit().TestLimit(nn::svc::LimitableResource_ThreadCountMax, 1));

                KThread* pThread = KThread::Create();
                NN_KERN_ABORT_UNLESS(pThread);
                // NOTE(review): the literal 8 is presumably the thread priority - confirm against InitializeThreadForKernel.
                NN_KERN_ABORT_IF_FAILED(InitializeThreadForKernel(pThread, &ThreadMain, reinterpret_cast<uintptr_t>(this), 8, coreNo));
                KThread::Register(pThread);
                pThread->Run();
                NN_LOG("    CacheOpThread ID=%lld, Core=%d, Affinity=%llx State=%d, Priority=%d\n",
                    pThread->GetId(),
                    pThread->GetRunningProcessor(),
                    pThread->GetAffinityMask().GetAffinityMask(),
                    pThread->GetState(),
                    pThread->GetPriority());
            }

            // Interrupt entry point: performs this core's share of the pending request.
            virtual KInterruptTask* OnInterrupt(int32_t interruptRequestNo)
            {
                NN_UNUSED(interruptRequestNo);
                ProcessRequest();
                return nullptr;
            }

            // Runs op on every core and returns once all cores have finished.
            // Aborts if another request is unexpectedly still in flight.
            void Request(Process op)
            {
                KScopedLightLock locker(&m_Mutex);
                NN_KERN_ABORT_UNLESS(m_Process == IDLE);
                {
                    // Note: this inner guard shadows the outer 'locker'; both stay held.
                    KScopedLightLock locker(&m_SyncMutex);
                    NN_KERN_ABORT_UNLESS(m_TargetCores == 0);

                    m_Process = op;
                    // The IPI path is used while the kernel is still initializing and
                    // always for instruction-cache invalidation; otherwise the helper
                    // threads do the work.
                    bool useInterrupt = ((Kernel::GetState() == Kernel::STATE_INITIALIZING) || (m_Process == INST_INVALIDATE));
                    m_TargetCores = ((1ull << KCPU::NUM_CORE) - 1);

                    if (useInterrupt)
                    {
                        // Make the writes above visible before the other cores are interrupted.
                        KCPU::DataSynchronizationBarrier();
                        // Interrupt every core except this one, then handle this core inline.
                        Kernel::GetInterruptManager().SendIpi((((1ull << KCPU::NUM_CORE) - 1) & ~(1ull << GetCurrentCpuNo())), KInterruptName::INTR_CORE_ID_CACHE);
                        ProcessRequest();
                        // Busy-wait (m_SyncMutex still held) until every core cleared its bit.
                        while (m_TargetCores != 0)
                        {
                            __yield();
                        }
                    }
                    else
                    {
                        // Wake the helper threads and sleep until the last one reports completion.
                        m_SyncCv.Broadcast();
                        while (m_TargetCores != 0)
                        {
                            // NOTE(review): -1 is presumably "no timeout" - confirm against KLightConditionvariable.
                            m_SyncCv.Wait(&m_SyncMutex, -1);
                        }
                    }
                }
                m_Process = IDLE;
            }

        private:
            // Thread entry trampoline; p is the KCacheICCHelper instance passed by Initialize().
            static void ThreadMain(uintptr_t p) { reinterpret_cast<KCacheICCHelper*>(p)->ThreadBody(); }

            // Helper-thread loop: wait until this core's bit is set, process the
            // request, and wake the requester if this core was the last to finish.
            void ThreadBody()
            {
                int coreNo = GetCurrentCpuNo();
                for (;;)
                {
                    {
                        KScopedLightLock locker(&m_SyncMutex);
                        while ((m_TargetCores & (1ull << coreNo)) == 0)
                        {
                            m_SyncCv.Wait(&m_SyncMutex, -1);
                        }
                    }
                    // The cache operation itself runs without m_SyncMutex held.
                    ProcessRequest();
                    {
                        KScopedLightLock locker(&m_SyncMutex);
                        if (m_TargetCores == 0)
                        {
                            m_SyncCv.Broadcast();
                        }
                    }
                }
            }

            // Executes the requested operation on the calling core and clears that
            // core's bit in m_TargetCores. Called from the cache-operation thread,
            // from OnInterrupt(), and directly from Request() on the IPI path.
            void ProcessRequest();
        };

        // Arrival/departure counter used by KCPU::SynchronizeAllCore().
        InterlockedVariable<int32_t> s_AllCoreSyncCount = InterlockedVariable<int32_t>(0);
        // Single cache-operation coordinator shared by all cores (bound per core in KCPU::Initialize1()).
        KCacheICCHelper     s_CacheHandler;
        // Handler bound to INTR_CORE_ID_TERMINATE in KCPU::Initialize1().
        KTerminateHandler   s_TerminateHandler;

        void StoreEntireDataCacheImpl(int level)
        {
            Bit64 sizeInfo;

            {
                KDisableInterrupt di;
                HW_SET_CSSELR_EL1(static_cast<Bit64>(level << 1));
                __isb(0xf);
                HW_GET_CCSIDR_EL1(sizeInfo);
            }

            const int32_t numSet   = ((sizeInfo >> 13) & 0x7FFF);
            const int32_t numWay   = ((sizeInfo >>  3) & 0x3FF);
            const int32_t lineSize = ((sizeInfo >>  0) & 0x7);

            const int setShift = lineSize + 4;
            const int wayShift = __builtin_clz(numWay);

            for( int w = 0; w <= numWay; ++w )
            {
                for( int s = 0; s <= numSet; ++s )
                {
                    const Bit64 ws = (w << wayShift) | (s << setShift) | (level << 1);
                    asm volatile("DC CSW, %0"::"r"(ws):"memory");
                }
            }
        }

        void StoreEntireDataCacheShare()
        {
            Bit64 levelInfo;
            HW_GET_CLIDR_EL1(levelInfo);
            const int unificationLevels = ((levelInfo >> 21) & 0x7);
            const int coherenceLevels = ((levelInfo >> 24) & 0x7);

            for (int level = unificationLevels; level <= coherenceLevels; level++)
            {
                StoreEntireDataCacheImpl(level);
            }
        }

        void StoreEntireDataCacheCoreLocal()
        {
            Bit64 levelInfo;
            HW_GET_CLIDR_EL1(levelInfo);
            const int unificationLevels = ((levelInfo >> 21) & 0x7);

            for (int level = 0; level < unificationLevels; level++)
            {
                StoreEntireDataCacheImpl(level);
            }
        }

        void FlushEntireDataCacheImpl(int level)
        {
            Bit64 sizeInfo;
            {
                KDisableInterrupt di;
                HW_SET_CSSELR_EL1(static_cast<Bit64>(level << 1));
                __isb(0xf);
                HW_GET_CCSIDR_EL1(sizeInfo);
            }

            const int32_t numSet   = ((sizeInfo >> 13) & 0x7FFF);
            const int32_t numWay   = ((sizeInfo >>  3) & 0x3FF);
            const int32_t lineSize = ((sizeInfo >>  0) & 0x7);

            const int setShift = lineSize + 4;
            const int wayShift = __builtin_clz(numWay);

            for( int w = 0; w <= numWay; ++w )
            {
                for( int s = 0; s <= numSet; ++s )
                {
                    const Bit64 ws = (w << wayShift) | (s << setShift) | (level << 1);
                    asm volatile("DC CISW, %0"::"r"(ws):"memory");
                }
            }
        }

        void FlushEntireDataCacheShare()
        {
            Bit64 levelInfo;
            HW_GET_CLIDR_EL1(levelInfo);
            const int unificationLevels = ((levelInfo >> 21) & 0x7);
            const int coherenceLevels = ((levelInfo >> 24) & 0x7);

            for (int level = coherenceLevels; level >= unificationLevels; level--)
            {
                FlushEntireDataCacheImpl(level);
            }
        }

        void FlushEntireDataCacheCoreLocal()
        {
            Bit64 levelInfo;
            HW_GET_CLIDR_EL1(levelInfo);
            const int unificationLevels = ((levelInfo >> 21) & 0x7);

            for (int level = unificationLevels - 1; level >= 0; level--)
            {
                FlushEntireDataCacheImpl(level);
            }
        }

        // Issues IC IALLU (invalidate all instruction caches to the point of
        // unification, without the Inner Shareable broadcast of IC IALLUIS).
        // No barrier is issued here; callers follow up with their own synchronization.
        void InvalidateEntireInstructionCacheImpl()
        {
            asm volatile("IC IALLU":::"memory");
        }

        // Issues IC IALLUIS (invalidate all instruction caches to the point of
        // unification, Inner Shareable variant).
        // No barrier is issued here; callers follow up with their own synchronization.
        void InvalidateEntireInstructionCacheAllCoreImpl()
        {
            asm volatile("IC IALLUIS":::"memory");
        }

        // Invalidates the data cache for [begin, end). Both bounds must be aligned
        // to KCPU::DATA_CACHE_LINE_SIZE (asserted).
        // Returns ResultInvalidCurrentMemory when the low-level routine reports
        // failure; otherwise waits for completion and returns success.
        Result InvalidateDataCacheRange(uintptr_t begin, uintptr_t end)
        {
            NN_KERN_ALIGN_ASSERT(begin, KCPU::DATA_CACHE_LINE_SIZE);
            NN_KERN_ALIGN_ASSERT(end, KCPU::DATA_CACHE_LINE_SIZE);

            const bool done = nn::kern::ARM64::InvalidateDataCache(begin, end);
            if (done)
            {
                KCPU::WaitDataCacheOperation();
                return ResultSuccess();
            }
            return nn::svc::ResultInvalidCurrentMemory();
        }

        // Stores (cleans) the data cache for [begin, end). Both bounds must be
        // aligned to KCPU::DATA_CACHE_LINE_SIZE (asserted).
        // Returns ResultInvalidCurrentMemory when the low-level routine reports
        // failure; otherwise waits for completion and returns success.
        Result StoreDataCacheRange(uintptr_t begin, uintptr_t end)
        {
            NN_KERN_ALIGN_ASSERT(begin, KCPU::DATA_CACHE_LINE_SIZE);
            NN_KERN_ALIGN_ASSERT(end, KCPU::DATA_CACHE_LINE_SIZE);

            const bool done = nn::kern::ARM64::StoreDataCache(begin, end);
            if (done)
            {
                KCPU::WaitDataCacheOperation();
                return ResultSuccess();
            }
            return nn::svc::ResultInvalidCurrentMemory();
        }

        // Flushes the data cache for [begin, end). Both bounds must be aligned to
        // KCPU::DATA_CACHE_LINE_SIZE (asserted).
        // Returns ResultInvalidCurrentMemory when the low-level routine reports
        // failure; otherwise waits for completion and returns success.
        Result FlushDataCacheRange(uintptr_t begin, uintptr_t end)
        {
            NN_KERN_ALIGN_ASSERT(begin, KCPU::DATA_CACHE_LINE_SIZE);
            NN_KERN_ALIGN_ASSERT(end, KCPU::DATA_CACHE_LINE_SIZE);

            const bool done = nn::kern::ARM64::FlushDataCache(begin, end);
            if (done)
            {
                KCPU::WaitDataCacheOperation();
                return ResultSuccess();
            }
            return nn::svc::ResultInvalidCurrentMemory();
        }

        // Invalidates the instruction cache for [begin, end). Both bounds must be
        // aligned to KCPU::INSTRUCTION_CACHE_LINE_SIZE (asserted).
        // Returns ResultInvalidCurrentMemory when the low-level routine reports
        // failure; otherwise calls KCPU::CareInstructionConsistency() and returns
        // success.
        Result InvalidateInstructionCacheRange(uintptr_t begin, uintptr_t end)
        {
            NN_KERN_ALIGN_ASSERT(begin, KCPU::INSTRUCTION_CACHE_LINE_SIZE);
            NN_KERN_ALIGN_ASSERT(end, KCPU::INSTRUCTION_CACHE_LINE_SIZE);

            const bool done = nn::kern::ARM64::InvalidateInstructionCache(begin, end);
            if (done)
            {
                KCPU::CareInstructionConsistency();
                return ResultSuccess();
            }
            return nn::svc::ResultInvalidCurrentMemory();
        }

        void KCacheICCHelper::ProcessRequest()
        {
            switch (m_Process)
            {
            case INST_INVALIDATE:
                {
                    KCPU::InstructionMemoryBarrier();
                }
                break;

            case DATA_STORE_ALL:
                {
                    StoreEntireDataCacheCoreLocal();
                    KCPU::WaitDataCacheOperation();
                }
                break;

            case DATA_FLUSH_ALL:
                {
                    FlushEntireDataCacheCoreLocal();
                    KCPU::WaitDataCacheOperation();
                }
                break;

            default:
                break;
            }
            m_TargetCores &= ~(1ull << GetCurrentCpuNo());
        }

        class KCycleCounterHandler : public KInterruptHandler
        {
        public:
            void Setup(int n)
            {
                m_IsComplete = false;
                m_Number = n;
            }
            void Wait()
            {
                while (!m_IsComplete)
                {
                    asm volatile ("yield");
                }
            }
            Bit64 GetCounter() const { return m_Pmcc; }
            virtual KInterruptTask* OnInterrupt(int32_t interruptRequestNo)
            {
                NN_UNUSED(interruptRequestNo);
                if (m_Number < 0)
                {
                    m_Pmcc = KCPU::GetCycleCounter();
                }
                else
                {
                    m_Pmcc = KCPU::GetPerformanceCounter(m_Number);
                }
                KCPU::DataMemoryBarrier();
                m_IsComplete = true;
                return NULL;
            }
            static KLightMutex& GetLock() { return s_Lock; }
        private:
            Bit64 m_Pmcc;
            bool m_IsComplete;
            int m_Number;
            static KLightMutex s_Lock;
        };

        // One handler per core; KCPU::Initialize1() binds element [coreNo] to
        // INTR_CORE_ID_CC on that core.
        KCycleCounterHandler g_CycleCounterHandler[KCPU::NUM_CORE];
        // Storage for the lock returned by KCycleCounterHandler::GetLock().
        KLightMutex KCycleCounterHandler::s_Lock;
    }


// Invalidates the data cache for [addr, addr + size).
//
// Cache lines only partially covered by the range are flushed instead of
// invalidated. Returns the first failure reported by the range helpers,
// otherwise success.
Result KCPU::InvalidateDataCache(void* addr, size_t size)
{
    KDisableCoreMigration disableMigration;

    const uintptr_t rangeHead = reinterpret_cast<uintptr_t>(addr);
    const uintptr_t rangeTail = rangeHead + size;

    // Widen the range to whole cache lines; the partial lines at either edge
    // must be flushed rather than invalidated.
    uintptr_t lineHead = RoundDown(rangeHead, KCPU::DATA_CACHE_LINE_SIZE);
    uintptr_t lineTail = RoundUp(rangeTail, KCPU::DATA_CACHE_LINE_SIZE);

    // Leading partial line.
    if (lineHead != rangeHead)
    {
        // This also flushes when size == 0; that is accepted.
        Result headResult = FlushDataCacheRange(lineHead, lineHead + KCPU::DATA_CACHE_LINE_SIZE);
        if (headResult.IsFailure())
        {
            return headResult;
        }
        lineHead += KCPU::DATA_CACHE_LINE_SIZE;
    }

    // Trailing partial line (only if any line is left after the leading one).
    if ((lineHead < lineTail) && (lineTail != rangeTail))
    {
        Result tailResult = FlushDataCacheRange(lineTail - KCPU::DATA_CACHE_LINE_SIZE, lineTail);
        if (tailResult.IsFailure())
        {
            return tailResult;
        }
        lineTail -= KCPU::DATA_CACHE_LINE_SIZE;
    }

    // Nothing left once the partial lines are handled.
    if (!(lineHead < lineTail))
    {
        return ResultSuccess();
    }

    // Fully covered, line-aligned middle part.
    Result bodyResult = InvalidateDataCacheRange(lineHead, lineTail);
    if (bodyResult.IsFailure())
    {
        return bodyResult;
    }

    return ResultSuccess();
}

// Stores (cleans) the data cache for every cache line touching
// [addr, addr + size). Returns the helper's failure, otherwise success.
Result KCPU::StoreDataCache(const void* addr, size_t size)
{
    KDisableCoreMigration disableMigration;

    // A store can safely cover whole lines, so just widen to line boundaries.
    const uintptr_t head = reinterpret_cast<uintptr_t>(addr);
    const uintptr_t lineBegin = RoundDown(head, KCPU::DATA_CACHE_LINE_SIZE);
    const uintptr_t lineEnd   = RoundUp(head + size, KCPU::DATA_CACHE_LINE_SIZE);

    Result storeResult = StoreDataCacheRange(lineBegin, lineEnd);
    if (!storeResult.IsFailure())
    {
        return ResultSuccess();
    }
    return storeResult;
}

// Flushes the data cache for every cache line touching [addr, addr + size).
// Returns the helper's failure, otherwise success.
Result KCPU::FlushDataCache(const void* addr, size_t size)
{
    KDisableCoreMigration disableMigration;

    // A flush can safely cover whole lines, so just widen to line boundaries.
    const uintptr_t head = reinterpret_cast<uintptr_t>(addr);
    const uintptr_t lineBegin = RoundDown(head, KCPU::DATA_CACHE_LINE_SIZE);
    const uintptr_t lineEnd   = RoundUp(head + size, KCPU::DATA_CACHE_LINE_SIZE);

    Result flushResult = FlushDataCacheRange(lineBegin, lineEnd);
    if (!flushResult.IsFailure())
    {
        return ResultSuccess();
    }
    return flushResult;
}

// Invalidates the instruction cache for every line touching
// [addr, addr + size), then posts an INST_INVALIDATE request to all cores.
// On failure of the range operation the request is not posted and the
// failure is returned.
Result KCPU::InvalidateInstructionCache(void* addr, size_t size)
{
    KDisableCoreMigration disableMigration;

    // Instruction cache lines can be invalidated freely, so widen to line boundaries.
    const uintptr_t head = reinterpret_cast<uintptr_t>(addr);
    const uintptr_t lineBegin = RoundDown(head, KCPU::INSTRUCTION_CACHE_LINE_SIZE);
    const uintptr_t lineEnd   = RoundUp(head + size, KCPU::INSTRUCTION_CACHE_LINE_SIZE);

    Result invalidateResult = InvalidateInstructionCacheRange(lineBegin, lineEnd);
    if (invalidateResult.IsFailure())
    {
        return invalidateResult;
    }

    s_CacheHandler.Request(KCacheICCHelper::INST_INVALIDATE);
    return ResultSuccess();
}

// Implemented as a full flush: simply forwards to FlushEntireDataCache().
void KCPU::InvalidateEntireDataCache()
{
    FlushEntireDataCache();
}

// Stores (cleans) the entire data cache hierarchy.
// Order: every core first cleans its core-local levels via a DATA_STORE_ALL
// request, then this core cleans the remaining levels and waits for completion.
void KCPU::StoreEntireDataCache()
{
    KDisableCoreMigration disableMigration;
    s_CacheHandler.Request(KCacheICCHelper::DATA_STORE_ALL);
    StoreEntireDataCacheShare();
    WaitDataCacheOperation();
}

// Flushes (cleans and invalidates) the entire data cache hierarchy.
// The sequence depends on the CPU selected at build time.
void KCPU::FlushEntireDataCache()
{
    KDisableCoreMigration disableMigration;
#if defined NN_BUILD_CONFIG_CPU_CORTEX_A57_AARCH64 || defined NN_BUILD_CONFIG_CPU_CORTEX_A53_AARCH64
    // On Cortex-A53 and A57, cleaning & invalidating an L2 cache line also
    // writes the corresponding L1 cache contents out to memory.
    // In addition, the L2 cache is shared between cores, so cleaning &
    // invalidating the L2 cache from a single core is sufficient.

    FlushEntireDataCacheShare();
    WaitDataCacheOperation();
#else
    // Generic path: clean core-local levels on every core, clean the remaining
    // levels, then clean & invalidate the remaining levels, and finally clean &
    // invalidate the core-local levels on every core.
    s_CacheHandler.Request(KCacheICCHelper::DATA_STORE_ALL);
    StoreEntireDataCacheShare();
    WaitDataCacheOperation();

    FlushEntireDataCacheShare();
    WaitDataCacheOperation();
    s_CacheHandler.Request(KCacheICCHelper::DATA_FLUSH_ALL);
#endif
}

// Invalidates the entire instruction cache using the all-core variant
// (IC IALLUIS), then posts an INST_INVALIDATE request so that every core
// executes KCPU::InstructionMemoryBarrier().
void KCPU::InvalidateEntireInstructionCache()
{
    KDisableCoreMigration disableMigration;

    InvalidateEntireInstructionCacheAllCoreImpl();
    CareInstructionConsistency();

    s_CacheHandler.Request(KCacheICCHelper::INST_INVALIDATE);
}

// Invalidates the entire instruction cache using the local variant (IC IALLU)
// only; no request is posted to other cores.
void KCPU::InvalidateEntireInstructionCacheLocal()
{
    KDisableCoreMigration disableMigration;

    InvalidateEntireInstructionCacheImpl();
    CareInstructionConsistency();
}


/*!
    @brief     Rendezvous point: waits until all cores have reached this call.

    @details   s_AllCoreSyncCount counts arrivals from 0 up to NUM_CORE. Cores
               then leave one at a time in arrival order, each bumping the
               counter, and the last core to leave resets it to 0 so that the
               next synchronization can start.
*/
void KCPU::SynchronizeAllCore()
{
    // Make sure the previous synchronization has finished.
    // (s_AllCoreSyncCount is an InterlockedVariable.)
    while ( ! (s_AllCoreSyncCount < KCPU::NUM_CORE) ) {}

    // Increment the count, remembering the value before the increment,
    const int32_t count = s_AllCoreSyncCount++;

    // and wait until it equals that previous value plus the number of cores.
    while (s_AllCoreSyncCount != count + KCPU::NUM_CORE) {}

    if (s_AllCoreSyncCount == KCPU::NUM_CORE * 2 - 1)
    {
        // This core was the last to arrive: reset the count to zero, which ends
        // this synchronization and allows the next one to run.
        s_AllCoreSyncCount = 0;
    }
    else
    {
        // This core was not the last to arrive: increment the count so the
        // next core in arrival order is released.
        ++s_AllCoreSyncCount;
    }
}

// First-stage per-core initialization hook. Does nothing in this implementation.
void KCPU::Initialize0(int32_t coreNo)
{
    NN_UNUSED(coreNo);
}

// Second-stage per-core initialization for core coreNo:
//  - starts the cache-operation helper thread for this core,
//  - binds the cache, terminate and cycle-counter interrupt handlers on this core,
//  - writes 1 to PMUSERENR_EL0 when the target system enables user PMU access.
void KCPU::Initialize1(int32_t coreNo)
{
    s_CacheHandler.Initialize(coreNo);
    Kernel::GetInterruptManager().BindHandler(&s_CacheHandler, KInterruptName::INTR_CORE_ID_CACHE, coreNo, KInterruptController::PriorityLevel_DevInterrupt, false,  false);
    Kernel::GetInterruptManager().BindHandler(&s_TerminateHandler, KInterruptName::INTR_CORE_ID_TERMINATE, coreNo, KInterruptController::PriorityLevel_SchInterrupt, false, false);
    if (KTargetSystem::IsUserPmuAccessEnabled())
    {
        HW_SET_PMUSERENR_EL0(1ul);
    }
    // Each core gets its own cycle-counter handler instance.
    Kernel::GetInterruptManager().BindHandler(&g_CycleCounterHandler[coreNo], KInterruptName::INTR_CORE_ID_CC, coreNo, KInterruptController::PriorityLevel_TimInterrupt, false, false);
}

// Collects the cycle counter of every core.
//
// pOut must point to at least KCPU::NUM_CORE elements; element i receives the
// value sampled on core i. Each per-core handler is armed, an
// INTR_CORE_ID_CC IPI is sent to all cores, and the results are gathered as
// the handlers complete. Concurrent callers are serialized by the handler lock.
void KCPU::GetCycleCounter(Bit64* pOut)
{
    KScopedLightLock locker(&KCycleCounterHandler::GetLock());

    // A negative selector makes the handler read the cycle counter.
    for (int core = 0; core < KCPU::NUM_CORE; ++core)
    {
        g_CycleCounterHandler[core].Setup(-1);
    }

    // Publish the armed state before interrupting the cores.
    KCPU::DataSynchronizationBarrier();
    Kernel::GetInterruptManager().SendIpi((1ull << KCPU::NUM_CORE) - 1, KInterruptName::INTR_CORE_ID_CC);

    KCPU::DataMemoryBarrier();
    for (int core = 0; core < KCPU::NUM_CORE; ++core)
    {
        KCycleCounterHandler& handler = g_CycleCounterHandler[core];
        handler.Wait();
        pOut[core] = handler.GetCounter();
    }
}

// Collects performance counter number n of every core.
//
// pOut must point to at least KCPU::NUM_CORE elements; element i receives the
// value sampled on core i, truncated to 32 bits. Each per-core handler is
// armed, an INTR_CORE_ID_CC IPI is sent to all cores, and the results are
// gathered as the handlers complete. Concurrent callers are serialized by the
// handler lock.
void KCPU::GetPerformanceCounter(Bit32* pOut, int n)
{
    KScopedLightLock locker(&KCycleCounterHandler::GetLock());

    for (int core = 0; core < KCPU::NUM_CORE; ++core)
    {
        g_CycleCounterHandler[core].Setup(n);
    }

    // Publish the armed state before interrupting the cores.
    KCPU::DataSynchronizationBarrier();
    Kernel::GetInterruptManager().SendIpi((1ull << KCPU::NUM_CORE) - 1, KInterruptName::INTR_CORE_ID_CC);

    KCPU::DataMemoryBarrier();
    for (int core = 0; core < KCPU::NUM_CORE; ++core)
    {
        KCycleCounterHandler& handler = g_CycleCounterHandler[core];
        handler.Wait();
        // The handler stores a 64-bit value; the output keeps the low 32 bits.
        pOut[core] = static_cast<Bit32>(handler.GetCounter());
    }
}


}
}}

