﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

//-----------------------------------------------------------------------------
// メモリバンド幅測定テスト
//-----------------------------------------------------------------------------

#include <random>
#include <thread>

#include <nn/nn_Common.h>
#include <nn/nn_Abort.h>
#include <nn/nn_Log.h>

#include <nn/oe.h>
#include <nn/os.h>

namespace {
    // How a worker thread touches each cache-line-sized element of the working memory.
    enum AccessPattern
    {
        AccessPattern_LoadAllData,      // read every int64_t of each element
        AccessPattern_LoadCacheLine,    // read only the first int64_t of each element
    };

    // Number of worker threads; each test's size is divided evenly among them.
    const int ThreadCount = 1;

    // Size in bytes of one Data element (the unit the benchmark steps through).
    const int CacheLineSize = 64;
    struct NN_ALIGNAS(CacheLineSize) Data
    {
        int64_t dummy[CacheLineSize / sizeof(int64_t)]; // Payload values that the benchmark loads and sums
    };

    NN_STATIC_ASSERT(sizeof(Data) == CacheLineSize);

    // Arguments handed to one worker thread.
    struct ThreadArgs
    {
        size_t offset;                  // byte offset of this thread's slice within the working memory
        size_t size;                    // byte size of this thread's slice
        int loop;                       // number of passes over the slice
        AccessPattern accessPattern;    // which loads to perform on each element
    };

    // One row of the test set.
    struct TestSetEntry
    {
        const char* name;               // label printed in the result table
        size_t size;                    // total number of bytes covered per pass (all threads combined)
        int loop;                       // number of passes
        AccessPattern accessPattern;    // access pattern used by every thread
    };
    // NOTE(review): the 2 MB label previously read "TX2 L1 size"; 2 MB corresponds to the
    // Tegra X1 L2 cache (its L1 is the 32 KB row) -- confirm against the target hardware.
    const TestSetEntry g_TestSet[] = {
        { " 32KB (TX1 L1 size)      ",          32 * 1024,  256 * 1024, AccessPattern_LoadAllData },
        { "128KB                    ",         128 * 1024,   64 * 1024, AccessPattern_LoadAllData },
        { "512KB                    ",         512 * 1024,   16 * 1024, AccessPattern_LoadAllData },
        { "  2MB (TX1 L2 size)      ",    2 * 1024 * 1024,    4 * 1024, AccessPattern_LoadAllData },
        { "  8MB                    ",    8 * 1024 * 1024,        1024, AccessPattern_LoadAllData },
        { " 32MB (load all data)    ",   32 * 1024 * 1024,         256, AccessPattern_LoadAllData },
        { " 32MB (load cache line)  ",   32 * 1024 * 1024,         256, AccessPattern_LoadCacheLine },
        { "128MB (load all data)    ",  128 * 1024 * 1024,          64, AccessPattern_LoadAllData },
        { "128MB (load cache line)  ",  128 * 1024 * 1024,          64, AccessPattern_LoadCacheLine },
        { "512MB (load all data)    ",  512 * 1024 * 1024,          16, AccessPattern_LoadAllData },
        { "512MB (load cache line)  ",  512 * 1024 * 1024,          16, AccessPattern_LoadCacheLine },
    };

    // Working memory that is accessed in order to measure bandwidth.
    // Its size equals the largest entry in g_TestSet.
    const size_t WorkMemorySize = 512 * 1024 * 1024;
    NN_ALIGNAS(sizeof(Data) / sizeof(int8_t)) Data g_WorkMemory[WorkMemorySize / sizeof(Data)];

    // Sink for the value computed from the working memory contents. It is written
    // through a volatile pointer so the compiler cannot discard the loads.
    int64_t g_Dummy;
    volatile int64_t* g_pDummy = &g_Dummy;

    // Thread stacks, one per worker thread.
    const int ThreadStackSize = 4096;
    NN_ALIGNAS(nn::os::ThreadStackAlignment) uint8_t g_ThreadStacks[ThreadCount][ThreadStackSize];
}

// Fills every int64_t slot of g_WorkMemory, in address order, with successive
// outputs of a default-constructed (default-seeded) std::mt19937_64.
void InitializeWorkMemory()
{
    std::mt19937_64 engine;
    for (auto& element : g_WorkMemory)
    {
        for (auto& value : element.dummy)
        {
            value = static_cast<int64_t>(engine());
        }
    }
}

void ThreadFunc(void* pArgs)
{
    auto offset = reinterpret_cast<ThreadArgs*>(pArgs)->offset;
    auto size = reinterpret_cast<ThreadArgs*>(pArgs)->size;
    auto loop = reinterpret_cast<ThreadArgs*>(pArgs)->loop;
    auto accessPattern = reinterpret_cast<ThreadArgs*>(pArgs)->accessPattern;

    int64_t dummy = 0;

    switch (accessPattern)
    {
    case AccessPattern_LoadAllData:
        for (int i = 0; i < loop; i++)
        {
            // Check array size for loop-unrolling
            NN_ABORT_UNLESS_EQUAL(8U, sizeof(g_WorkMemory[0].dummy) / sizeof(g_WorkMemory[0].dummy[0]));

            auto startIndex = offset / sizeof(g_WorkMemory[0]);
            auto endIndex = (offset + size) / sizeof(g_WorkMemory[0]);

            for (size_t j = startIndex; j < endIndex; j++)
            {
                dummy += g_WorkMemory[j].dummy[0];
                dummy += g_WorkMemory[j].dummy[1];
                dummy += g_WorkMemory[j].dummy[2];
                dummy += g_WorkMemory[j].dummy[3];
                dummy += g_WorkMemory[j].dummy[4];
                dummy += g_WorkMemory[j].dummy[5];
                dummy += g_WorkMemory[j].dummy[6];
                dummy += g_WorkMemory[j].dummy[7];
            }
        }
        break;
    case AccessPattern_LoadCacheLine:
        // Check alignment for loop-unrolling
        NN_ABORT_UNLESS_EQUAL(0U, (size / sizeof(g_WorkMemory[0])) % 4);

        for (int i = 0; i < loop; i++)
        {
            auto startIndex = offset / sizeof(g_WorkMemory[0]);
            auto endIndex = (offset + size) / sizeof(g_WorkMemory[0]);

            for (size_t j = startIndex; j < endIndex; j += 4)
            {
                dummy += g_WorkMemory[j + 0].dummy[0];
                dummy += g_WorkMemory[j + 1].dummy[0];
                dummy += g_WorkMemory[j + 2].dummy[0];
                dummy += g_WorkMemory[j + 3].dummy[0];
            }
        }
        break;
    default:
        NN_UNEXPECTED_DEFAULT;
    }

    *g_pDummy = dummy;
}

extern "C" void nnMain()
{
#if defined(NN_BUILD_CONFIG_SPEC_NX)
    nn::oe::DisableRecording();
#endif

    nn::os::ThreadType threads[ThreadCount];
    ThreadArgs args[ThreadCount];

    InitializeWorkMemory();

    NN_LOG("%32s%16s%16s%16s\n", "", "LoopCount", "ExecutionTime", "Performance");
    for (int i = 0; i < sizeof(g_TestSet) / sizeof(g_TestSet[0]); i++)
    {
        auto& testSet = g_TestSet[i];

        for (int j = 0; j < ThreadCount; j++)
        {
            args[j].size = testSet.size / ThreadCount;
            args[j].offset = args[j].size * j;
            args[j].loop = testSet.loop;
            args[j].accessPattern = testSet.accessPattern;

            auto idealCoreNumber = j;
            nn::os::CreateThread(&threads[j], ThreadFunc, &args[j], g_ThreadStacks[j], ThreadStackSize, nn::os::HighestThreadPriority, idealCoreNumber);
        }

        auto start = nn::os::GetSystemTick();

        for (int j = 0; j < ThreadCount; j++)
        {
            nn::os::StartThread(&threads[j]);
        }

        for (int j = 0; j < ThreadCount; j++)
        {
            nn::os::WaitThread(&threads[j]);
        }

        auto end = nn::os::GetSystemTick();

        for (int j = 0; j < ThreadCount; j++)
        {
            nn::os::DestroyThread(&threads[j]);
        }

        auto executionTime = static_cast<float>(nn::os::ConvertToTimeSpan(end - start).GetMicroSeconds()) / 1000;
        auto performance = testSet.size * testSet.loop / nn::os::ConvertToTimeSpan(end - start).GetMicroSeconds();

        NN_LOG("%15s:%11d loop %9.3f ms%11d MB/s\n", testSet.name, testSet.loop, executionTime, performance);
    }

#if defined(NN_BUILD_CONFIG_SPEC_NX)
    nn::oe::EnableRecording();
#endif
}
