﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <cstring>
#include <nn/nn_SdkAssert.h>
#include <nn/nn_Common.h>
#include <nn/crypto/crypto_Config.h>
#include <nn/crypto/detail/crypto_AesImpl.h>
#include <nn/crypto/detail/crypto_Clear.h>
#include "crypto_AesImpl-cpu.x86x64.h"

#if !defined(NN_BUILD_CONFIG_ENDIAN_LITTLE)
    #error unsupported endian.
#endif

namespace nn { namespace crypto { namespace detail {

namespace
{
    // Number of 32-bit words that make up one cipher block
    // (AesImpl<16>::BlockSize bytes divided by sizeof(Bit32)).
    const int BlockWords = AesImpl<16>::BlockSize / sizeof(Bit32);

    /*
     * Substitution table for the SubBytes step.
     * Indexed by the input byte value (0..255); used by Func0, Func1 and Func3.
     */
    const Bit8 SubBytesTable[256] =
    {
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
        0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
        0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
        0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
        0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
        0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
        0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
        0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
        0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
        0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
        0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
    };

    /*
     * Substitution table for the InvSubBytes step.
     * Indexed by the input byte value (0..255); used by Func5.
     */
    const Bit8 InvSubBytesTable[256] =
    {
        0x52 ,0x09 ,0x6a ,0xd5 ,0x30 ,0x36 ,0xa5 ,0x38 ,0xbf ,0x40 ,0xa3 ,0x9e ,0x81 ,0xf3 ,0xd7 ,0xfb,
        0x7c ,0xe3 ,0x39 ,0x82 ,0x9b ,0x2f ,0xff ,0x87 ,0x34 ,0x8e ,0x43 ,0x44 ,0xc4 ,0xde ,0xe9 ,0xcb,
        0x54 ,0x7b ,0x94 ,0x32 ,0xa6 ,0xc2 ,0x23 ,0x3d ,0xee ,0x4c ,0x95 ,0x0b ,0x42 ,0xfa ,0xc3 ,0x4e,
        0x08 ,0x2e ,0xa1 ,0x66 ,0x28 ,0xd9 ,0x24 ,0xb2 ,0x76 ,0x5b ,0xa2 ,0x49 ,0x6d ,0x8b ,0xd1 ,0x25,
        0x72 ,0xf8 ,0xf6 ,0x64 ,0x86 ,0x68 ,0x98 ,0x16 ,0xd4 ,0xa4 ,0x5c ,0xcc ,0x5d ,0x65 ,0xb6 ,0x92,
        0x6c ,0x70 ,0x48 ,0x50 ,0xfd ,0xed ,0xb9 ,0xda ,0x5e ,0x15 ,0x46 ,0x57 ,0xa7 ,0x8d ,0x9d ,0x84,
        0x90 ,0xd8 ,0xab ,0x00 ,0x8c ,0xbc ,0xd3 ,0x0a ,0xf7 ,0xe4 ,0x58 ,0x05 ,0xb8 ,0xb3 ,0x45 ,0x06,
        0xd0 ,0x2c ,0x1e ,0x8f ,0xca ,0x3f ,0x0f ,0x02 ,0xc1 ,0xaf ,0xbd ,0x03 ,0x01 ,0x13 ,0x8a ,0x6b,
        0x3a ,0x91 ,0x11 ,0x41 ,0x4f ,0x67 ,0xdc ,0xea ,0x97 ,0xf2 ,0xcf ,0xce ,0xf0 ,0xb4 ,0xe6 ,0x73,
        0x96 ,0xac ,0x74 ,0x22 ,0xe7 ,0xad ,0x35 ,0x85 ,0xe2 ,0xf9 ,0x37 ,0xe8 ,0x1c ,0x75 ,0xdf ,0x6e,
        0x47 ,0xf1 ,0x1a ,0x71 ,0x1d ,0x29 ,0xc5 ,0x89 ,0x6f ,0xb7 ,0x62 ,0x0e ,0xaa ,0x18 ,0xbe ,0x1b,
        0xfc ,0x56 ,0x3e ,0x4b ,0xc6 ,0xd2 ,0x79 ,0x20 ,0x9a ,0xdb ,0xc0 ,0xfe ,0x78 ,0xcd ,0x5a ,0xf4,
        0x1f ,0xdd ,0xa8 ,0x33 ,0x88 ,0x07 ,0xc7 ,0x31 ,0xb1 ,0x12 ,0x10 ,0x59 ,0x27 ,0x80 ,0xec ,0x5f,
        0x60 ,0x51 ,0x7f ,0xa9 ,0x19 ,0xb5 ,0x4a ,0x0d ,0x2d ,0xe5 ,0x7a ,0x9f ,0x93 ,0xc9 ,0x9c ,0xef,
        0xa0 ,0xe0 ,0x3b ,0x4d ,0xae ,0x2a ,0xf5 ,0xb0 ,0xc8 ,0xeb ,0xbb ,0x3c ,0x83 ,0x53 ,0x99 ,0x61,
        0x17 ,0x2b ,0x04 ,0x7e ,0xba ,0x77 ,0xd6 ,0x26 ,0xe1 ,0x69 ,0x14 ,0x63 ,0x55 ,0x21 ,0x0c ,0x7d,
    };

    /*
     * Constants used during key expansion.
     * Initialize() XORs entry (i / KeySizeInWord - 1) into the schedule word
     * whenever the word index i is a multiple of the key length in words.
     */
    const Bit8 RoundKeyRcon0[] =
    {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
        0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4,
        0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
    };

    // Little-endian layout: bit offset of byte n inside a 32-bit word is 8 * n.
    const int AesWordByte0 = 0 * 8;
    const int AesWordByte1 = 1 * 8;
    const int AesWordByte2 = 2 * 8;
    const int AesWordByte3 = 3 * 8;
    // Rotation amount (in bits) applied between accumulation steps in InvMixColumns.
    const int MixShift = 3 * 8;

    /*
     * 32-bit lookup table for the table-driven encryption rounds (see Func2).
     * Indexed by a state byte; Func2 rotates the looked-up entries by 0/8/16/24 bits
     * and XORs them together. For example, entry 0 (0xA56363C6) packs, from the low
     * byte upward, SubBytesTable[0] (0x63) multiplied by 2, 1, 1 and 3 in GF(2^8).
     */
    const Bit32 EncryptTable[256] =
    {
        0xA56363C6,0x847C7CF8,0x997777EE,0x8D7B7BF6,0x0DF2F2FF,0xBD6B6BD6,0xB16F6FDE,0x54C5C591,
        0x50303060,0x03010102,0xA96767CE,0x7D2B2B56,0x19FEFEE7,0x62D7D7B5,0xE6ABAB4D,0x9A7676EC,
        0x45CACA8F,0x9D82821F,0x40C9C989,0x877D7DFA,0x15FAFAEF,0xEB5959B2,0xC947478E,0x0BF0F0FB,
        0xECADAD41,0x67D4D4B3,0xFDA2A25F,0xEAAFAF45,0xBF9C9C23,0xF7A4A453,0x967272E4,0x5BC0C09B,
        0xC2B7B775,0x1CFDFDE1,0xAE93933D,0x6A26264C,0x5A36366C,0x413F3F7E,0x02F7F7F5,0x4FCCCC83,
        0x5C343468,0xF4A5A551,0x34E5E5D1,0x08F1F1F9,0x937171E2,0x73D8D8AB,0x53313162,0x3F15152A,
        0x0C040408,0x52C7C795,0x65232346,0x5EC3C39D,0x28181830,0xA1969637,0x0F05050A,0xB59A9A2F,
        0x0907070E,0x36121224,0x9B80801B,0x3DE2E2DF,0x26EBEBCD,0x6927274E,0xCDB2B27F,0x9F7575EA,
        0x1B090912,0x9E83831D,0x742C2C58,0x2E1A1A34,0x2D1B1B36,0xB26E6EDC,0xEE5A5AB4,0xFBA0A05B,
        0xF65252A4,0x4D3B3B76,0x61D6D6B7,0xCEB3B37D,0x7B292952,0x3EE3E3DD,0x712F2F5E,0x97848413,
        0xF55353A6,0x68D1D1B9,0x00000000,0x2CEDEDC1,0x60202040,0x1FFCFCE3,0xC8B1B179,0xED5B5BB6,
        0xBE6A6AD4,0x46CBCB8D,0xD9BEBE67,0x4B393972,0xDE4A4A94,0xD44C4C98,0xE85858B0,0x4ACFCF85,
        0x6BD0D0BB,0x2AEFEFC5,0xE5AAAA4F,0x16FBFBED,0xC5434386,0xD74D4D9A,0x55333366,0x94858511,
        0xCF45458A,0x10F9F9E9,0x06020204,0x817F7FFE,0xF05050A0,0x443C3C78,0xBA9F9F25,0xE3A8A84B,
        0xF35151A2,0xFEA3A35D,0xC0404080,0x8A8F8F05,0xAD92923F,0xBC9D9D21,0x48383870,0x04F5F5F1,
        0xDFBCBC63,0xC1B6B677,0x75DADAAF,0x63212142,0x30101020,0x1AFFFFE5,0x0EF3F3FD,0x6DD2D2BF,
        0x4CCDCD81,0x140C0C18,0x35131326,0x2FECECC3,0xE15F5FBE,0xA2979735,0xCC444488,0x3917172E,
        0x57C4C493,0xF2A7A755,0x827E7EFC,0x473D3D7A,0xAC6464C8,0xE75D5DBA,0x2B191932,0x957373E6,
        0xA06060C0,0x98818119,0xD14F4F9E,0x7FDCDCA3,0x66222244,0x7E2A2A54,0xAB90903B,0x8388880B,
        0xCA46468C,0x29EEEEC7,0xD3B8B86B,0x3C141428,0x79DEDEA7,0xE25E5EBC,0x1D0B0B16,0x76DBDBAD,
        0x3BE0E0DB,0x56323264,0x4E3A3A74,0x1E0A0A14,0xDB494992,0x0A06060C,0x6C242448,0xE45C5CB8,
        0x5DC2C29F,0x6ED3D3BD,0xEFACAC43,0xA66262C4,0xA8919139,0xA4959531,0x37E4E4D3,0x8B7979F2,
        0x32E7E7D5,0x43C8C88B,0x5937376E,0xB76D6DDA,0x8C8D8D01,0x64D5D5B1,0xD24E4E9C,0xE0A9A949,
        0xB46C6CD8,0xFA5656AC,0x07F4F4F3,0x25EAEACF,0xAF6565CA,0x8E7A7AF4,0xE9AEAE47,0x18080810,
        0xD5BABA6F,0x887878F0,0x6F25254A,0x722E2E5C,0x241C1C38,0xF1A6A657,0xC7B4B473,0x51C6C697,
        0x23E8E8CB,0x7CDDDDA1,0x9C7474E8,0x211F1F3E,0xDD4B4B96,0xDCBDBD61,0x868B8B0D,0x858A8A0F,
        0x907070E0,0x423E3E7C,0xC4B5B571,0xAA6666CC,0xD8484890,0x05030306,0x01F6F6F7,0x120E0E1C,
        0xA36161C2,0x5F35356A,0xF95757AE,0xD0B9B969,0x91868617,0x58C1C199,0x271D1D3A,0xB99E9E27,
        0x38E1E1D9,0x13F8F8EB,0xB398982B,0x33111122,0xBB6969D2,0x70D9D9A9,0x898E8E07,0xA7949433,
        0xB69B9B2D,0x221E1E3C,0x92878715,0x20E9E9C9,0x49CECE87,0xFF5555AA,0x78282850,0x7ADFDFA5,
        0x8F8C8C03,0xF8A1A159,0x80898909,0x170D0D1A,0xDABFBF65,0x31E6E6D7,0xC6424284,0xB86868D0,
        0xC3414182,0xB0999929,0x772D2D5A,0x110F0F1E,0xCBB0B07B,0xFC5454A8,0xD6BBBB6D,0x3A16162C,
    };

    /*
     * 32-bit lookup table for the table-driven decryption rounds (see Func4).
     * Indexed by a state byte; Func4 rotates the looked-up entries by 0/8/16/24 bits
     * and XORs them together. For example, entry 0 (0x50A7F451) packs, from the low
     * byte upward, InvSubBytesTable[0] (0x52) multiplied by 0x0E, 0x09, 0x0D and 0x0B
     * in GF(2^8).
     */
    const Bit32 DecryptTable[256] =
    {
        0x50A7F451,0x5365417E,0xC3A4171A,0x965E273A,0xCB6BAB3B,0xF1459D1F,0xAB58FAAC,0x9303E34B,
        0x55FA3020,0xF66D76AD,0x9176CC88,0x254C02F5,0xFCD7E54F,0xD7CB2AC5,0x80443526,0x8FA362B5,
        0x495AB1DE,0x671BBA25,0x980EEA45,0xE1C0FE5D,0x02752FC3,0x12F04C81,0xA397468D,0xC6F9D36B,
        0xE75F8F03,0x959C9215,0xEB7A6DBF,0xDA595295,0x2D83BED4,0xD3217458,0x2969E049,0x44C8C98E,
        0x6A89C275,0x78798EF4,0x6B3E5899,0xDD71B927,0xB64FE1BE,0x17AD88F0,0x66AC20C9,0xB43ACE7D,
        0x184ADF63,0x82311AE5,0x60335197,0x457F5362,0xE07764B1,0x84AE6BBB,0x1CA081FE,0x942B08F9,
        0x58684870,0x19FD458F,0x876CDE94,0xB7F87B52,0x23D373AB,0xE2024B72,0x578F1FE3,0x2AAB5566,
        0x0728EBB2,0x03C2B52F,0x9A7BC586,0xA50837D3,0xF2872830,0xB2A5BF23,0xBA6A0302,0x5C8216ED,
        0x2B1CCF8A,0x92B479A7,0xF0F207F3,0xA1E2694E,0xCDF4DA65,0xD5BE0506,0x1F6234D1,0x8AFEA6C4,
        0x9D532E34,0xA055F3A2,0x32E18A05,0x75EBF6A4,0x39EC830B,0xAAEF6040,0x069F715E,0x51106EBD,
        0xF98A213E,0x3D06DD96,0xAE053EDD,0x46BDE64D,0xB58D5491,0x055DC471,0x6FD40604,0xFF155060,
        0x24FB9819,0x97E9BDD6,0xCC434089,0x779ED967,0xBD42E8B0,0x888B8907,0x385B19E7,0xDBEEC879,
        0x470A7CA1,0xE90F427C,0xC91E84F8,0x00000000,0x83868009,0x48ED2B32,0xAC70111E,0x4E725A6C,
        0xFBFF0EFD,0x5638850F,0x1ED5AE3D,0x27392D36,0x64D90F0A,0x21A65C68,0xD1545B9B,0x3A2E3624,
        0xB1670A0C,0x0FE75793,0xD296EEB4,0x9E919B1B,0x4FC5C080,0xA220DC61,0x694B775A,0x161A121C,
        0x0ABA93E2,0xE52AA0C0,0x43E0223C,0x1D171B12,0x0B0D090E,0xADC78BF2,0xB9A8B62D,0xC8A91E14,
        0x8519F157,0x4C0775AF,0xBBDD99EE,0xFD607FA3,0x9F2601F7,0xBCF5725C,0xC53B6644,0x347EFB5B,
        0x7629438B,0xDCC623CB,0x68FCEDB6,0x63F1E4B8,0xCADC31D7,0x10856342,0x40229713,0x2011C684,
        0x7D244A85,0xF83DBBD2,0x1132F9AE,0x6DA129C7,0x4B2F9E1D,0xF330B2DC,0xEC52860D,0xD0E3C177,
        0x6C16B32B,0x99B970A9,0xFA489411,0x2264E947,0xC48CFCA8,0x1A3FF0A0,0xD82C7D56,0xEF903322,
        0xC74E4987,0xC1D138D9,0xFEA2CA8C,0x360BD498,0xCF81F5A6,0x28DE7AA5,0x268EB7DA,0xA4BFAD3F,
        0xE49D3A2C,0x0D927850,0x9BCC5F6A,0x62467E54,0xC2138DF6,0xE8B8D890,0x5EF7392E,0xF5AFC382,
        0xBE805D9F,0x7C93D069,0xA92DD56F,0xB31225CF,0x3B99ACC8,0xA77D1810,0x6E639CE8,0x7BBB3BDB,
        0x097826CD,0xF418596E,0x01B79AEC,0xA89A4F83,0x656E95E6,0x7EE6FFAA,0x08CFBC21,0xE6E815EF,
        0xD99BE7BA,0xCE366F4A,0xD4099FEA,0xD67CB029,0xAFB2A431,0x31233F2A,0x3094A5C6,0xC066A235,
        0x37BC4E74,0xA6CA82FC,0xB0D090E0,0x15D8A733,0x4A9804F1,0xF7DAEC41,0x0E50CD7F,0x2FF69117,
        0x8DD64D76,0x4DB0EF43,0x544DAACC,0xDF0496E4,0xE3B5D19E,0x1B886A4C,0xB81F2CC1,0x7F516546,
        0x04EA5E9D,0x5D358C01,0x737487FA,0x2E410BFB,0x5A1D67B3,0x52D2DB92,0x335610E9,0x1347D66D,
        0x8C61D79A,0x7A0CA137,0x8E14F859,0x893C13EB,0xEE27A9CE,0x35C961B7,0xEDE51CE1,0x3CB1477A,
        0x59DFD29C,0x3F73F255,0x79CE1418,0xBF37C773,0xEACDF753,0x5BAAFD5F,0x146F3DDF,0x86DB4478,
        0x81F3AFCA,0x3EC468B9,0x2C342438,0x5F40A3C2,0x72C31D16,0x0C25E2BC,0x8B493C28,0x41950DFF,
        0x7101A839,0xDEB30C08,0x9CE4B4D8,0x90C15664,0x6184CB7B,0x70B632D5,0x745C6C48,0x4257B8D0,
    };

    /*
     * Rotates a 32-bit value left by `shift` bits.
     * A shift of 0 returns the value untouched, which avoids a right shift by 32.
     * Callers in this file only pass 0, 8, 16 or 24.
     */
    inline Bit32 RotateLeft(Bit32 value, int shift) NN_NOEXCEPT
    {
        if (shift == 0)
        {
            return value;
        }

        const Bit32 upperPart = value << shift;
        const Bit32 lowerPart = value >> (32 - shift);
        return upperPart | lowerPart;
    }

    /*
     * Applies InvMixColumns to one block of BlockWords 32-bit words.
     *
     * pDst receives the transformed words; pSrc supplies the input words.
     * The two may point at the same buffer: each word is read before the
     * corresponding output word is written.
     */
    inline void InvMixColumns(Bit32* pDst, const Bit32* pSrc) NN_NOEXCEPT
    {
        // Doubles each of the four packed bytes in GF(2^8): the low seven bits of
        // every byte are shifted up by one, and 0x1B is folded in for every byte
        // whose top bit was set.
        const auto doubleBytes = [](Bit32 v) -> Bit32
        {
            const Bit32 topBits  = v & 0x80808080UL;
            const Bit32 lowBits  = v & ~0x80808080UL;
            return static_cast<Bit32>((lowBits << 1UL) ^ ((topBits >> 7UL) * 0x1BUL));
        };

        for (int word = 0; word < BlockWords; ++word)
        {
            const Bit32 times1 = pSrc[word];
            const Bit32 times2 = doubleBytes(times1);
            const Bit32 times4 = doubleBytes(times2);
            const Bit32 times8 = doubleBytes(times4);

            // Accumulate with a rotation by MixShift between each step.
            Bit32 acc = static_cast<Bit32>(times1 ^ times8);
            acc ^= (RotateLeft(acc, MixShift) ^ times4);
            acc ^= (RotateLeft(acc, MixShift) ^ times2);
            acc ^= (RotateLeft(acc, MixShift) ^ times1);
            pDst[word] = acc;
        }
    }

    /*
     * Key-expansion helper: substitutes every byte of x through SubBytesTable and
     * rotates the bytes by one position (input byte 0 -> output byte 3,
     * 1 -> 0, 2 -> 1, 3 -> 2). Byte n is the 8 bits at bit offset 8 * n.
     */
    inline Bit32 Func0(Bit32 x) NN_NOEXCEPT
    {
        // Widen each table entry to Bit32 before shifting. A Bit8 operand would be
        // promoted to signed int, and shifting a value >= 0x80 left by 24 bits
        // would run into the sign bit.
        const Bit32 s0 = SubBytesTable[(x >> AesWordByte0) & 0xFFUL];
        const Bit32 s1 = SubBytesTable[(x >> AesWordByte1) & 0xFFUL];
        const Bit32 s2 = SubBytesTable[(x >> AesWordByte2) & 0xFFUL];
        const Bit32 s3 = SubBytesTable[(x >> AesWordByte3) & 0xFFUL];

        return ( (s0 << AesWordByte3)
                 ^ (s1 << AesWordByte0)
                 ^ (s2 << AesWordByte1)
                 ^ (s3 << AesWordByte2) );
    }
    /*
     * Key-expansion helper: substitutes every byte of x through SubBytesTable,
     * keeping each byte in its original position.
     */
    inline Bit32 Func1(Bit32 x) NN_NOEXCEPT
    {
        // Widen each table entry to Bit32 before shifting. A Bit8 operand would be
        // promoted to signed int, and shifting a value >= 0x80 left by 24 bits
        // would run into the sign bit.
        const Bit32 s0 = SubBytesTable[(x >> AesWordByte0) & 0xFFUL];
        const Bit32 s1 = SubBytesTable[(x >> AesWordByte1) & 0xFFUL];
        const Bit32 s2 = SubBytesTable[(x >> AesWordByte2) & 0xFFUL];
        const Bit32 s3 = SubBytesTable[(x >> AesWordByte3) & 0xFFUL];

        return ( (s0 << AesWordByte0)
                 ^ (s1 << AesWordByte1)
                 ^ (s2 << AesWordByte2)
                 ^ (s3 << AesWordByte3) );
    }
    /*
     * One output word of a table-driven encryption round (before AddRoundKey).
     * Byte n of xn selects an EncryptTable entry, which is rotated left by 8 * n
     * bits; the four rotated entries are XORed together.
     */
    inline Bit32 Func2(Bit32 x0, Bit32 x1, Bit32 x2, Bit32 x3) NN_NOEXCEPT
    {
        const Bit32 entry0 = EncryptTable[(x0 >> AesWordByte0) & 0xFFUL];
        const Bit32 entry1 = EncryptTable[(x1 >> AesWordByte1) & 0xFFUL];
        const Bit32 entry2 = EncryptTable[(x2 >> AesWordByte2) & 0xFFUL];
        const Bit32 entry3 = EncryptTable[(x3 >> AesWordByte3) & 0xFFUL];

        Bit32 mixed = RotateLeft(entry0, AesWordByte0);
        mixed ^= RotateLeft(entry1, AesWordByte1);
        mixed ^= RotateLeft(entry2, AesWordByte2);
        mixed ^= RotateLeft(entry3, AesWordByte3);
        return mixed;
    }
    /*
     * One output word of the final encryption round (before AddRoundKey).
     * Byte n of xn is substituted through SubBytesTable and placed at byte
     * position n of the result.
     */
    inline Bit32 Func3(Bit32 x0, Bit32 x1, Bit32 x2, Bit32 x3) NN_NOEXCEPT
    {
        // Widen each table entry to Bit32 before shifting. A Bit8 operand would be
        // promoted to signed int, and shifting a value >= 0x80 left by 24 bits
        // would run into the sign bit.
        const Bit32 s0 = SubBytesTable[(x0 >> AesWordByte0) & 0xFFUL];
        const Bit32 s1 = SubBytesTable[(x1 >> AesWordByte1) & 0xFFUL];
        const Bit32 s2 = SubBytesTable[(x2 >> AesWordByte2) & 0xFFUL];
        const Bit32 s3 = SubBytesTable[(x3 >> AesWordByte3) & 0xFFUL];

        return ( (s0 << AesWordByte0)
                 | (s1 << AesWordByte1)
                 | (s2 << AesWordByte2)
                 | (s3 << AesWordByte3) );
    }
    /*
     * One output word of a table-driven decryption round (before AddRoundKey).
     * Byte n of xn selects a DecryptTable entry, which is rotated left by 8 * n
     * bits; the four rotated entries are XORed together.
     */
    inline Bit32 Func4(Bit32 x0, Bit32 x1, Bit32 x2, Bit32 x3) NN_NOEXCEPT
    {
        const Bit32 entry0 = DecryptTable[(x0 >> AesWordByte0) & 0xFFUL];
        const Bit32 entry1 = DecryptTable[(x1 >> AesWordByte1) & 0xFFUL];
        const Bit32 entry2 = DecryptTable[(x2 >> AesWordByte2) & 0xFFUL];
        const Bit32 entry3 = DecryptTable[(x3 >> AesWordByte3) & 0xFFUL];

        Bit32 mixed = RotateLeft(entry0, AesWordByte0);
        mixed ^= RotateLeft(entry1, AesWordByte1);
        mixed ^= RotateLeft(entry2, AesWordByte2);
        mixed ^= RotateLeft(entry3, AesWordByte3);
        return mixed;
    }
    /*
     * One output word of the final decryption round (before AddRoundKey).
     * Byte n of xn is substituted through InvSubBytesTable and placed at byte
     * position n of the result.
     */
    inline Bit32 Func5(Bit32 x0, Bit32 x1, Bit32 x2, Bit32 x3) NN_NOEXCEPT
    {
        // Widen each table entry to Bit32 before shifting. A Bit8 operand would be
        // promoted to signed int, and shifting a value >= 0x80 left by 24 bits
        // would run into the sign bit.
        const Bit32 s0 = InvSubBytesTable[(x0 >> AesWordByte0) & 0xFFUL];
        const Bit32 s1 = InvSubBytesTable[(x1 >> AesWordByte1) & 0xFFUL];
        const Bit32 s2 = InvSubBytesTable[(x2 >> AesWordByte2) & 0xFFUL];
        const Bit32 s3 = InvSubBytesTable[(x3 >> AesWordByte3) & 0xFFUL];

        return ( (s0 << AesWordByte0)
                 ^ (s1 << AesWordByte1)
                 ^ (s2 << AesWordByte2)
                 ^ (s3 << AesWordByte3) );
    }

    /*
     * Queries CPUID leaf 1 and reports whether both AES-NI and SSE2 are available.
     * Bit 25 of the third returned word is tested for AES-NI and bit 26 of the
     * fourth returned word for SSE2.
     */
    inline bool GetAesNiAvailability() NN_NOEXCEPT
    {
        int regs[4];
        __cpuid(regs, 0x00000001);

        const bool hasAesNi = (regs[2] & (1 << 25)) != 0;
        const bool hasSse2  = (regs[3] & (1 << 26)) != 0;
        return hasAesNi && hasSse2;
    }

}   // anonymous namespace

// Initialized at namespace scope from GetAesNiAvailability(). Initialize(),
// EncryptBlock() and DecryptBlock() branch on it to choose between the AES-NI
// intrinsics and the table-based software implementation.
const bool g_IsAesNiAvailable = GetAesNiAvailability();

/*
 * Destructor: hands the entire object to ClearMemory so that the round keys
 * held in m_RoundKey do not remain in memory after destruction.
 */
template <size_t KeySize>
AesImpl<KeySize>::~AesImpl() NN_NOEXCEPT
{
    ClearMemory(this, sizeof(AesImpl<KeySize>));
}

/*
 * Expands the user key into the round-key schedule stored in m_RoundKey.
 *
 * pKey            Key bytes; must not be null.
 * keySize         Size of the key in bytes; must equal KeySize.
 * isEncryptionKey true to prepare the schedule for EncryptBlock(); false to
 *                 prepare it for DecryptBlock(), in which case round keys
 *                 1 .. RoundCount - 1 are additionally passed through InvMixColumns.
 */
template <size_t KeySize>
void AesImpl<KeySize>::Initialize(const void* pKey, size_t keySize, bool isEncryptionKey) NN_NOEXCEPT
{
    NN_STATIC_ASSERT(BlockSize == sizeof(__m128i));
    NN_SDK_REQUIRES_NOT_NULL(pKey);
    // keySize is a size_t; cast it so the argument type matches the %d conversion.
    NN_SDK_REQUIRES(keySize == KeySize, "invalid key size. keySize(=%d) must be either 16, 24, 32", static_cast<int>(keySize));

    /* Key expansion */
    const int KeySizeInWord = static_cast<int>(keySize / sizeof(Bit32));
    Bit32*    pDst = m_RoundKey;

    /* The first KeySizeInWord words of the schedule are the key itself */
    std::memcpy(pDst, pKey, keySize);

    /* Start from the last word of the previous stage */
    Bit32 reg = pDst[KeySizeInWord - 1];

    for (int i = KeySizeInWord; i < (RoundCount + 1) * 4; ++i)
    {
        if ((i % KeySizeInWord) == 0)
        {
            /* SubWord + RotWord + XorRcon */
            reg = Func0(reg);
            reg ^= (RoundKeyRcon0[i / KeySizeInWord - 1] << AesWordByte0);
        }
        else if ((KeySizeInWord > 6) && ((i % KeySizeInWord) == 4))
        {
            /* Keys longer than 6 words get an extra SubWord at offset 4 */
            reg = Func1(reg);
        }

        reg ^= pDst[i - KeySizeInWord];
        pDst[i] = reg;
    }

    /*
     * Because InvMixColumns(a ^ b) = InvMixColumns(a) ^ InvMixColumns(b),
     * the AddRoundKey step can be deferred: applying InvMixColumns to the inner
     * round keys up front lets decryption use the same efficient table-lookup
     * structure as encryption.
     */
    if (!isEncryptionKey)
    {
        if (g_IsAesNiAvailable)
        {
            auto pKey8 = reinterpret_cast<char*>(m_RoundKey);
            pKey8 += BlockSize;     // round key 0 is left untouched

            for (int i = 1; i < RoundCount; ++i)
            {
                const auto pKey128 = reinterpret_cast<__m128i*>(pKey8);
                _mm_storeu_si128(pKey128, _mm_aesimc_si128(_mm_loadu_si128(pKey128)));
                pKey8 += BlockSize;
            }
        }
        else
        {
            for (int i = 1; i < RoundCount; ++i)
            {
                /* In-place transform of round key i */
                InvMixColumns(&m_RoundKey[BlockWords * i], &m_RoundKey[BlockWords * i]);
            }
        }
    }
}

/*
 * Encrypts one block using the round keys in m_RoundKey (expects a schedule
 * prepared by Initialize() with isEncryptionKey == true).
 *
 * pDst    Output buffer; must not be null.
 * dstSize Size of pDst in bytes; must equal BlockSize.
 * pSrc    Input buffer; must not be null.
 * srcSize Size of pSrc in bytes; must equal BlockSize.
 *
 * Neither buffer needs any particular alignment, and pDst may overlap pSrc:
 * the whole input is read before any output is written.
 */
template <size_t KeySize>
void AesImpl<KeySize>::EncryptBlock(void* pDst, size_t dstSize, const void* pSrc, size_t srcSize) const NN_NOEXCEPT
{
    NN_SDK_REQUIRES_NOT_NULL(pSrc);
    NN_SDK_REQUIRES_NOT_NULL(pDst);
    NN_SDK_REQUIRES(dstSize == BlockSize && srcSize == BlockSize, "invalid block size is specified");
    NN_UNUSED(dstSize);
    NN_UNUSED(srcSize);

    if (g_IsAesNiAvailable)
    {
        auto pKey8 = reinterpret_cast<const char*>(m_RoundKey);

        /* Load the input into a register */
        auto reg = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc));

        /* AddRoundKey */
        reg = _mm_xor_si128(reg, _mm_loadu_si128(reinterpret_cast<const __m128i*>(pKey8)));
        pKey8 += BlockSize;

        for (int round = 1; round < RoundCount; ++round)
        {
            /* SubBytes + ShiftRows + MixColumns + AddRoundKey */
            reg = _mm_aesenc_si128(reg, _mm_loadu_si128(reinterpret_cast<const __m128i*>(pKey8)));
            pKey8 += BlockSize;
        }

        /* SubBytes + ShiftRows + AddRoundKey */
        reg = _mm_aesenclast_si128(reg, _mm_loadu_si128(reinterpret_cast<const __m128i*>(pKey8)));

        /* Write the register out to the output buffer */
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst), reg);
    }
    else
    {
        const Bit32* pKey32 = m_RoundKey;

        /*
         * Copy the input into local words instead of dereferencing pSrc as
         * Bit32*: the caller's buffer is only a void* and may be unaligned or
         * of a different element type.
         */
        Bit32 input[4];
        std::memcpy(input, pSrc, sizeof(input));

        /* AddRoundKey */
        Bit32 tmp0 = input[0] ^ pKey32[0];
        Bit32 tmp1 = input[1] ^ pKey32[1];
        Bit32 tmp2 = input[2] ^ pKey32[2];
        Bit32 tmp3 = input[3] ^ pKey32[3];
        pKey32 += 4;

        for (int round = 1; round < RoundCount; ++round)
        {
            /* ShiftRows + SubBytes + MixColumns */
            const Bit32 mix0 = Func2(tmp0, tmp1, tmp2, tmp3);
            const Bit32 mix1 = Func2(tmp1, tmp2, tmp3, tmp0);
            const Bit32 mix2 = Func2(tmp2, tmp3, tmp0, tmp1);
            const Bit32 mix3 = Func2(tmp3, tmp0, tmp1, tmp2);

            /* AddRoundKey */
            tmp0 = mix0 ^ pKey32[0];
            tmp1 = mix1 ^ pKey32[1];
            tmp2 = mix2 ^ pKey32[2];
            tmp3 = mix3 ^ pKey32[3];
            pKey32 += 4;
        }

        /* ShiftRows + SubBytes + AddRoundKey */
        Bit32 output[4];
        output[0] = (pKey32[0] ^ Func3(tmp0, tmp1, tmp2, tmp3));
        output[1] = (pKey32[1] ^ Func3(tmp1, tmp2, tmp3, tmp0));
        output[2] = (pKey32[2] ^ Func3(tmp2, tmp3, tmp0, tmp1));
        output[3] = (pKey32[3] ^ Func3(tmp3, tmp0, tmp1, tmp2));

        /* Store through memcpy for the same alignment/aliasing reason as the load */
        std::memcpy(pDst, output, sizeof(output));
    }
}

/*
 * Decrypts one block using the round keys in m_RoundKey (expects a schedule
 * prepared by Initialize() with isEncryptionKey == false, i.e. with the inner
 * round keys already passed through InvMixColumns).
 *
 * pDst    Output buffer; must not be null.
 * dstSize Size of pDst in bytes; must equal BlockSize.
 * pSrc    Input buffer; must not be null.
 * srcSize Size of pSrc in bytes; must equal BlockSize.
 *
 * Neither buffer needs any particular alignment, and pDst may overlap pSrc:
 * the whole input is read before any output is written.
 */
template <size_t KeySize>
void AesImpl<KeySize>::DecryptBlock(void* pDst, size_t dstSize, const void* pSrc, size_t srcSize) const NN_NOEXCEPT
{
    NN_SDK_REQUIRES_NOT_NULL(pSrc);
    NN_SDK_REQUIRES_NOT_NULL(pDst);
    NN_SDK_REQUIRES(dstSize == BlockSize && srcSize == BlockSize, "invalid block size is specified");
    NN_UNUSED(dstSize);
    NN_UNUSED(srcSize);

    if (g_IsAesNiAvailable)
    {
        /* Round keys are consumed from the last one backwards */
        auto pKey8 = reinterpret_cast<const char*>(m_RoundKey) + (RoundCount * BlockSize);

        /* Load the input into a register */
        auto reg = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc));

        /* AddRoundKey */
        reg = _mm_xor_si128(reg, _mm_loadu_si128(reinterpret_cast<const __m128i*>(pKey8)));
        pKey8 -= BlockSize;

        for (int round = RoundCount; round > 1; --round)
        {
            /* InvSubBytes + InvShiftRows + InvMixColumns + AddRoundKey */
            reg = _mm_aesdec_si128(reg, _mm_loadu_si128(reinterpret_cast<const __m128i*>(pKey8)));
            pKey8 -= BlockSize;
        }

        /* InvSubBytes + InvShiftRows + AddRoundKey */
        reg = _mm_aesdeclast_si128(reg, _mm_loadu_si128(reinterpret_cast<const __m128i*>(pKey8)));

        /* Write the register out to the output buffer */
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst), reg);
    }
    else
    {
        /* Round keys are consumed from the last one backwards */
        const Bit32* pKey32 = m_RoundKey + 4 * RoundCount;

        /*
         * Copy the input into local words instead of dereferencing pSrc as
         * Bit32*: the caller's buffer is only a void* and may be unaligned or
         * of a different element type.
         */
        Bit32 input[4];
        std::memcpy(input, pSrc, sizeof(input));

        /* AddRoundKey */
        Bit32 tmp0 = input[0] ^ pKey32[0];
        Bit32 tmp1 = input[1] ^ pKey32[1];
        Bit32 tmp2 = input[2] ^ pKey32[2];
        Bit32 tmp3 = input[3] ^ pKey32[3];
        pKey32 -= 4;

        /*
         * Because InvMixColumns(a ^ b) = InvMixColumns(a) ^ InvMixColumns(b),
         * the real AddRoundKey step is deferred so that the table lookups can
         * take the same efficient form as in encryption.
         */
        for (int round = RoundCount; round > 1; --round)
        {
            /* InvShiftRows + InvSubBytes + InvMixColumns (excluding the key) */
            const Bit32 mix0 = Func4(tmp0, tmp3, tmp2, tmp1);
            const Bit32 mix1 = Func4(tmp1, tmp0, tmp3, tmp2);
            const Bit32 mix2 = Func4(tmp2, tmp1, tmp0, tmp3);
            const Bit32 mix3 = Func4(tmp3, tmp2, tmp1, tmp0);

            /* AddRoundKey (round key already passed through InvMixColumns) */
            tmp0 = mix0 ^ pKey32[0];
            tmp1 = mix1 ^ pKey32[1];
            tmp2 = mix2 ^ pKey32[2];
            tmp3 = mix3 ^ pKey32[3];
            pKey32 -= 4;
        }

        /* InvShiftRows + InvSubBytes + AddRoundKey */
        Bit32 output[4];
        output[0] = (pKey32[0] ^ Func5(tmp0, tmp3, tmp2, tmp1));
        output[1] = (pKey32[1] ^ Func5(tmp1, tmp0, tmp3, tmp2));
        output[2] = (pKey32[2] ^ Func5(tmp2, tmp1, tmp0, tmp3));
        output[3] = (pKey32[3] ^ Func5(tmp3, tmp2, tmp1, tmp0));

        /* Store through memcpy for the same alignment/aliasing reason as the load */
        std::memcpy(pDst, output, sizeof(output));
    }
}

/* Explicit template instantiations for key sizes of 16, 24 and 32 bytes */
template class AesImpl<16>;
template class AesImpl<24>;
template class AesImpl<32>;

}}}

