﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <nn/nn_Common.h>
#include <nn/nn_SdkAssert.h>
#include <nn/crypto/detail/crypto_XtsModeImpl.h>
#include <nn/crypto/crypto_AesDecryptor.h>
#include <nn/crypto/crypto_AesEncryptor.h>
#include "crypto_UpdateImpl.h"

#include <arm_neon.h>

/**
 * Configuration parameters:
 *
 * Whether to use unrolled AES-128 code (~2.4x faster)
 * NN_CRYPTO_CONFIG_XTS_UNROLL_AES128
 *
 * Whether to use AArch64 ASM optimizations (~1.4x faster than using unrolled intrinsics)
 * NN_CRYPTO_CONFIG_XTS_USE_ASM
 *
 * Whether to use the SIMD version of Galois Field multiplication (usually not a good idea, see below)
 * NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
 *
 * Every parameter is always defined (to 0 or 1) because the code below tests the
 * values with plain #if rather than #ifdef.
 */

// Cortex-A57 / AArch64: unrolled AES-128 loop using the inline-assembly body.
#if defined(NN_BUILD_CONFIG_CPU_CORTEX_A57_AARCH64)
    #define NN_CRYPTO_CONFIG_XTS_UNROLL_AES128    1
    #define NN_CRYPTO_CONFIG_XTS_USE_ASM          1
    #define NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT  0
// Cortex-A57 / AArch32: unrolled AES-128 loop using intrinsics only (the inline assembly is AArch64).
#elif defined(NN_BUILD_CONFIG_CPU_CORTEX_A57_AARCH32)
    #define NN_CRYPTO_CONFIG_XTS_UNROLL_AES128    1
    #define NN_CRYPTO_CONFIG_XTS_USE_ASM          0
    #define NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT  0
// Any other CPU: no unrolling, only the one-block-at-a-time loops are compiled.
#else
    #define NN_CRYPTO_CONFIG_XTS_UNROLL_AES128    0
    #define NN_CRYPTO_CONFIG_XTS_USE_ASM          0
    #define NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT  0
#endif

namespace nn { namespace crypto { namespace detail {

namespace {

/**
 * Multiplies `tweak` by the polynomial a¹ in the Galois Field GF(2¹²⁸) whose modulus is:
 * a¹²⁸ + a⁷ + a² + a¹ + a⁰
 *
 * The 128-bit tweak (64-bit lane 0 = low half, lane 1 = high half) is shifted left by one bit.
 * If the bit shifted out of the high half was set, the low half is XORed with 0x87
 * (0x87 being the binary representation of a⁷ + a² + a¹ + a⁰).
 *
 * @param tweak  Current 128-bit tweak value.
 * @return       The tweak multiplied by a¹.
 */
NN_FORCEINLINE uint8x16_t GfMult(uint8x16_t tweak)
{
#if NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
    // SIMD version (could be faster in some cases, left here for reference)
    static NN_ALIGNAS(16) const uint8_t modulus[16] = {0x87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
    const uint8x16_t mod = vld1q_u8(modulus);
    uint8x16_t tmp1, tmp2, res;
    // tmp1 holds the tweak with its two 64-bit halves swapped, so that each lane can see
    // the top bit of the *other* half.
    tmp1 = vcombine_u8(vget_high_u8(tweak), vget_low_u8(tweak));
    // Shift both halves left by one; the carries are patched in below.
    res  = vreinterpretq_u8_u64(vshlq_n_u64(vreinterpretq_u64_u8(tweak), 1));
    // tmp2: top bit of the other half as 0/1 (carry); tmp1: same bit smeared to all-ones/all-zeros (mask).
    tmp2 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(tmp1), 63));
    tmp1 = vreinterpretq_u8_s64(vshrq_n_s64(vreinterpretq_s64_u8(tmp1), 63));
    // Nothing is carried into bit 0 of the low half.
    tmp2 = vreinterpretq_u8_u64(vsetq_lane_u64(0, vreinterpretq_u64_u8(tmp2), 0));
    res  = vorrq_u8(res, tmp2);
    // Only lane 0 of `mod` is non-zero, so the reduction lands in the low half only.
    tmp1 = vandq_u8(tmp1, mod);
    return veorq_u8(res, tmp1);
#else
    // ALU version (usually faster as it runs in parallel with the encryption done on the SIMD unit)
    const uint64x2_t halves = vreinterpretq_u64_u8(tweak);
    const uint64_t   lo     = vgetq_lane_u64(halves, 0);
    const uint64_t   hi     = vgetq_lane_u64(halves, 1);

    // All-ones when the top bit of the tweak is set (it overflows on the shift), zero otherwise.
    const uint64_t overflowMask = int64_t(hi) >> 63;

    const uint64_t shiftedHi = (hi << 1) | (lo >> 63);
    const uint64_t shiftedLo = (lo << 1) ^ (overflowMask & 0x87ull);

    return vreinterpretq_u8_u64(vcombine_u64(vmov_n_u64(shiftedLo), vmov_n_u64(shiftedHi)));
#endif
}

}   // anonymous namespace


/**
 * Update entry point that goes through UpdateImpl<void>.
 *
 * Asserts (NN_SDK_REQUIRES) that the object is in State_Initialized or State_Processing,
 * then forwards every argument unchanged.
 *
 * @param pDst     Destination buffer.
 * @param dstSize  Size of the destination buffer.
 * @param pSrc     Source buffer.
 * @param srcSize  Size of the source buffer.
 * @return         The value returned by UpdateImpl<void>
 *                 (presumably the number of bytes written to pDst; confirm in crypto_UpdateImpl.h).
 */
size_t XtsModeImpl::UpdateGeneric(void* pDst, size_t dstSize, const void* pSrc, size_t srcSize) NN_NOEXCEPT
{
    NN_SDK_REQUIRES(m_State == State_Initialized || m_State == State_Processing, "Invalid state. Please restart from Initialize().");

    return UpdateImpl<void>(this, pDst, dstSize, pSrc, srcSize);
}

/**
 * XTS block loop that reaches the block cipher through m_pCipherFunction.
 *
 * Of the `numBlocks` source blocks, all but the last are whitened with the tweak, passed
 * through the cipher, whitened again and written to pDst8. The last source block is copied
 * unprocessed into m_LastBlock. If a block from an earlier call is still buffered
 * (m_State == State_Processing), it is processed and written first.
 *
 * @param pDst8      Destination buffer.
 * @param pSrc8      Source buffer holding `numBlocks` blocks of BlockSize bytes.
 * @param numBlocks  Number of source blocks.
 * @return           Number of bytes written to pDst8.
 */
size_t XtsModeImpl::ProcessBlocksGeneric(uint8_t* pDst8, const uint8_t* pSrc8, int numBlocks) NN_NOEXCEPT
{
    // Every source block except the newest one is written out by this call.
    size_t bytesWritten = (numBlocks - 1) * BlockSize;

    // A block held back by an earlier call goes out first.
    if (m_State == State_Processing)
    {
        ProcessBlock(pDst8, m_LastBlock);
        pDst8 += BlockSize;
        bytesWritten += BlockSize;
    }

    // The tweak is read only after ProcessBlock has run.
    // NOTE(review): presumably ProcessBlock advances m_Tweak - confirm.
    uint8x16_t runningTweak = vld1q_u8(m_Tweak);

    for (int pending = numBlocks - 1; pending > 0; --pending)
    {
        // Whiten the source block and stage it in the destination buffer
        vst1q_u8(pDst8, veorq_u8(vld1q_u8(pSrc8), runningTweak));
        pSrc8 += BlockSize;

        // Run the cipher in place on the staged block
        m_pCipherFunction(pDst8, pDst8, m_pCipherContext);

        // Whiten the cipher output with the same tweak
        vst1q_u8(pDst8, veorq_u8(vld1q_u8(pDst8), runningTweak));
        pDst8 += BlockSize;

        // Move on to the tweak of the next block
        runningTweak = GfMult(runningTweak);
    }
    vst1q_u8(m_Tweak, runningTweak);

    // The last block may be needed by Finalize, so it is kept unprocessed.
    std::memcpy(m_LastBlock, pSrc8, BlockSize);
    m_State = State_Processing;

    return bytesWritten;
}

/**
 * Update specialization for AesEncryptor128; goes through UpdateImpl<AesEncryptor128>.
 *
 * @param pDst     Destination buffer.
 * @param dstSize  Size of the destination buffer.
 * @param pSrc     Source buffer.
 * @param srcSize  Size of the source buffer.
 * @return         The value returned by UpdateImpl<AesEncryptor128>
 *                 (presumably the number of bytes written to pDst; confirm in crypto_UpdateImpl.h).
 */
template <>
size_t XtsModeImpl::Update<AesEncryptor128>(void* pDst, size_t dstSize, const void* pSrc, size_t srcSize) NN_NOEXCEPT
{
    // Same state precondition as UpdateGeneric, so that misuse is reported identically on this path.
    NN_SDK_REQUIRES(m_State == State_Initialized || m_State == State_Processing, "Invalid state. Please restart from Initialize().");

    return UpdateImpl<AesEncryptor128>(this, pDst, dstSize, pSrc, srcSize);
}

/**
 * AES-128 encryption XTS block loop using the ARMv8 AES instructions directly.
 *
 * Of the `numBlocks` source blocks, all but the last are whitened with the tweak, encrypted,
 * whitened again and written to pDst8. The last source block is copied unprocessed into
 * m_LastBlock. If a block from an earlier call is still buffered (m_State == State_Processing),
 * it is processed and written first.
 *
 * When NN_CRYPTO_CONFIG_XTS_UNROLL_AES128 is set, blocks are handled three at a time for as long
 * as more than three remain; the one-block loop at the end handles the rest.
 *
 * @param pDst8      Destination buffer.
 * @param pSrc8      Source buffer holding `numBlocks` blocks of BlockSize bytes.
 * @param numBlocks  Number of source blocks.
 * @return           Number of bytes written to pDst8.
 */
template <>
size_t XtsModeImpl::ProcessBlocks<AesEncryptor128>(uint8_t* pDst8, const uint8_t* pSrc8, int numBlocks) NN_NOEXCEPT
{
    // Every source block except the last one is written out by this call.
    size_t processed = (numBlocks - 1) * BlockSize;

    // If at least one block has already been processed, process the buffered block first
    if (m_State == State_Processing)
    {
        ProcessBlock(pDst8, m_LastBlock);
        pDst8 += BlockSize;
        processed += BlockSize;
    }

    uint8x16_t tweak = vld1q_u8(m_Tweak);
    const uint8_t* keys = static_cast<const AesEncryptor128*>(m_pCipherContext)->GetRoundKey();

    // Preload AES-128 round keys
    const uint8x16_t key0  = vld1q_u8(keys);
    const uint8x16_t key1  = vld1q_u8(keys + 16);
    const uint8x16_t key2  = vld1q_u8(keys + 16 * 2);
    const uint8x16_t key3  = vld1q_u8(keys + 16 * 3);
    const uint8x16_t key4  = vld1q_u8(keys + 16 * 4);
    const uint8x16_t key5  = vld1q_u8(keys + 16 * 5);
    const uint8x16_t key6  = vld1q_u8(keys + 16 * 6);
    const uint8x16_t key7  = vld1q_u8(keys + 16 * 7);
    const uint8x16_t key8  = vld1q_u8(keys + 16 * 8);
    const uint8x16_t key9  = vld1q_u8(keys + 16 * 9);
    const uint8x16_t key10 = vld1q_u8(keys + 16 * 10);

#if NN_CRYPTO_CONFIG_XTS_UNROLL_AES128
    // Latencies of AES instructions are masked by processing 3 blocks at a time.
    // GF multiplication is done on the scalar ALU, and so runs in parallel with AES (i.e. it's basically free)
    // For increased legibility, the two instruction streams are written side by side.
    // Note that this code is optimized for Cortex-A57 and might run slower on other processors.

    // Convenience macros for AES-128 encryption
    #define AES128E_RND(num,data) "aese  %[" #data "].16b, %[key" #num "].16b\n" \
                                  "aesmc %[" #data "].16b, %[" #data "].16b\n"
    #define AES128E_RND9(data)    "aese  %[" #data "].16b, %[key9].16b\n"
    #define AES128E_RND10(data)   "eor   %[" #data "].16b, %[" #data "].16b, %[key10].16b\n"

    const int batchSize = 3;
    // Strictly greater: the last block must never be consumed by the batch loop.
    if (numBlocks > batchSize)
    {
        uint8x16_t block1, block2, block3;
        uint8x16_t tweak2, tweak3, tweak4, tweak5, tweak6;

        // Premultiply tweak twice
        tweak2 = GfMult(tweak);
        tweak3 = GfMult(tweak2);

        do
        {
            block1 = vld1q_u8(pSrc8); pSrc8 += 16;
            block2 = vld1q_u8(pSrc8); pSrc8 += 16;
            block3 = vld1q_u8(pSrc8); pSrc8 += 16;

            block1 = veorq_u8(block1, tweak);
            block2 = veorq_u8(block2, tweak2);
            block3 = veorq_u8(block3, tweak3);

            // Each variant below encrypts block1..block3 and derives tweak4..tweak6
            // (the tweaks of the next batch) from tweak3.
#if NN_CRYPTO_CONFIG_XTS_USE_ASM
#if !NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
            const uint64_t mod = 0x87;
            uint64_t hi, lo, mask, tmp;
            __asm__ volatile
            (
                                                  AES128E_RND(0,block1)
                "mov %[hi],   %[tweak3].d[1]\n"   AES128E_RND(0,block2)
                "mov %[lo],   %[tweak3].d[0]\n"   AES128E_RND(0,block3)
                "asr %[mask], %[hi], #63\n"       AES128E_RND(1,block1)
                "lsl %[hi],   %[hi], #1\n"        AES128E_RND(1,block2)
                "lsr %[tmp],  %[lo], #63\n"       AES128E_RND(1,block3)
                "and %[mask], %[mask], %[mod]\n"  AES128E_RND(2,block1)
                "lsl %[lo],   %[lo], #1\n"        AES128E_RND(2,block2)
                "orr %[hi],   %[hi], %[tmp]\n"    AES128E_RND(2,block3)
                "eor %[lo],   %[lo], %[mask]\n"   AES128E_RND(3,block1)
                "mov %[tweak4].d[1], %[hi]\n"     AES128E_RND(3,block2)
                "mov %[tweak4].d[0], %[lo]\n"     AES128E_RND(3,block3)
                "asr %[mask], %[hi], #63\n"       AES128E_RND(4,block1)
                "lsl %[hi],   %[hi], #1\n"        AES128E_RND(4,block2)
                "lsr %[tmp],  %[lo], #63\n"       AES128E_RND(4,block3)
                "and %[mask], %[mask], %[mod]\n"  AES128E_RND(5,block1)
                "lsl %[lo],   %[lo], #1\n"        AES128E_RND(5,block2)
                "orr %[hi],   %[hi], %[tmp]\n"    AES128E_RND(5,block3)
                "eor %[lo],   %[lo], %[mask]\n"   AES128E_RND(6,block1)
                "mov %[tweak5].d[1], %[hi]\n"     AES128E_RND(6,block2)
                "mov %[tweak5].d[0], %[lo]\n"     AES128E_RND(6,block3)
                "asr %[mask], %[hi], #63\n"       AES128E_RND(7,block1)
                "lsl %[hi],   %[hi], #1\n"        AES128E_RND(7,block2)
                "lsr %[tmp],  %[lo], #63\n"       AES128E_RND(7,block3)
                "and %[mask], %[mask], %[mod]\n"  AES128E_RND(8,block1)
                "lsl %[lo],   %[lo], #1\n"        AES128E_RND(8,block2)
                "orr %[hi],   %[hi], %[tmp]\n"    AES128E_RND(8,block3)
                "eor %[lo],   %[lo], %[mask]\n"   AES128E_RND9( block1)
                "mov %[tweak6].d[1], %[hi]\n"     AES128E_RND9( block2)
                "mov %[tweak6].d[0], %[lo]\n"     AES128E_RND9( block3)
                                                  AES128E_RND10(block1)
                                                  AES128E_RND10(block2)
                                                  AES128E_RND10(block3)

                : [block1]"+w"(block1),  [block2]"+w"(block2),  [block3]"+w"(block3),
                  [tweak4]"=&w"(tweak4), [tweak5]"=&w"(tweak5), [tweak6]"=&w"(tweak6),
                  [hi]"=&r"(hi), [lo]"=&r"(lo), [tmp]"=&r"(tmp), [mask]"=&r"(mask)
                : [key0]"w"(key0), [key1]"w"(key1), [key2]"w"(key2), [key3]"w"(key3),
                  [key4]"w"(key4), [key5]"w"(key5), [key6]"w"(key6), [key7]"w"(key7),
                  [key8]"w"(key8), [key9]"w"(key9), [key10]"w"(key10),
                  [tweak3]"w"(tweak3), [mod]"r"(mod)
                : /* Empty clobber list */
            );
#else  // NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
            // Same as above, but GF mult is done on the SIMD unit. Left here for reference, might be useful some day.
            // The modulus has to be loaded into a vector before it can be bound to a "w"
            // (SIMD register) operand; binding the byte array itself would pass its address.
            static NN_ALIGNAS(16) const uint8_t modulus[16] = {0x87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            const uint8x16_t mod = vld1q_u8(modulus);
            uint8x16_t tmp1, tmp2;
            __asm__ volatile
            (
                "mov  %[tmp1].d[0], %[tweak3].d[1]\n"               AES128E_RND(0,block1)
                "mov  %[tmp1].d[1], %[tweak3].d[0]\n"               AES128E_RND(0,block2)
                "shl  %[tweak4].2d, %[tweak3].2d, #1\n"             AES128E_RND(0,block3)
                "ushr %[tmp2].2d, %[tmp1].2d, #63\n"                AES128E_RND(1,block1)
                "sshr %[tmp1].2d, %[tmp1].2d, #63\n"                AES128E_RND(1,block2)
                "mov  %[tmp2].d[0], xzr\n"                          AES128E_RND(1,block3)
                "orr  %[tweak4].16b, %[tweak4].16b, %[tmp2].16b\n"  AES128E_RND(2,block1)
                "and  %[tmp1].16b, %[tmp1].16b, %[mod].16b\n"       AES128E_RND(2,block2)
                                                                    AES128E_RND(2,block3)
                "eor  %[tweak4].16b, %[tweak4].16b, %[tmp1].16b\n"  AES128E_RND(3,block1)
                                                                    AES128E_RND(3,block2)
                "mov  %[tmp1].d[0], %[tweak4].d[1]\n"               AES128E_RND(3,block3)
                "mov  %[tmp1].d[1], %[tweak4].d[0]\n"               AES128E_RND(4,block1)
                "shl  %[tweak5].2d, %[tweak4].2d, #1\n"             AES128E_RND(4,block2)
                "ushr %[tmp2].2d, %[tmp1].2d, #63\n"                AES128E_RND(4,block3)
                "sshr %[tmp1].2d, %[tmp1].2d, #63\n"                AES128E_RND(5,block1)
                "mov  %[tmp2].d[0], xzr\n"                          AES128E_RND(5,block2)
                "orr  %[tweak5].16b, %[tweak5].16b, %[tmp2].16b\n"  AES128E_RND(5,block3)
                "and  %[tmp1].16b, %[tmp1].16b, %[mod].16b\n"       AES128E_RND(6,block1)
                                                                    AES128E_RND(6,block2)
                "eor  %[tweak5].16b, %[tweak5].16b, %[tmp1].16b\n"  AES128E_RND(6,block3)
                                                                    AES128E_RND(7,block1)
                                                                    AES128E_RND(7,block2)
                "mov  %[tmp1].d[0], %[tweak5].d[1]\n"               AES128E_RND(7,block3)
                "mov  %[tmp1].d[1], %[tweak5].d[0]\n"               AES128E_RND(8,block1)
                "shl  %[tweak6].2d, %[tweak5].2d, #1\n"             AES128E_RND(8,block2)
                "ushr %[tmp2].2d, %[tmp1].2d, #63\n"                AES128E_RND(8,block3)
                "sshr %[tmp1].2d, %[tmp1].2d, #63\n"                AES128E_RND9( block1)
                "mov  %[tmp2].d[0], xzr\n"                          AES128E_RND9( block2)
                                                                    AES128E_RND9( block3)
                "orr  %[tweak6].16b, %[tweak6].16b, %[tmp2].16b\n"  AES128E_RND10(block1)
                "and  %[tmp1].16b, %[tmp1].16b, %[mod].16b\n"       AES128E_RND10(block2)
                                                                    AES128E_RND10(block3)
                "eor  %[tweak6].16b, %[tweak6].16b, %[tmp1].16b\n"

                : [block1]"+w"(block1),  [block2]"+w"(block2), [block3]"+w"(block3),
                  [tweak4]"=&w"(tweak4), [tweak5]"=&w"(tweak5), [tweak6]"=&w"(tweak6),
                  [tmp1]"=&w"(tmp1), [tmp2]"=&w"(tmp2)
                : [key0]"w"(key0), [key1]"w"(key1), [key2]"w"(key2), [key3]"w"(key3),
                  [key4]"w"(key4), [key5]"w"(key5), [key6]"w"(key6), [key7]"w"(key7),
                  [key8]"w"(key8), [key9]"w"(key9), [key10]"w"(key10),
                  [tweak3]"w"(tweak3), [mod]"w"(mod)
                : /* Empty clobber list */
            );
#endif // NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
#else  // NN_CRYPTO_CONFIG_XTS_USE_ASM
            block1 = vaesmcq_u8(vaeseq_u8(block1, key0));  block2 = vaesmcq_u8(vaeseq_u8(block2, key0));  block3 = vaesmcq_u8(vaeseq_u8(block3, key0));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key1));  block2 = vaesmcq_u8(vaeseq_u8(block2, key1));  block3 = vaesmcq_u8(vaeseq_u8(block3, key1));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key2));  block2 = vaesmcq_u8(vaeseq_u8(block2, key2));  block3 = vaesmcq_u8(vaeseq_u8(block3, key2));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key3));  block2 = vaesmcq_u8(vaeseq_u8(block2, key3));  block3 = vaesmcq_u8(vaeseq_u8(block3, key3));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key4));  block2 = vaesmcq_u8(vaeseq_u8(block2, key4));  block3 = vaesmcq_u8(vaeseq_u8(block3, key4));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key5));  block2 = vaesmcq_u8(vaeseq_u8(block2, key5));  block3 = vaesmcq_u8(vaeseq_u8(block3, key5));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key6));  block2 = vaesmcq_u8(vaeseq_u8(block2, key6));  block3 = vaesmcq_u8(vaeseq_u8(block3, key6));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key7));  block2 = vaesmcq_u8(vaeseq_u8(block2, key7));  block3 = vaesmcq_u8(vaeseq_u8(block3, key7));
            block1 = vaesmcq_u8(vaeseq_u8(block1, key8));  block2 = vaesmcq_u8(vaeseq_u8(block2, key8));  block3 = vaesmcq_u8(vaeseq_u8(block3, key8));
            block1 = vaeseq_u8(block1, key9);              block2 = vaeseq_u8(block2, key9);              block3 = vaeseq_u8(block3, key9);
            block1 = veorq_u8(block1, key10);              block2 = veorq_u8(block2, key10);              block3 = veorq_u8(block3, key10);

            tweak4 = GfMult(tweak3);
            tweak5 = GfMult(tweak4);
            tweak6 = GfMult(tweak5);
#endif // NN_CRYPTO_CONFIG_XTS_USE_ASM

            // Re-apply the tweaks that were used before encryption
            block1 = veorq_u8(block1, tweak);
            block2 = veorq_u8(block2, tweak2);
            block3 = veorq_u8(block3, tweak3);

            // Rotate in the tweaks for the next three blocks
            tweak  = tweak4;
            tweak2 = tweak5;
            tweak3 = tweak6;

            vst1q_u8(pDst8, block1); pDst8 += 16;
            vst1q_u8(pDst8, block2); pDst8 += 16;
            vst1q_u8(pDst8, block3); pDst8 += 16;

            numBlocks -= batchSize;

        } while (numBlocks > batchSize);
    }
#endif // NN_CRYPTO_CONFIG_XTS_UNROLL_AES128

    // Process every remaining block except the last one and write the results to dst
    while (--numBlocks > 0)
    {
        // Load block & apply tweak
        uint8x16_t block = vld1q_u8(pSrc8);
        pSrc8 += BlockSize;
        block = veorq_u8(block, tweak);

        // Send block to cipher
        block = vaesmcq_u8(vaeseq_u8(block, key0));
        block = vaesmcq_u8(vaeseq_u8(block, key1));
        block = vaesmcq_u8(vaeseq_u8(block, key2));
        block = vaesmcq_u8(vaeseq_u8(block, key3));
        block = vaesmcq_u8(vaeseq_u8(block, key4));
        block = vaesmcq_u8(vaeseq_u8(block, key5));
        block = vaesmcq_u8(vaeseq_u8(block, key6));
        block = vaesmcq_u8(vaeseq_u8(block, key7));
        block = vaesmcq_u8(vaeseq_u8(block, key8));
        block = vaeseq_u8(block, key9);
        block = veorq_u8(block, key10);

        // Re-apply tweak and store block
        block = veorq_u8(block, tweak);
        vst1q_u8(pDst8, block);
        pDst8 += BlockSize;

        // Update tweak
        tweak = GfMult(tweak);
    }
    vst1q_u8(m_Tweak, tweak);

    // The last block may be needed by Finalize, so it is kept unprocessed
    std::memcpy(m_LastBlock, pSrc8, BlockSize);

    m_State = State_Processing;

    return processed;
}

/**
 * Update specialization for AesDecryptor128; goes through UpdateImpl<AesDecryptor128>.
 *
 * @param pDst     Destination buffer.
 * @param dstSize  Size of the destination buffer.
 * @param pSrc     Source buffer.
 * @param srcSize  Size of the source buffer.
 * @return         The value returned by UpdateImpl<AesDecryptor128>
 *                 (presumably the number of bytes written to pDst; confirm in crypto_UpdateImpl.h).
 */
template <>
size_t XtsModeImpl::Update<AesDecryptor128>(void* pDst, size_t dstSize, const void* pSrc, size_t srcSize) NN_NOEXCEPT
{
    // Same state precondition as UpdateGeneric, so that misuse is reported identically on this path.
    NN_SDK_REQUIRES(m_State == State_Initialized || m_State == State_Processing, "Invalid state. Please restart from Initialize().");

    return UpdateImpl<AesDecryptor128>(this, pDst, dstSize, pSrc, srcSize);
}

/**
 * AES-128 decryption XTS block loop using the ARMv8 AES instructions directly.
 *
 * Of the `numBlocks` source blocks, all but the last are whitened with the tweak, decrypted,
 * whitened again and written to pDst8. The last source block is copied unprocessed into
 * m_LastBlock. If a block from an earlier call is still buffered (m_State == State_Processing),
 * it is processed and written first.
 *
 * Round keys are applied from key10 down to key0.
 * NOTE(review): this presumably relies on AesDecryptor128::GetRoundKey() returning a schedule
 * prepared for AESD/AESIMC - confirm against crypto_AesDecryptor.h.
 *
 * When NN_CRYPTO_CONFIG_XTS_UNROLL_AES128 is set, blocks are handled three at a time for as long
 * as more than three remain; the one-block loop at the end handles the rest.
 *
 * @param pDst8      Destination buffer.
 * @param pSrc8      Source buffer holding `numBlocks` blocks of BlockSize bytes.
 * @param numBlocks  Number of source blocks.
 * @return           Number of bytes written to pDst8.
 */
template <>
size_t XtsModeImpl::ProcessBlocks<AesDecryptor128>(uint8_t* pDst8, const uint8_t* pSrc8, int numBlocks) NN_NOEXCEPT
{
    // Every source block except the last one is written out by this call.
    size_t processed = (numBlocks - 1) * BlockSize;

    // If at least one block has already been processed, process the buffered block first
    if (m_State == State_Processing)
    {
        ProcessBlock(pDst8, m_LastBlock);
        pDst8 += BlockSize;
        processed += BlockSize;
    }

    uint8x16_t tweak = vld1q_u8(m_Tweak);
    // The context of this specialization is a decryptor, so it is accessed as one.
    const uint8_t* keys = static_cast<const AesDecryptor128*>(m_pCipherContext)->GetRoundKey();

    // Preload AES-128 round keys
    const uint8x16_t key0  = vld1q_u8(keys);
    const uint8x16_t key1  = vld1q_u8(keys + 16);
    const uint8x16_t key2  = vld1q_u8(keys + 16 * 2);
    const uint8x16_t key3  = vld1q_u8(keys + 16 * 3);
    const uint8x16_t key4  = vld1q_u8(keys + 16 * 4);
    const uint8x16_t key5  = vld1q_u8(keys + 16 * 5);
    const uint8x16_t key6  = vld1q_u8(keys + 16 * 6);
    const uint8x16_t key7  = vld1q_u8(keys + 16 * 7);
    const uint8x16_t key8  = vld1q_u8(keys + 16 * 8);
    const uint8x16_t key9  = vld1q_u8(keys + 16 * 9);
    const uint8x16_t key10 = vld1q_u8(keys + 16 * 10);

#if NN_CRYPTO_CONFIG_XTS_UNROLL_AES128
    // Latencies of AES instructions are masked by processing 3 blocks at a time.
    // GF multiplication is done on the scalar ALU, and so runs in parallel with AES (i.e. it's basically free)
    // For increased legibility, the two instruction streams are written side by side.
    // Note that this code is optimized for Cortex-A57 and might run slower on other processors.

    // Convenience macros for AES-128 decryption
    #define AES128D_RND(num,data) "aesd   %[" #data "].16b, %[key" #num "].16b\n" \
                                  "aesimc %[" #data "].16b, %[" #data "].16b\n"
    #define AES128D_RND1(data)    "aesd   %[" #data "].16b, %[key1].16b\n"
    #define AES128D_RND0(data)    "eor    %[" #data "].16b, %[" #data "].16b, %[key0].16b\n"

    const int batchSize = 3;
    // Strictly greater: the last block must never be consumed by the batch loop.
    if (numBlocks > batchSize)
    {
        uint8x16_t block1, block2, block3;
        uint8x16_t tweak2, tweak3, tweak4, tweak5, tweak6;

        // Premultiply tweak twice
        tweak2 = GfMult(tweak);
        tweak3 = GfMult(tweak2);

        do
        {
            block1 = vld1q_u8(pSrc8); pSrc8 += 16;
            block2 = vld1q_u8(pSrc8); pSrc8 += 16;
            block3 = vld1q_u8(pSrc8); pSrc8 += 16;

            block1 = veorq_u8(block1, tweak);
            block2 = veorq_u8(block2, tweak2);
            block3 = veorq_u8(block3, tweak3);

            // Each variant below decrypts block1..block3 and derives tweak4..tweak6
            // (the tweaks of the next batch) from tweak3.
#if NN_CRYPTO_CONFIG_XTS_USE_ASM
#if !NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
            const uint64_t mod = 0x87;
            uint64_t hi, lo, mask, tmp;
            __asm__ volatile
            (
                                                  AES128D_RND(10,block1)
                "mov %[hi],   %[tweak3].d[1]\n"   AES128D_RND(10,block2)
                "mov %[lo],   %[tweak3].d[0]\n"   AES128D_RND(10,block3)
                "asr %[mask], %[hi], #63\n"       AES128D_RND(9,block1)
                "lsl %[hi],   %[hi], #1\n"        AES128D_RND(9,block2)
                "lsr %[tmp],  %[lo], #63\n"       AES128D_RND(9,block3)
                "and %[mask], %[mask], %[mod]\n"  AES128D_RND(8,block1)
                "lsl %[lo],   %[lo], #1\n"        AES128D_RND(8,block2)
                "orr %[hi],   %[hi], %[tmp]\n"    AES128D_RND(8,block3)
                "eor %[lo],   %[lo], %[mask]\n"   AES128D_RND(7,block1)
                "mov %[tweak4].d[1], %[hi]\n"     AES128D_RND(7,block2)
                "mov %[tweak4].d[0], %[lo]\n"     AES128D_RND(7,block3)
                "asr %[mask], %[hi], #63\n"       AES128D_RND(6,block1)
                "lsl %[hi],   %[hi], #1\n"        AES128D_RND(6,block2)
                "lsr %[tmp],  %[lo], #63\n"       AES128D_RND(6,block3)
                "and %[mask], %[mask], %[mod]\n"  AES128D_RND(5,block1)
                "lsl %[lo],   %[lo], #1\n"        AES128D_RND(5,block2)
                "orr %[hi],   %[hi], %[tmp]\n"    AES128D_RND(5,block3)
                "eor %[lo],   %[lo], %[mask]\n"   AES128D_RND(4,block1)
                "mov %[tweak5].d[1], %[hi]\n"     AES128D_RND(4,block2)
                "mov %[tweak5].d[0], %[lo]\n"     AES128D_RND(4,block3)
                "asr %[mask], %[hi], #63\n"       AES128D_RND(3,block1)
                "lsl %[hi],   %[hi], #1\n"        AES128D_RND(3,block2)
                "lsr %[tmp],  %[lo], #63\n"       AES128D_RND(3,block3)
                "and %[mask], %[mask], %[mod]\n"  AES128D_RND(2,block1)
                "lsl %[lo],   %[lo], #1\n"        AES128D_RND(2,block2)
                "orr %[hi],   %[hi], %[tmp]\n"    AES128D_RND(2,block3)
                "eor %[lo],   %[lo], %[mask]\n"   AES128D_RND1( block1)
                "mov %[tweak6].d[1], %[hi]\n"     AES128D_RND1( block2)
                "mov %[tweak6].d[0], %[lo]\n"     AES128D_RND1( block3)
                                                  AES128D_RND0( block1)
                                                  AES128D_RND0( block2)
                                                  AES128D_RND0( block3)

                : [block1]"+w"(block1),  [block2]"+w"(block2),  [block3]"+w"(block3),
                  [tweak4]"=&w"(tweak4), [tweak5]"=&w"(tweak5), [tweak6]"=&w"(tweak6),
                  [hi]"=&r"(hi), [lo]"=&r"(lo), [tmp]"=&r"(tmp), [mask]"=&r"(mask)
                : [key0]"w"(key0), [key1]"w"(key1), [key2]"w"(key2), [key3]"w"(key3),
                  [key4]"w"(key4), [key5]"w"(key5), [key6]"w"(key6), [key7]"w"(key7),
                  [key8]"w"(key8), [key9]"w"(key9), [key10]"w"(key10),
                  [tweak3]"w"(tweak3), [mod]"r"(mod)
                : /* Empty clobber list */
            );
#else // NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
            // Same as above, but GF mult is done on the SIMD unit. Left here for reference, might be useful some day.
            // The modulus has to be loaded into a vector before it can be bound to a "w"
            // (SIMD register) operand; binding the byte array itself would pass its address.
            static NN_ALIGNAS(16) const uint8_t modulus[16] = {0x87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
            const uint8x16_t mod = vld1q_u8(modulus);
            uint8x16_t tmp1, tmp2;
            __asm__ volatile
            (
                "mov  %[tmp1].d[0], %[tweak3].d[1]\n"               AES128D_RND(10,block1)
                "mov  %[tmp1].d[1], %[tweak3].d[0]\n"               AES128D_RND(10,block2)
                "shl  %[tweak4].2d, %[tweak3].2d, #1\n"             AES128D_RND(10,block3)
                "ushr %[tmp2].2d, %[tmp1].2d, #63\n"                AES128D_RND(9,block1)
                "sshr %[tmp1].2d, %[tmp1].2d, #63\n"                AES128D_RND(9,block2)
                "mov  %[tmp2].d[0], xzr\n"                          AES128D_RND(9,block3)
                "orr  %[tweak4].16b, %[tweak4].16b, %[tmp2].16b\n"  AES128D_RND(8,block1)
                "and  %[tmp1].16b, %[tmp1].16b, %[mod].16b\n"       AES128D_RND(8,block2)
                                                                    AES128D_RND(8,block3)
                "eor  %[tweak4].16b, %[tweak4].16b, %[tmp1].16b\n"  AES128D_RND(7,block1)
                                                                    AES128D_RND(7,block2)
                "mov  %[tmp1].d[0], %[tweak4].d[1]\n"               AES128D_RND(7,block3)
                "mov  %[tmp1].d[1], %[tweak4].d[0]\n"               AES128D_RND(6,block1)
                "shl  %[tweak5].2d, %[tweak4].2d, #1\n"             AES128D_RND(6,block2)
                "ushr %[tmp2].2d, %[tmp1].2d, #63\n"                AES128D_RND(6,block3)
                "sshr %[tmp1].2d, %[tmp1].2d, #63\n"                AES128D_RND(5,block1)
                "mov  %[tmp2].d[0], xzr\n"                          AES128D_RND(5,block2)
                "orr  %[tweak5].16b, %[tweak5].16b, %[tmp2].16b\n"  AES128D_RND(5,block3)
                "and  %[tmp1].16b, %[tmp1].16b, %[mod].16b\n"       AES128D_RND(4,block1)
                                                                    AES128D_RND(4,block2)
                "eor  %[tweak5].16b, %[tweak5].16b, %[tmp1].16b\n"  AES128D_RND(4,block3)
                                                                    AES128D_RND(3,block1)
                                                                    AES128D_RND(3,block2)
                "mov  %[tmp1].d[0], %[tweak5].d[1]\n"               AES128D_RND(3,block3)
                "mov  %[tmp1].d[1], %[tweak5].d[0]\n"               AES128D_RND(2,block1)
                "shl  %[tweak6].2d, %[tweak5].2d, #1\n"             AES128D_RND(2,block2)
                "ushr %[tmp2].2d, %[tmp1].2d, #63\n"                AES128D_RND(2,block3)
                "sshr %[tmp1].2d, %[tmp1].2d, #63\n"                AES128D_RND1( block1)
                "mov  %[tmp2].d[0], xzr\n"                          AES128D_RND1( block2)
                                                                    AES128D_RND1( block3)
                "orr  %[tweak6].16b, %[tweak6].16b, %[tmp2].16b\n"  AES128D_RND0( block1)
                "and  %[tmp1].16b, %[tmp1].16b, %[mod].16b\n"       AES128D_RND0( block2)
                                                                    AES128D_RND0( block3)
                "eor  %[tweak6].16b, %[tweak6].16b, %[tmp1].16b\n"

                : [block1]"+w"(block1),  [block2]"+w"(block2), [block3]"+w"(block3),
                  [tweak4]"=&w"(tweak4), [tweak5]"=&w"(tweak5), [tweak6]"=&w"(tweak6),
                  [tmp1]"=&w"(tmp1), [tmp2]"=&w"(tmp2)
                : [key0]"w"(key0), [key1]"w"(key1), [key2]"w"(key2), [key3]"w"(key3),
                  [key4]"w"(key4), [key5]"w"(key5), [key6]"w"(key6), [key7]"w"(key7),
                  [key8]"w"(key8), [key9]"w"(key9), [key10]"w"(key10),
                  [tweak3]"w"(tweak3), [mod]"w"(mod)
                : /* Empty clobber list */
            );
#endif // NN_CRYPTO_CONFIG_XTS_USE_SIMD_GFMULT
#else  // NN_CRYPTO_CONFIG_XTS_USE_ASM
            block1 = vaesimcq_u8(vaesdq_u8(block1, key10));  block2 = vaesimcq_u8(vaesdq_u8(block2, key10));  block3 = vaesimcq_u8(vaesdq_u8(block3, key10));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key9));   block2 = vaesimcq_u8(vaesdq_u8(block2, key9));   block3 = vaesimcq_u8(vaesdq_u8(block3, key9));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key8));   block2 = vaesimcq_u8(vaesdq_u8(block2, key8));   block3 = vaesimcq_u8(vaesdq_u8(block3, key8));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key7));   block2 = vaesimcq_u8(vaesdq_u8(block2, key7));   block3 = vaesimcq_u8(vaesdq_u8(block3, key7));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key6));   block2 = vaesimcq_u8(vaesdq_u8(block2, key6));   block3 = vaesimcq_u8(vaesdq_u8(block3, key6));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key5));   block2 = vaesimcq_u8(vaesdq_u8(block2, key5));   block3 = vaesimcq_u8(vaesdq_u8(block3, key5));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key4));   block2 = vaesimcq_u8(vaesdq_u8(block2, key4));   block3 = vaesimcq_u8(vaesdq_u8(block3, key4));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key3));   block2 = vaesimcq_u8(vaesdq_u8(block2, key3));   block3 = vaesimcq_u8(vaesdq_u8(block3, key3));
            block1 = vaesimcq_u8(vaesdq_u8(block1, key2));   block2 = vaesimcq_u8(vaesdq_u8(block2, key2));   block3 = vaesimcq_u8(vaesdq_u8(block3, key2));
            block1 = vaesdq_u8(block1, key1);                block2 = vaesdq_u8(block2, key1);                block3 = vaesdq_u8(block3, key1);
            block1 = veorq_u8(block1, key0);                 block2 = veorq_u8(block2, key0);                 block3 = veorq_u8(block3, key0);

            tweak4 = GfMult(tweak3);
            tweak5 = GfMult(tweak4);
            tweak6 = GfMult(tweak5);
#endif // NN_CRYPTO_CONFIG_XTS_USE_ASM

            // Re-apply the tweaks that were used before decryption
            block1 = veorq_u8(block1, tweak);
            block2 = veorq_u8(block2, tweak2);
            block3 = veorq_u8(block3, tweak3);

            // Rotate in the tweaks for the next three blocks
            tweak  = tweak4;
            tweak2 = tweak5;
            tweak3 = tweak6;

            vst1q_u8(pDst8, block1); pDst8 += 16;
            vst1q_u8(pDst8, block2); pDst8 += 16;
            vst1q_u8(pDst8, block3); pDst8 += 16;

            numBlocks -= batchSize;

        } while (numBlocks > batchSize);
    }
#endif // NN_CRYPTO_CONFIG_XTS_UNROLL_AES128

    // Process every remaining block except the last one and write the results to dst
    while (--numBlocks > 0)
    {
        // Load block & apply tweak
        uint8x16_t block = vld1q_u8(pSrc8);
        pSrc8 += BlockSize;
        block = veorq_u8(block, tweak);

        // AES decryption
        block = vaesimcq_u8(vaesdq_u8(block, key10));
        block = vaesimcq_u8(vaesdq_u8(block, key9));
        block = vaesimcq_u8(vaesdq_u8(block, key8));
        block = vaesimcq_u8(vaesdq_u8(block, key7));
        block = vaesimcq_u8(vaesdq_u8(block, key6));
        block = vaesimcq_u8(vaesdq_u8(block, key5));
        block = vaesimcq_u8(vaesdq_u8(block, key4));
        block = vaesimcq_u8(vaesdq_u8(block, key3));
        block = vaesimcq_u8(vaesdq_u8(block, key2));
        block = vaesdq_u8(block, key1);
        block = veorq_u8(block, key0);

        // Re-apply tweak and store block
        block = veorq_u8(block, tweak);
        vst1q_u8(pDst8, block);
        pDst8 += BlockSize;

        // Update tweak
        tweak = GfMult(tweak);
    }
    vst1q_u8(m_Tweak, tweak);

    // The last block may be needed by Finalize, so it is kept unprocessed
    std::memcpy(m_LastBlock, pSrc8, BlockSize);

    m_State = State_Processing;

    return processed;
}

}}} // namespace nn::crypto::detail
