﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <cstring>
#include <nn/nn_SdkAssert.h>
#include <nn/crypto/detail/crypto_GhashImpl.h>
#include <nn/crypto/detail/crypto_Clear.h>
#include "crypto_UpdateImpl.h"

#include <arm_neon.h>

/**
 * Configuration parameters:
 *
 * Whether to unroll hashing. It depends on algorithm (not on CPU spec)
 * NN_CRYPTO_CONFIG_GHASH_UNROLL
 *
 * Whether to use AArch64 ASM optimizations
 * NN_CRYPTO_CONFIG_GHASH_USE_ASM
 *
 * Whether to use Karatsuba's algorithm for GF multiplication (default is Classic method)
 * NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA
 */

// Common settings
#define NN_CRYPTO_CONFIG_GHASH_UNROLL           1
#define NN_CRYPTO_CONFIG_GHASH_USE_ASM          1
#define NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA 0


namespace nn { namespace crypto { namespace detail {

namespace
{
//=============================================================================
// Utility functions that avoid cluttering the code with SIMD type casts.
// Names follow Neon intrinsics naming convention.

/**
 * @brief Extracts 64-bit lane 0 of a byte vector as a polynomial
 */
NN_FORCEINLINE poly64_t vget_low_p64(uint8x16_t p)
{
    const poly64x2_t lanes = vreinterpretq_p64_u8(p);
    return vgetq_lane_p64(lanes, 0);
}

/**
 * @brief Extracts 64-bit lane 1 of a byte vector as a polynomial
 */
NN_FORCEINLINE poly64_t vget_high_p64(uint8x16_t p)
{
    const poly64x2_t lanes = vreinterpretq_p64_u8(p);
    return vgetq_lane_p64(lanes, 1);
}

/**
 * @brief Extracts 64-bit lane 0 of a 128-bit polynomial
 */
NN_FORCEINLINE poly64_t vget_low_p64(poly128_t p)
{
    const poly64x2_t lanes = vreinterpretq_p64_p128(p);
    return vgetq_lane_p64(lanes, 0);
}

/**
 * @brief Extracts 64-bit lane 1 of a 128-bit polynomial
 */
NN_FORCEINLINE poly64_t vget_high_p64(poly128_t p)
{
    const poly64x2_t lanes = vreinterpretq_p64_p128(p);
    return vgetq_lane_p64(lanes, 1);
}

#if NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA
/**
 * @brief Adds (XORs) two 64-bit polynomials. Only needed by the Karatsuba multiplication.
 */
NN_FORCEINLINE poly64_t veor_p64(poly64_t a, poly64_t b)
{
    const poly64_t sum = a ^ b;
    return sum;
}
#endif

/**
 * @brief Adds (XORs) two 128-bit polynomials
 */
NN_FORCEINLINE poly128_t veorq_p128(poly128_t a, poly128_t b)
{
    const uint8x16_t lhs = vreinterpretq_u8_p128(a);
    const uint8x16_t rhs = vreinterpretq_u8_p128(b);
    return vreinterpretq_p128_u8(veorq_u8(lhs, rhs));
}

/**
 * @brief Builds a 128-bit polynomial from two 64-bit halves
 *
 * @param [in] low  Value placed in 64-bit lane 0
 * @param [in] high Value placed in 64-bit lane 1
 */
NN_FORCEINLINE poly128_t vcombine_p64(poly64_t low, poly64_t high)
{
    const uint64x1_t lowLane  = vmov_n_u64(low);
    const uint64x1_t highLane = vmov_n_u64(high);
    return vreinterpretq_p128_u64(vcombine_u64(lowLane, highLane));
}

/**
 * @brief Shifts each 64-bit lane of a 128-bit polynomial left by n bits
 *
 * The two lanes are shifted independently: no bit crosses from lane 0 into lane 1.
 */
template<int n>
NN_FORCEINLINE poly128_t vshlq_n_p64(poly128_t p)
{
    const uint64x2_t lanes   = vreinterpretq_u64_p128(p);
    const uint64x2_t shifted = vshlq_n_u64(lanes, n);
    return vreinterpretq_p128_u64(shifted);
}

/**
 * @brief Shifts a 128-bit polynomial left by n bits, as a single 128-bit quantity
 */
template<int n>
NN_FORCEINLINE poly128_t vshlq_n_p128(poly128_t p)
{
    const poly128_t shifted = p << n;
    return shifted;
}

/**
 * @brief Shifts a 128-bit polynomial right by n bits, as a single 128-bit quantity
 */
template<int n>
NN_FORCEINLINE poly128_t vshrq_n_p128(poly128_t p)
{
    const poly128_t shifted = p >> n;
    return shifted;
}


//=============================================================================
// GCM implementation proper

#if !NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA
/**
 * @brief Multiplies two 128-bit polynomials in the GF(2¹²⁸) Galois field
 *
 * Four 64-bit carry-less multiplications (schoolbook method) are combined into the
 * unreduced 256-bit product.
 *
 * There is a catch: GCM defines polynomials as having the lower orders on the left,
 * whereas the carry-less multiplication logically considers lower orders to be in the
 * least significant bits (i.e. on the right). As a consequence the result is offset by
 * one bit to the right. The compensating left shift is not performed here but at the
 * reduction step.
 *
 * @param [out] resHi The product's high-order half
 * @param [out] resLo The product's low-order  half
 * @param [in]  a     The polynomial to multiply
 * @param [in]  b     The multiplier polynomial
 */
NN_FORCEINLINE void GfMultClassic(uint8x16_t& resHi, uint8x16_t& resLo, uint8x16_t a, uint8x16_t b)
{
    const poly64_t aLow  = vget_low_p64(a);   // a₀
    const poly64_t aHigh = vget_high_p64(a);  // a₁
    const poly64_t bLow  = vget_low_p64(b);   // b₀
    const poly64_t bHigh = vget_high_p64(b);  // b₁

    const poly128_t lowProduct  = vmull_p64(aLow,  bLow);   // a₀∙b₀
    const poly128_t highProduct = vmull_p64(aHigh, bHigh);  // a₁∙b₁

    // Middle term: a₀∙b₁ + a₁∙b₀
    const poly128_t crossProduct = veorq_p128(vmull_p64(aLow, bHigh), vmull_p64(aHigh, bLow));

    // The middle term straddles both halves of the 256-bit result: its lower lane goes
    // into the upper lane of the low half, its upper lane into the lower lane of the high half.
    const poly128_t allZero       = vreinterpretq_p128_u8(vdupq_n_u8(0));
    const poly128_t crossIntoLow  = vcombine_p64(vget_low_p64(allZero),       vget_low_p64(crossProduct));
    const poly128_t crossIntoHigh = vcombine_p64(vget_high_p64(crossProduct), vget_high_p64(allZero));

    resLo = vreinterpretq_u8_p128(veorq_p128(lowProduct,  crossIntoLow));
    resHi = vreinterpretq_u8_p128(veorq_p128(highProduct, crossIntoHigh));
}

#else
/**
 * @brief Multiplies two 128-bit polynomials in the GF(2¹²⁸) Galois field
 *
 * This produces the same result as `GfMultClassic()` but using Karatsuba's algorithm
 * for multiplication. Only 3 multiplications are needed, at the expense of a few more
 * additions. Depending on cases, this might pipeline more efficiently.
 *
 * @param [out] resHi The product's high-order half
 * @param [out] resLo The product's low-order  half
 * @param [in]  a     The polynomial to multiply
 * @param [in]  b     The multiplier polynomial
 */
NN_FORCEINLINE void GfMultKaratsuba(uint8x16_t& resHi, uint8x16_t& resLo, uint8x16_t a, uint8x16_t b)
{
    const poly64_t aLow  = vget_low_p64(a);   // a₀
    const poly64_t aHigh = vget_high_p64(a);  // a₁
    const poly64_t bLow  = vget_low_p64(b);   // b₀
    const poly64_t bHigh = vget_high_p64(b);  // b₁

    // (a₀ + a₁)∙(b₀ + b₁)
    poly128_t crossProduct = vmull_p64(veor_p64(aLow, aHigh), veor_p64(bLow, bHigh));

    const poly128_t lowProduct  = vmull_p64(aLow,  bLow);   // a₀∙b₀
    const poly128_t highProduct = vmull_p64(aHigh, bHigh);  // a₁∙b₁

    // Removing a₀∙b₀ and a₁∙b₁ leaves the middle term: a₀∙b₁ + a₁∙b₀
    crossProduct = veorq_p128(crossProduct, lowProduct);
    crossProduct = veorq_p128(crossProduct, highProduct);

    // The middle term straddles both halves of the 256-bit result: its lower lane goes
    // into the upper lane of the low half, its upper lane into the lower lane of the high half.
    const poly128_t allZero       = vreinterpretq_p128_u8(vdupq_n_u8(0));
    const poly128_t crossIntoLow  = vcombine_p64(vget_low_p64(allZero),       vget_low_p64(crossProduct));
    const poly128_t crossIntoHigh = vcombine_p64(vget_high_p64(crossProduct), vget_high_p64(allZero));

    resLo = vreinterpretq_u8_p128(veorq_p128(lowProduct,  crossIntoLow));
    resHi = vreinterpretq_u8_p128(veorq_p128(highProduct, crossIntoHigh));
}
#endif

/**
 * @brief Performs the modular reduction of the 256-bit result of a product to 128 bits
 *
 * The product is composed of four 64-bit parts: `P₃P₂P₁P₀`
 * The modulus is composed of four 64-bit parts: `M₃M₂M₁M₀`
 * However the modulus is `x¹²⁸ + x⁷ + x² + x + 1` which means that `M₃=0`, `M₂=1` and `M₁=0`.
 * Thus the modulus can be written `10M₀`
 *
 * The modular nature of Galois fields allows us to write the following: `P₃P₂P₁P₀ = P₃P₂P₁P₀ ⊕ (n∙10M₀)`
 * If we choose `n = P₃0`, we can simplify the equation as follows:
 * `P₃P₂P₁P₀ = P₃P₂P₁P₀ ⊕ (P₃0 ∙ 10M₀)`              <br>
 * `         = P₃P₂P₁P₀ ⊕ (P₃0 ∙ (100 ⊕ M₀))`       <br>
 * `         = P₃P₂P₁P₀ ⊕ (P₃0 ∙ 100) ⊕ (P₃0 ∙ M₀)` <br>
 * `         = P₃P₂P₁P₀ ⊕ P₃000 ⊕ (P₃0 ∙ M₀)`       <br>
 * `         =  0P₂P₁P₀ ⊕ (P₃0 ∙ M₀)`
 *
 * So by computing `(P₃0 ∙ M₀)` we can cancel `P₃`. To cancel `P₂` we simply do it again with `n = P₂`.
 *
 * The only thing to be careful about is that since GCM's polynomials are bit-reversed,
 * P₀ is the higher 64 bits and P₃ is the lower 64 bits.
 */
NN_FORCEINLINE uint8x16_t GfReduce(uint8x16_t prodHi, uint8x16_t prodLo)
{
    // M₀: the (bit-reversed) modulus, pre-shifted 7 bits to the right so that the
    // results of the multiplications below can be shifted 8 bits to the left
    const poly64_t reducer = vget_lane_p64(vmov_n_p64(0xE100000000000000ull >> 7), 0);

    // ---- First reduction step: fold lane 0 of the low half (P₃0 ∙ M₀) ----
    // The 128-bit fold, shifted left by 72 bits in total, is split across the two halves.
    const poly128_t firstFold = vmull_p64(vget_low_p64(prodLo), reducer);
    const poly128_t lowHalf   = veorq_p128(vshlq_n_p128<72>(firstFold), vreinterpretq_p128_u8(prodLo));
    const poly128_t highHalf  = veorq_p128(vshrq_n_p128<56>(firstFold), vreinterpretq_p128_u8(prodHi));

    // ---- Second reduction step, merged with the 1-bit left shift that restores the
    //      correct value of the product ----
    // Top bit of the low half, to be carried into bit 0 of the high half
    const poly128_t carryBit = vshrq_n_p128<127>(lowHalf);

    // Per-lane shift: only lane 1 of the shifted low half is consumed below (P₂ ∙ M₀)
    const poly128_t lowShifted = vshlq_n_p64<1>(lowHalf);
    const poly128_t secondFold = vshlq_n_p128<8>(vmull_p64(vget_high_p64(lowShifted), reducer));

    const poly128_t highShifted = veorq_p128(vshlq_n_p128<1>(highHalf), carryBit);

    return vreinterpretq_u8_p128(veorq_p128(highShifted, secondFold));
}


/**
 * @brief Combined multiplication and reduction
 *
 * Computes the unreduced 256-bit product of a and b (Karatsuba or classic method,
 * selected by NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA), then reduces it to 128 bits.
 *
 * @param [in] a The polynomial to multiply
 * @param [in] b The multiplier polynomial
 * @return The reduced 128-bit product
 */
NN_NOINLINE uint8x16_t GfMult(uint8x16_t a, uint8x16_t b)
{
    uint8x16_t high;
    uint8x16_t low;

#if NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA
    GfMultKaratsuba(high, low, a, b);
#else
    GfMultClassic(high, low, a, b);
#endif

    const uint8x16_t reduced = GfReduce(high, low);
    return reduced;
}


/**
 * @brief Reverses the order of the 16 bytes of a block
 */
NN_FORCEINLINE uint8x16_t ByteSwap(uint8x16_t block)
{
    // Table lookup indices: byte i of the result is byte (15 - i) of the input
    static NN_ALIGNAS(16) const uint8_t s_ReversedLanes[16] =
    {
        15, 14, 13, 12, 11, 10, 9, 8,
         7,  6,  5,  4,  3,  2, 1, 0
    };

    return vqtbl1q_u8(block, vld1q_u8(s_ReversedLanes));
}

} // anonymous namespace

/**
 * @brief Stores the hash key H and, when unrolling is enabled, its powers H² to H⁴
 *
 * @param [in] h The 16-byte hash key (per the original note: the all-zero block
 *               encrypted with the cipher key). It is stored byte-reversed in m_MultH[0].
 */
void GhashImpl::InitializeHashKey(Block* h) NN_NOEXCEPT
{
    // Keep the key in byte-reversed form, which is the form GfMult() operates on here
    const uint8x16_t hashKey = ByteSwap(vld1q_u8(h->as8));
    vst1q_u8(m_MultH[0].as8, hashKey);

#if NN_CRYPTO_CONFIG_GHASH_UNROLL
    // m_MultH[i] receives H^(i+1): each power is the previous one multiplied by H
    uint8x16_t power = hashKey;
    for (int i = 1; i < 4; ++i)
    {
        power = GfMult(power, hashKey);
        vst1q_u8(m_MultH[i].as8, power);
    }
#endif
}

/*
    Sets *pGhash *= H (the hash key stored in m_MultH[0])
*/
void GhashImpl::GfMultH(Block* pGhash) NN_NOEXCEPT
{
    const uint8x16_t hashKey = vld1q_u8(m_MultH[0].as8);
    const uint8x16_t product = GfMult(vld1q_u8(pGhash->as8), hashKey);

    vst1q_u8(pGhash->as8, product);
}

/**
 * @brief Feeds more input data into the running hash
 *
 * Requires m_State to be State_ProcessingData. dataSize is added to m_MessageSizeLo
 * before the data is handed over to UpdateImpl().
 *
 * @param [in] pData    The data to hash
 * @param [in] dataSize Size of the data, in bytes
 */
void GhashImpl::Update(const void* pData, size_t dataSize) NN_NOEXCEPT
{
    NN_SDK_REQUIRES(m_State == State_ProcessingData, "Invalid state. Please restart from Reset().");

    // Account for the new data before processing it
    m_MessageSizeLo += dataSize;
    UpdateImpl<void>(this, pData, dataSize); // the template argument (void) is a dummy parameter
}

/**
 * @brief Absorbs data into the partially filled block held in the accumulator
 *
 * The bytes are XORed into m_Ghash right after the m_BufferedByte bytes absorbed so far.
 * The accumulator is byte-reversed, hence the descending index. Once a whole block has
 * been absorbed, the accumulator is multiplied by H and the buffered count restarts at 0.
 *
 * @param [in] pData8   The data to absorb
 * @param [in] dataSize Size of the data, in bytes
 */
void GhashImpl::ProcessPartialData(const uint8_t* pData8, size_t dataSize) NN_NOEXCEPT
{
    const int byteCount = static_cast<int>(dataSize);
    for (int offset = 0; offset < byteCount; ++offset)
    {
        m_Ghash.as8[BlockSize - m_BufferedByte - offset - 1] ^= pData8[offset];
    }
    m_BufferedByte += dataSize;

    // Nothing more to do until the block is complete
    if (m_BufferedByte != BlockSize)
    {
        return;
    }

    GfMultH(&m_Ghash);
    m_BufferedByte = 0;
}

/**
 * @brief Hashes a run of full blocks into the accumulator m_Ghash
 *
 * When unrolling is enabled, blocks are consumed four at a time using the precomputed
 * powers of the hash key (m_MultH[0..3]); any remaining blocks are hashed one by one.
 * A count of zero (or less) leaves the accumulator unchanged.
 *
 * @param [in] pData8    The data to hash; numBlocks * BlockSize bytes are read
 * @param [in] numBlocks Number of full blocks to hash
 */
void GhashImpl::ProcessBlocksGeneric(const uint8_t* pData8, int numBlocks) NN_NOEXCEPT
{
    const uint8x16_t key = vld1q_u8(m_MultH[0].as8);

    // Load hash accumulator
    uint8x16_t x = vld1q_u8(m_Ghash.as8);

#if NN_CRYPTO_CONFIG_GHASH_UNROLL
    // Hash 4 blocks at a time
    //
    // The accumulator is computed as follows:
    //   A(i) = [A(i-1) ⊕ D(i)] ∙ H
    // Which, when applied 4 times yields the following:
    //   A(i) = [D(i) ∙ H] ⊕ [D(i-1) ∙ H²] ⊕ [D(i-2) ∙ H³] ⊕ [[D(i-3)⊕A(i-4)] ∙ H⁴]
    //
    // By doing so, we perform the reduction step only once every four blocks. It simply
    // requires to have precomputed powers of the hash key.
    //
    // The batch size has the same (signed) type as numBlocks so that the comparisons and
    // the subtraction below do not silently convert numBlocks to an unsigned value.
    const int BatchBlocksCount = 4;
    if (numBlocks >= BatchBlocksCount)
    {
        const uint8x16_t key2 = vld1q_u8(m_MultH[1].as8);
        const uint8x16_t key3 = vld1q_u8(m_MultH[2].as8);
        const uint8x16_t key4 = vld1q_u8(m_MultH[3].as8);

#if NN_CRYPTO_CONFIG_GHASH_USE_ASM
        // Table lookup indices that exchange the two 64-bit halves of a vector
        static NN_ALIGNAS(16) const uint8_t g_Reverse64[16] = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
        const uint8x16_t reverse64 = vld1q_u8(g_Reverse64);

        do
        {
            register uint8x16_t block1 __asm__("v16") = vld1q_u8(pData8); pData8+=BlockSize;
            register uint8x16_t block2 __asm__("v17") = vld1q_u8(pData8); pData8+=BlockSize;
            register uint8x16_t block3 __asm__("v18") = vld1q_u8(pData8); pData8+=BlockSize;
            register uint8x16_t block4 __asm__("v19") = vld1q_u8(pData8); pData8+=BlockSize;

            block1 = ByteSwap(block1);
            block2 = ByteSwap(block2);
            block3 = ByteSwap(block3);
            block4 = ByteSwap(block4);

            // The accumulator is folded into the oldest block of the batch
            block1 = veorq_u8(x, block1);

            // Hand written GfMultClassic() applied four times (about 20% faster than the intrinsics code)
            // In the comments below, a is the data block and b the matching power of the hash key.
            register uint8x16_t prodLo1 __asm__("v20");
            register uint8x16_t prodLo2 __asm__("v21");
            register uint8x16_t prodLo3 __asm__("v22");
            register uint8x16_t prodLo4 __asm__("v23");
            register uint8x16_t prodHi1 __asm__("v24");
            register uint8x16_t prodHi2 __asm__("v25");
            register uint8x16_t prodHi3 __asm__("v26");
            register uint8x16_t prodHi4 __asm__("v27");
            __asm__ volatile
            (
                // a₀∙b₀
                "pmull  %[prodLo1].1q, %[block1].1d,    %[key4].1d\n"
                "pmull  %[prodLo2].1q, %[block2].1d,    %[key3].1d\n"
                "pmull  %[prodLo3].1q, %[block3].1d,    %[key2].1d\n"
                "pmull  %[prodLo4].1q, %[block4].1d,    %[key].1d\n"
                // a₁∙b₁
                "pmull2 %[prodHi1].1q, %[block1].2d,    %[key4].2d\n"
                "pmull2 %[prodHi2].1q, %[block2].2d,    %[key3].2d\n"
                "pmull2 %[prodHi3].1q, %[block3].2d,    %[key2].2d\n"
                "pmull2 %[prodHi4].1q, %[block4].2d,    %[key].2d\n"
                // Swap a₀ and a₁
                "tbl    %[block1].16b, {%[block1].16b}, %[reverse64].16b\n"
                "tbl    %[block2].16b, {%[block2].16b}, %[reverse64].16b\n"
                "tbl    %[block3].16b, {%[block3].16b}, %[reverse64].16b\n"
                "tbl    %[block4].16b, {%[block4].16b}, %[reverse64].16b\n"
                // a₁∙b₀
                "pmull  v28.1q,        %[block1].1d,    %[key4].1d\n"
                "pmull  v29.1q,        %[block2].1d,    %[key3].1d\n"
                "pmull  v30.1q,        %[block3].1d,    %[key2].1d\n"
                "pmull  v31.1q,        %[block4].1d,    %[key].1d\n"
                // a₀∙b₁
                "pmull2 %[block1].1q,  %[block1].2d,    %[key4].2d\n"
                "pmull2 %[block2].1q,  %[block2].2d,    %[key3].2d\n"
                "pmull2 %[block3].1q,  %[block3].2d,    %[key2].2d\n"
                "pmull2 %[block4].1q,  %[block4].2d,    %[key].2d\n"
                // a₀∙b₁ + a₁∙b₀
                "eor    %[block1].16b, %[block1].16b,   v28.16b\n"
                "eor    %[block2].16b, %[block2].16b,   v29.16b\n"
                "eor    %[block3].16b, %[block3].16b,   v30.16b\n"
                "eor    %[block4].16b, %[block4].16b,   v31.16b\n"
                // Put it all together
                "mov    v28.d[1], xzr\n"
                "mov    v29.d[0], xzr\n"
                "mov    v28.d[0], %[block1].d[1]\n"
                "mov    v29.d[1], %[block1].d[0]\n"
                "eor    %[prodHi1].16b, %[prodHi1].16b, v28.16b\n"
                "eor    %[prodLo1].16b, %[prodLo1].16b, v29.16b\n"
                "mov    v28.d[0], %[block2].d[1]\n"
                "mov    v29.d[1], %[block2].d[0]\n"
                "eor    %[prodHi2].16b, %[prodHi2].16b, v28.16b\n"
                "eor    %[prodLo2].16b, %[prodLo2].16b, v29.16b\n"
                "mov    v28.d[0], %[block3].d[1]\n"
                "mov    v29.d[1], %[block3].d[0]\n"
                "eor    %[prodHi3].16b, %[prodHi3].16b, v28.16b\n"
                "eor    %[prodLo3].16b, %[prodLo3].16b, v29.16b\n"
                "mov    v28.d[0], %[block4].d[1]\n"
                "mov    v29.d[1], %[block4].d[0]\n"
                "eor    %[prodHi4].16b, %[prodHi4].16b, v28.16b\n"
                "eor    %[prodLo4].16b, %[prodLo4].16b, v29.16b\n"
                : [prodLo1]"=&w"(prodLo1), [prodLo2]"=&w"(prodLo2), [prodLo3]"=&w"(prodLo3), [prodLo4]"=&w"(prodLo4),
                  [prodHi1]"=&w"(prodHi1), [prodHi2]"=&w"(prodHi2), [prodHi3]"=&w"(prodHi3), [prodHi4]"=&w"(prodHi4),
                  [block1]"+w"(block1), [block2]"+w"(block2), [block3]"+w"(block3), [block4]"+w"(block4)
                : [key4]"w"(key4), [key3]"w"(key3), [key2]"w"(key2), [key]"w"(key), [reverse64]"w"(reverse64)
                : "v28", "v29", "v30", "v31"
            );

            // Sum the 4 products and reduce the result
            prodHi1 = veorq_u8(prodHi1, prodHi2);
            prodLo1 = veorq_u8(prodLo1, prodLo2);
            prodHi1 = veorq_u8(prodHi1, prodHi3);
            prodLo1 = veorq_u8(prodLo1, prodLo3);
            prodHi1 = veorq_u8(prodHi1, prodHi4);
            prodLo1 = veorq_u8(prodLo1, prodLo4);
            x = GfReduce(prodHi1, prodLo1);

            numBlocks -= BatchBlocksCount;

        } while (numBlocks >= BatchBlocksCount);
#else  // NN_CRYPTO_CONFIG_GHASH_USE_ASM
        do
        {
            uint8x16_t block1 = vld1q_u8(pData8); pData8+=BlockSize;
            uint8x16_t block2 = vld1q_u8(pData8); pData8+=BlockSize;
            uint8x16_t block3 = vld1q_u8(pData8); pData8+=BlockSize;
            uint8x16_t block4 = vld1q_u8(pData8); pData8+=BlockSize;

            block1 = ByteSwap(block1);
            block2 = ByteSwap(block2);
            block3 = ByteSwap(block3);
            block4 = ByteSwap(block4);

            // The accumulator is folded into the oldest block of the batch
            block1 = veorq_u8(x, block1);

            uint8x16_t prodLo1, prodLo2, prodLo3, prodLo4;
            uint8x16_t prodHi1, prodHi2, prodHi3, prodHi4;

#if NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA
            GfMultKaratsuba(prodHi1, prodLo1, block1, key4);
            GfMultKaratsuba(prodHi2, prodLo2, block2, key3);
            GfMultKaratsuba(prodHi3, prodLo3, block3, key2);
            GfMultKaratsuba(prodHi4, prodLo4, block4, key);
#else
            GfMultClassic(prodHi1, prodLo1, block1, key4);
            GfMultClassic(prodHi2, prodLo2, block2, key3);
            GfMultClassic(prodHi3, prodLo3, block3, key2);
            GfMultClassic(prodHi4, prodLo4, block4, key);
#endif // NN_CRYPTO_CONFIG_GHASH_GFMULT_KARATSUBA

            // Sum the 4 products and reduce the result
            prodHi1 = veorq_u8(prodHi1, prodHi2);
            prodLo1 = veorq_u8(prodLo1, prodLo2);
            prodHi1 = veorq_u8(prodHi1, prodHi3);
            prodLo1 = veorq_u8(prodLo1, prodLo3);
            prodHi1 = veorq_u8(prodHi1, prodHi4);
            prodLo1 = veorq_u8(prodLo1, prodLo4);
            x = GfReduce(prodHi1, prodLo1);

            numBlocks -= BatchBlocksCount;

        } while (numBlocks >= BatchBlocksCount);
#endif // NN_CRYPTO_CONFIG_GHASH_USE_ASM
    }
#endif // NN_CRYPTO_CONFIG_GHASH_UNROLL

    // Process the remaining full blocks one at a time.
    // The explicit "> 0" keeps a negative count from turning into a runaway loop.
    while (numBlocks-- > 0)
    {
        uint8x16_t block = vld1q_u8(pData8);
        pData8 += BlockSize;
        block = ByteSwap(block);
        x = veorq_u8(x, block);
        x = GfMult(x, key);
    }

    // Store hash accumulator
    vst1q_u8(m_Ghash.as8, x);
} // NOLINT(impl/function_size)

/**
 * @brief Absorbs trailing data that does not fill a whole block
 *
 * The bytes are XORed into the (byte-reversed) accumulator starting from its last byte,
 * and their count is recorded in m_BufferedByte. No multiplication by H happens here.
 *
 * @param [in] pData8   The data to absorb
 * @param [in] dataSize Size of the data, in bytes
 */
void GhashImpl::ProcessRemainingData(const uint8_t* pData8, size_t dataSize) NN_NOEXCEPT
{
    const int tailSize = static_cast<int>(dataSize);
    for (int offset = 0; offset < tailSize; ++offset)
    {
        m_Ghash.as8[BlockSize - offset - 1] ^= pData8[offset];
    }

    m_BufferedByte = dataSize;
}

/**
 * @brief Finalizes the hash (on first call) and copies it out
 *
 * While m_State is State_ProcessingData, the pending partial block (if any) and the
 * length block are hashed, the result is stored back into m_Ghash and m_State becomes
 * State_Done. In every case BlockSize bytes of m_Ghash are copied to pHash.
 *
 * @param [out] pHash    Receives BlockSize bytes of hash
 * @param [in]  hashSize Size of the output buffer; must be at least BlockSize
 */
void GhashImpl::GetGhash(void* pHash, size_t hashSize) NN_NOEXCEPT
{
    NN_SDK_REQUIRES(hashSize >= BlockSize);
    NN_UNUSED(hashSize);

    if (m_State == State_ProcessingData)
    {
        const uint8x16_t h = vld1q_u8(m_MultH[0].as8);
        uint8x16_t acc = vld1q_u8(m_Ghash.as8);

        // A partially filled block has been XORed in but not multiplied by H yet
        if (m_BufferedByte > 0)
        {
            acc = GfMult(acc, h);
        }

        // Length block: the two 64-bit byte counters (per the original note, AAD length
        // and message length), converted to bit counts by the shift (x8)
        const uint64x2_t byteCounts = vcombine_u64(vdup_n_u64(m_MessageSizeLo), vdup_n_u64(m_MessageSizeHi));
        const uint64x2_t bitCounts  = vshlq_n_u64(byteCounts, 3);
        acc = veorq_u8(acc, vreinterpretq_u8_u64(bitCounts));

        // Final multiplication, then back to the external byte order
        acc = ByteSwap(GfMult(acc, h));

        vst1q_u8(m_Ghash.as8, acc);
        m_State = State_Done;
    }

    std::memcpy(pHash, &m_Ghash.as8, BlockSize);
}

}}} // namespace nn::crypto::detail
