﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#pragma once

#include <arm_neon.h>
#include <nn/nn_Macro.h>

namespace nn { namespace audio { namespace dsp { namespace detail {

// Mixes `input` into `output` in place, two samples per NEON iteration.
//
// For each of the first `sampleCount` samples the 64-bit product
// input[n] * gain is rounded and shifted right by 15 bits, saturated to
// 32 bits, and then added to output[n] with a saturating add.
//
// output      : accumulation buffer, read and written.
// input       : samples to mix in, read only.
// gain        : multiplier; the product is scaled down by 2^15.
// sampleCount : number of samples to process. Zero or negative counts do
//               nothing; an odd count is handled without touching memory
//               beyond `sampleCount` samples.
NN_FORCEINLINE void ApplyMix2(int32_t* output, const int32_t* input, int32_t gain, int sampleCount) NN_NOEXCEPT
{
    // Shift applied to the 64-bit product. An enum keeps it an integer
    // constant expression, as the intrinsic's immediate operand requires.
    enum { Q = 15 };
    const int32x2_t gain2 = vdup_n_s32(gain);

    // Full two-sample vectors. Testing `remaining >= 2` instead of
    // `!= 0` guarantees termination for odd or negative counts.
    int remaining = sampleCount;
    for (; remaining >= 2; remaining -= 2)
    {
        const int32x2_t in2 = vld1_s32(input);
        const int64x2_t product = vmull_s32(in2, gain2);    // widening 32x32 -> 64 multiply
        const int32x2_t mix2 = vqrshrn_n_s64(product, Q);   // round, shift, saturate to 32 bits
        const int32x2_t out2 = vqadd_s32(vld1_s32(output), mix2);
        vst1_s32(output, out2);
        input += 2;
        output += 2;
    }

    // Odd trailing sample: duplicate it across both lanes so the same
    // instructions produce the result, then store lane 0 only.
    if (remaining == 1)
    {
        const int32x2_t in2 = vdup_n_s32(*input);
        const int64x2_t product = vmull_s32(in2, gain2);
        const int32x2_t mix2 = vqrshrn_n_s64(product, Q);
        const int32x2_t out2 = vqadd_s32(vdup_n_s32(*output), mix2);
        vst1_lane_s32(output, out2, 0);
    }
}

// Mixes `input` into `output` in place, four samples per NEON iteration.
//
// For each of the first `sampleCount` samples the 64-bit product
// input[n] * gain is rounded and shifted right by 15 bits, saturated to
// 32 bits, and then added to output[n] with a saturating add.
//
// output      : accumulation buffer, read and written.
// input       : samples to mix in, read only.
// gain        : multiplier; the product is scaled down by 2^15.
// sampleCount : number of samples to process. Zero or negative counts do
//               nothing; counts that are not a multiple of four are
//               handled without touching memory beyond `sampleCount`
//               samples.
NN_FORCEINLINE void ApplyMix4(int32_t* output, const int32_t* input, int32_t gain, int sampleCount) NN_NOEXCEPT
{
    // Shift applied to the 64-bit product. An enum keeps it an integer
    // constant expression, as the intrinsic's immediate operand requires.
    enum { Q = 15 };
    const int32x2_t gain2 = vdup_n_s32(gain);

    // Full four-sample vectors. Testing `remaining >= 4` instead of
    // `!= 0` guarantees termination for any count.
    int remaining = sampleCount;
    for (; remaining >= 4; remaining -= 4)
    {
        const int32x4_t in4 = vld1q_s32(input);
        const int32x4_t acc4 = vld1q_s32(output);

        // The widening multiply works on 64-bit halves, so process the
        // low and high pairs separately and recombine them.
        const int64x2_t productLow = vmull_s32(vget_low_s32(in4), gain2);
        const int64x2_t productHigh = vmull_s32(vget_high_s32(in4), gain2);
        const int32x4_t mix4 = vcombine_s32(vqrshrn_n_s64(productLow, Q),
                                            vqrshrn_n_s64(productHigh, Q));

        vst1q_s32(output, vqaddq_s32(acc4, mix4));

        input += 4;
        output += 4;
    }

    // Up to three trailing samples, one at a time: each is duplicated
    // across a 64-bit vector so the same instructions produce the result,
    // and only lane 0 is stored.
    for (; remaining > 0; --remaining)
    {
        const int32x2_t in2 = vdup_n_s32(*input);
        const int32x2_t mix2 = vqrshrn_n_s64(vmull_s32(in2, gain2), Q);
        const int32x2_t out2 = vqadd_s32(vdup_n_s32(*output), mix2);
        vst1_lane_s32(output, out2, 0);
        ++input;
        ++output;
    }
}

// Mixes `input` into `output` in place: output[n] += input[n] * gain,
// four samples per NEON iteration.
//
// output      : accumulation buffer, read and written.
// input       : samples to mix in, read only.
// gain        : multiplier applied to every input sample.
// sampleCount : number of samples to process. Zero or negative counts do
//               nothing; counts that are not a multiple of four are
//               handled without touching memory beyond `sampleCount`
//               samples.
NN_FORCEINLINE void ApplyMix4(float* output, const float* input, float gain, int sampleCount) NN_NOEXCEPT
{
    const float32x4_t gain4 = vdupq_n_f32(gain);

    // Full four-sample vectors. Testing `remaining >= 4` instead of
    // `!= 0` guarantees termination for any count.
    int remaining = sampleCount;
    for (; remaining >= 4; remaining -= 4)
    {
        const float32x4_t in4 = vld1q_f32(input);
        const float32x4_t acc4 = vld1q_f32(output);
        const float32x4_t scaled4 = vmulq_f32(in4, gain4);
        vst1q_f32(output, vaddq_f32(acc4, scaled4));
        input += 4;
        output += 4;
    }

    // Up to three trailing samples, one at a time, using the same NEON
    // multiply and add as the vector path; only lane 0 is stored.
    const float32x2_t gain2 = vget_low_f32(gain4);
    for (; remaining > 0; --remaining)
    {
        const float32x2_t scaled2 = vmul_f32(vdup_n_f32(*input), gain2);
        const float32x2_t out2 = vadd_f32(vdup_n_f32(*output), scaled2);
        vst1_lane_f32(output, out2, 0);
        ++input;
        ++output;
    }
}

// Mixes `input` into `output` in place with a gain that advances by
// `delta` on every sample, two samples per NEON iteration.
//
// Sample n uses gain + n * delta as its multiplier. The 64-bit product is
// rounded and shifted right by 15 bits, saturated to 32 bits, and then
// added to output[n] with a saturating add. The running gain itself is
// updated with a plain (non-saturating) 32-bit add.
//
// output      : accumulation buffer, read and written.
// input       : samples to mix in, read only.
// gain        : multiplier for the first sample; products are scaled
//               down by 2^15.
// delta       : per-sample gain increment.
// sampleCount : number of samples to process. Zero or negative counts do
//               nothing; an odd count is handled without touching memory
//               beyond `sampleCount` samples.
NN_FORCEINLINE void ApplyMixRamp2(int32_t* output, const int32_t* input, int32_t gain, int32_t delta, int sampleCount) NN_NOEXCEPT
{
    // Shift applied to the 64-bit product. An enum keeps it an integer
    // constant expression, as the intrinsic's immediate operand requires.
    enum { Q = 15 };

    // Lane k holds the gain for sample k of the current pair.
    int32x2_t gain2 = vcreate_s32(0);
    gain2 = vset_lane_s32(gain, gain2, 0);
    gain2 = vset_lane_s32(gain + delta, gain2, 1);
    // Each iteration consumes two samples, so both lanes step by 2 * delta.
    const int32x2_t delta2 = vdup_n_s32(delta * 2);

    // Full two-sample vectors. Testing `remaining >= 2` instead of
    // `!= 0` guarantees termination for odd or negative counts.
    int remaining = sampleCount;
    for (; remaining >= 2; remaining -= 2)
    {
        const int32x2_t in2 = vld1_s32(input);
        const int64x2_t product = vmull_s32(in2, gain2);
        const int32x2_t mix2 = vqrshrn_n_s64(product, Q);
        const int32x2_t out2 = vqadd_s32(vld1_s32(output), mix2);
        vst1_s32(output, out2);
        input += 2;
        output += 2;
        gain2 = vadd_s32(gain2, delta2);
    }

    // Odd trailing sample: lane 0 of gain2 already holds its gain, so
    // duplicate the sample across both lanes and store lane 0 only.
    if (remaining == 1)
    {
        const int32x2_t in2 = vdup_n_s32(*input);
        const int64x2_t product = vmull_s32(in2, gain2);
        const int32x2_t mix2 = vqrshrn_n_s64(product, Q);
        const int32x2_t out2 = vqadd_s32(vdup_n_s32(*output), mix2);
        vst1_lane_s32(output, out2, 0);
    }
}

// Mixes `input` into `output` in place with a gain that advances by
// `delta` on every sample, four samples per NEON iteration.
//
// Sample n uses gain + n * delta as its multiplier. The 64-bit product is
// rounded and shifted right by 15 bits, saturated to 32 bits, and then
// added to output[n] with a saturating add. The running gain itself is
// updated with a plain (non-saturating) 32-bit add.
//
// output      : accumulation buffer, read and written.
// input       : samples to mix in, read only.
// gain        : multiplier for the first sample; products are scaled
//               down by 2^15.
// delta       : per-sample gain increment.
// sampleCount : number of samples to process. Zero or negative counts do
//               nothing; counts that are not a multiple of four are
//               handled without touching memory beyond `sampleCount`
//               samples.
NN_FORCEINLINE void ApplyMixRamp4(int32_t* output, const int32_t* input, int32_t gain, int32_t delta, int sampleCount) NN_NOEXCEPT
{
    // Shift applied to the 64-bit product. An enum keeps it an integer
    // constant expression, as the intrinsic's immediate operand requires.
    enum { Q = 15 };

    // Lane k holds the gain for sample k of the current group of four.
    int32x4_t gain4 = vdupq_n_s32(0);
    gain4 = vsetq_lane_s32(gain + delta * 0, gain4, 0);
    gain4 = vsetq_lane_s32(gain + delta * 1, gain4, 1);
    gain4 = vsetq_lane_s32(gain + delta * 2, gain4, 2);
    gain4 = vsetq_lane_s32(gain + delta * 3, gain4, 3);
    // Each iteration consumes four samples, so every lane steps by 4 * delta.
    const int32x4_t delta4 = vdupq_n_s32(delta * 4);

    // Full four-sample vectors. Testing `remaining >= 4` instead of
    // `!= 0` guarantees termination for any count.
    int remaining = sampleCount;
    for (; remaining >= 4; remaining -= 4)
    {
        const int32x4_t in4 = vld1q_s32(input);
        const int32x4_t acc4 = vld1q_s32(output);

        // The widening multiply works on 64-bit halves, so process the
        // low and high pairs separately and recombine them.
        const int64x2_t productLow = vmull_s32(vget_low_s32(in4), vget_low_s32(gain4));
        const int64x2_t productHigh = vmull_s32(vget_high_s32(in4), vget_high_s32(gain4));
        const int32x4_t mix4 = vcombine_s32(vqrshrn_n_s64(productLow, Q),
                                            vqrshrn_n_s64(productHigh, Q));

        vst1q_s32(output, vqaddq_s32(acc4, mix4));
        input += 4;
        output += 4;
        gain4 = vaddq_s32(gain4, delta4);
    }

    // Up to three trailing samples: run one more vector step on
    // zero-padded scratch copies so each sample meets the gain in its own
    // lane, then copy back only the samples that actually exist.
    if (remaining > 0)
    {
        int32_t inTail[4] = { 0, 0, 0, 0 };
        int32_t outTail[4] = { 0, 0, 0, 0 };
        for (int k = 0; k < remaining; ++k)
        {
            inTail[k] = input[k];
            outTail[k] = output[k];
        }

        const int32x4_t in4 = vld1q_s32(inTail);
        const int32x4_t acc4 = vld1q_s32(outTail);
        const int64x2_t productLow = vmull_s32(vget_low_s32(in4), vget_low_s32(gain4));
        const int64x2_t productHigh = vmull_s32(vget_high_s32(in4), vget_high_s32(gain4));
        const int32x4_t mix4 = vcombine_s32(vqrshrn_n_s64(productLow, Q),
                                            vqrshrn_n_s64(productHigh, Q));
        vst1q_s32(outTail, vqaddq_s32(acc4, mix4));

        for (int k = 0; k < remaining; ++k)
        {
            output[k] = outTail[k];
        }
    }
}

// Mixes `input` into `output` in place with a gain that advances by
// `delta` on every sample: output[n] += input[n] * (gain + n * delta),
// four samples per NEON iteration. The per-lane gains are seeded as
// gain + k * delta and then stepped by 4 * delta after each group.
//
// output      : accumulation buffer, read and written.
// input       : samples to mix in, read only.
// gain        : multiplier for the first sample.
// delta       : per-sample gain increment.
// sampleCount : number of samples to process. Zero or negative counts do
//               nothing; counts that are not a multiple of four are
//               handled without touching memory beyond `sampleCount`
//               samples.
NN_FORCEINLINE void ApplyMixRamp4(float* output, const float* input, float gain, float delta, int sampleCount) NN_NOEXCEPT
{
    // Lane k holds the gain for sample k of the current group of four.
    float32x4_t gain4 = vdupq_n_f32(0);
    gain4 = vsetq_lane_f32(gain + delta * 0.0f, gain4, 0);
    gain4 = vsetq_lane_f32(gain + delta * 1.0f, gain4, 1);
    gain4 = vsetq_lane_f32(gain + delta * 2.0f, gain4, 2);
    gain4 = vsetq_lane_f32(gain + delta * 3.0f, gain4, 3);
    // Each iteration consumes four samples, so every lane steps by 4 * delta.
    const float32x4_t delta4 = vdupq_n_f32(delta * 4.0f);

    // Full four-sample vectors. Testing `remaining >= 4` instead of
    // `!= 0` guarantees termination for any count.
    int remaining = sampleCount;
    for (; remaining >= 4; remaining -= 4)
    {
        const float32x4_t in4 = vld1q_f32(input);
        const float32x4_t acc4 = vld1q_f32(output);
        const float32x4_t scaled4 = vmulq_f32(in4, gain4);
        vst1q_f32(output, vaddq_f32(acc4, scaled4));
        gain4 = vaddq_f32(gain4, delta4);
        input += 4;
        output += 4;
    }

    // Up to three trailing samples: run one more vector step on
    // zero-padded scratch copies so each sample meets the gain in its own
    // lane, then copy back only the samples that actually exist.
    if (remaining > 0)
    {
        float inTail[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
        float outTail[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
        for (int k = 0; k < remaining; ++k)
        {
            inTail[k] = input[k];
            outTail[k] = output[k];
        }

        const float32x4_t in4 = vld1q_f32(inTail);
        const float32x4_t acc4 = vld1q_f32(outTail);
        const float32x4_t scaled4 = vmulq_f32(in4, gain4);
        vst1q_f32(outTail, vaddq_f32(acc4, scaled4));

        for (int k = 0; k < remaining; ++k)
        {
            output[k] = outTail[k];
        }
    }
}

}}}}  // namespace nn::audio::dsp::detail
