﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#pragma once

#include <arm_neon.h>

#include "../audio_DspCommon.h"
#include "../audio_Qf.h"
#include "../audio_DelayLine.h"
#include "../audio_EffectDelay.h"

namespace nn { namespace audio { namespace detail {

// Single-channel delay entry point.
//
// In this header the function is a placeholder: it reads none of its
// arguments, writes nothing to pOutData1, and its only statement is an
// assertion that is given a constant false condition and the message
// "Not implemented.".
// NOTE(review): what a failed NN_AUDIO_DSP_ASSERT does (abort, log, or nothing
// in release builds) depends on the macro definition, which is not in this file.
NN_FORCEINLINE void ApplyDelay1ch(
    const DelayParameter* delay,
    DelayState* state,
    int32_t *pInData1,
    int32_t *pOutData1,
    int sampleCount)
{
    // Placeholder body: always trips the assertion.
    NN_AUDIO_DSP_ASSERT(false, "Not implemented.");
}

// Two-channel delay entry point.
//
// In this header the function is a placeholder: the only active statement is
// an assertion that is given a constant false condition and the message
// "Not implemented.". The active code reads none of the parameters.
//
// The region disabled with #if 0 below is an unfinished NEON draft that
// handles one sample per channel. It is never compiled. As written it refers
// to names that are not declared in this function (feedbackCross,
// feedbackDirect, gain, LowPassFilterHistory, lowPassFilterCoefficientA1,
// lowPassFilterCoefficientB0, OutGain, dryGain) and contains no loop over
// sampleCount, so it cannot simply be switched on.
// NOTE(review): those names presumably correspond to DelayParameter /
// DelayState members - confirm against the struct definitions before reviving
// the draft.
NN_FORCEINLINE void ApplyDelay2ch(const DelayParameter* delay, DelayState* state, int32_t **ppInData, int32_t **ppOutData, int sampleCount)
{
    NN_AUDIO_DSP_ASSERT(false, "Not implemented.");
#if 0
    // get input samples (as int32_t)
    // Lane 0 <- next sample of channel 0, lane 1 <- next sample of channel 1;
    // both input pointers are post-incremented. The pair is then shifted left
    // by (QF_FRACTIONAL_BIT_COUNT - 8).
    int32x2_t scVec = vdup_n_s32(0);
    scVec = vld1_lane_s32(ppInData[0]++, scVec, 0); // VLD1.32 {d0[0]}, [r0]
    scVec = vld1_lane_s32(ppInData[1]++, scVec, 1);
    scVec = vshl_n_s32(scVec, (QF_FRACTIONAL_BIT_COUNT - 8));   // VSHL.I32 d0,d0,#0

    // get output samples for each delay (as qf)
    // Lane 0 <- value at pDelayLine[0].p, lane 1 <- value at pDelayLine[1].p.
    int32x2_t outSampVec = vdup_n_s32(0);
    outSampVec = vld1_lane_s32((&state->pDelayLine[0])->p, outSampVec, 0); // VLD1.32 {d0[0]}, [r0]
    outSampVec = vld1_lane_s32((&state->pDelayLine[1])->p, outSampVec, 1);

    // Calc input to each lpf and delay for channels 0 and 1
    // First pass: feedbackVec = {feedbackDirect, feedbackCross}, so the summed
    // products give direct * out[0] + cross * out[1], stored in sVec lane 0.
    int32x2_t feedbackVec = vdup_n_s32(0);
    int64x2_t  multiplyVec;
    feedbackVec = vld1_lane_s32(&feedbackCross, feedbackVec, 1); // VLD1.32 {d0[0]}, [r0]
    feedbackVec = vld1_lane_s32(&feedbackDirect, feedbackVec, 0);
    multiplyVec = vmull_s32(outSampVec, feedbackVec);  // VMULL.S32 q0,d0,d0 (widening 32x32 -> 64 multiply)

    int64_t sTemp;
    sTemp = vgetq_lane_s64(multiplyVec, 0) + vgetq_lane_s64(multiplyVec, 1);    // VMOV r0,r0,d0

    int64x2_t sVec = vdupq_n_s64(0);
    sVec = vld1q_lane_s64(&sTemp, sVec, 0);

    // Second pass: feedbackVec = {feedbackCross, feedbackDirect}, so the summed
    // products give cross * out[0] + direct * out[1], stored in sVec lane 1.
    feedbackVec = vld1_lane_s32(&feedbackCross, feedbackVec, 0); // VLD1.32 {d0[0]}, [r0]
    feedbackVec = vld1_lane_s32(&feedbackDirect, feedbackVec, 1);
    multiplyVec = vmull_s32(outSampVec, feedbackVec);  // VMULL.S32 q0,d0,d0 (widening 32x32 -> 64 multiply)

    sTemp = vgetq_lane_s64(multiplyVec, 0) + vgetq_lane_s64(multiplyVec, 1);    // VMOV r0,r0,d0
    sVec = vld1q_lane_s64(&sTemp, sVec, 1);

    // Add gain * input to the 64-bit feedback sums, then narrow back to 32 bits
    // with a rounding right shift by QF_FRACTIONAL_BIT_COUNT.
    int32x2_t gainVec;
    gainVec = vdup_n_s32(gain);
    //int32x2_t   vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c);        //vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]    VMLA.I32 d0,d0,d0
    sVec = vmlal_s32(sVec, gainVec, scVec);   //int64x2_t  vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLAL.S32 q0,d0,d0
    int32x2_t tempsVec;
    tempsVec = vrshrn_n_s64(sVec, (QF_FRACTIONAL_BIT_COUNT));

    // tempsVec, outSampVec, scVec need to be kept, others can be re-used.
    // re-use feedbackVec and gainVec, multiplyVec
    // Per lane: history * A1 + tempsVec * B0, rounded and narrowed; the result
    // is written back to LowPassFilterHistory and passed to DelayLineTick for
    // each of the two delay lines.
    feedbackVec = vld1_s32(LowPassFilterHistory);
    gainVec = vdup_n_s32(lowPassFilterCoefficientA1);
    multiplyVec = vmull_s32(feedbackVec, gainVec);  // VMULL.S32 q0,d0,d0 (widening 32x32 -> 64 multiply)
    gainVec = vdup_n_s32(lowPassFilterCoefficientB0);
    multiplyVec = vmlal_s32(multiplyVec, gainVec, tempsVec);
    tempsVec = vrshrn_n_s64(multiplyVec, (QF_FRACTIONAL_BIT_COUNT));  // re-use tempsVec
    vst1_s32(LowPassFilterHistory, tempsVec);
    DelayLineTick(&state->pDelayLine[0], vget_lane_s32(tempsVec, 0));
    DelayLineTick(&state->pDelayLine[1], vget_lane_s32(tempsVec, 1));

    // Output mix per lane: outSampVec * OutGain + scVec * dryGain, rounded and
    // narrowed, then shifted right by (QF_FRACTIONAL_BIT_COUNT - 8) to undo the
    // input pre-shift before storing one sample to each output channel.
    gainVec = vdup_n_s32(OutGain);
    multiplyVec = vmull_s32(outSampVec, gainVec);
    gainVec = vdup_n_s32(dryGain);
    multiplyVec = vmlal_s32(multiplyVec, gainVec, scVec);
    tempsVec = vrshrn_n_s64(multiplyVec, (QF_FRACTIONAL_BIT_COUNT));
    outSampVec = vshr_n_s32(tempsVec, (QF_FRACTIONAL_BIT_COUNT - 8));
    *ppOutData[0]++ = vget_lane_s32(outSampVec, 0);
    *ppOutData[1]++ = vget_lane_s32(outSampVec, 1);
#endif
}

// Four-channel delay entry point.
//
// In this header the function is a placeholder: it reads none of its
// arguments, writes nothing through ppOutData, and its only statement is an
// assertion that is given a constant false condition and the message
// "Not implemented.".
// NOTE(review): what a failed NN_AUDIO_DSP_ASSERT does depends on the macro
// definition, which is not in this file.
NN_FORCEINLINE void ApplyDelay4ch(
    const DelayParameter* delay,
    DelayState* state,
    int32_t **ppInData,
    int32_t **ppOutData,
    int sampleCount)
{
    // Placeholder body: always trips the assertion.
    NN_AUDIO_DSP_ASSERT(false, "Not implemented.");
}

// Six-channel delay entry point.
//
// In this header the function is a placeholder: it reads none of its
// arguments, writes nothing through ppOutData, and its only statement is an
// assertion that is given a constant false condition and the message
// "Not implemented.".
//
// The definition lives in a header, so it is declared inline: a non-inline
// function defined here would produce one external definition per translation
// unit that includes this file, which violates the one-definition rule and
// fails at link time with duplicate symbols. Plain `inline` is used rather
// than NN_FORCEINLINE so that the compiler's inlining decision for this
// function is left unchanged.
inline void ApplyDelay6ch(const DelayParameter* delay, DelayState* state, int32_t **ppInData, int32_t **ppOutData, int sampleCount)
{
    NN_AUDIO_DSP_ASSERT(false, "Not implemented.");
}

// Bypass entry point for the delay effect.
//
// In this header the function is a placeholder: it reads none of its
// arguments, copies nothing from ppInData to ppOutData, and its only statement
// is an assertion that is given a constant false condition and the message
// "Not implemented.".
// NOTE(review): what a failed NN_AUDIO_DSP_ASSERT does depends on the macro
// definition, which is not in this file.
NN_FORCEINLINE void ApplyDelayBypass(
    int32_t **ppInData,
    int32_t **ppOutData,
    int channelCount,
    int sampleCount)
{
    // Placeholder body: always trips the assertion.
    NN_AUDIO_DSP_ASSERT(false, "Not implemented.");
}

}}} // namespace nn::audio::detail
