﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

    // Everything below is emitted into the code section.
    .section    ".text"
    // Entry points made visible to the linker (.global). All of them follow the
    // register convention described in the comment block after this list.
    .global     interpolating1Ch16TapFilter
    .global     interpolating2Ch16TapFilter
    .global     interpolating6Ch16TapFilter
    .global     interpolating1Ch32TapFilter
    .global     interpolating2Ch32TapFilter
    .global     interpolating6Ch32TapFilter
    .global     decimating1Ch16TapFilter
    .global     decimating2Ch16TapFilter
    .global     decimating6Ch16TapFilter
    .global     decimating1Ch32TapFilter
    .global     decimating2Ch32TapFilter
    .global     decimating6Ch32TapFilter

// All filters use this convention:
// On entry:
// X0 ->    struct _resampler_state
// X1 ->    const int16_t* input
// X2 ->    int16_t* output
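// X3       number of input samples, counted per channel in the multi-channel filters (only W3, the low 32 bits, is examined)
// On return:
// X0       number of output samples produced, counted per channel in the multi-channel filters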

    .align      6
    .type       interpolating1Ch32TapFilter, %function

// interpolating1Ch32TapFilter -- mono, 32-tap interpolating FIR filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input
//        X2 -> int16_t* output
//        W3 =  number of input samples
// Out:   X0 =  number of output samples written
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15,
//        V16-V19, V24-V28, V30, flags.  No stack, no callee-saved registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read only; never written here)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 64 bytes
//   #24 ptr  pDelayLines; pDelayLines[0] -> 32 x int16 delay line for channel 0
//
// Every pass through loop 1 emits exactly one output sample:
//   * the kernel for the current interpolationIndex is loaded and the index
//     advances, wrapping to 0 once it reaches interpolationFactor;
//   * if stuffIndex < stuffCount (unsigned) one input sample is consumed and
//     shifted into the delay line, otherwise the delay line is reused as is;
//   * the 32-term dot product is rounded, shifted right by 15 and saturated
//     to 16 bits before being stored;
//   * phase += ddaTerm; if phase >= interpolationFactor (unsigned compare) the
//     factor is subtracted and stuffCount becomes 1, otherwise it becomes 0.
// The loop ends when W3 reaches zero; the state is then written back so that
// the next call continues where this one stopped.
//
// A call with W3 == 0 writes the unchanged state back and returns 0 without
// touching the output buffer.
interpolating1Ch32TapFilter:
    prfm        pldl1strm, [x1]
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    sxtw        x7, w7
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    mov         x14, #0             // X14 --> return value, number of output samples produced
    ld1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11] // v16, v17, v18, v19: channel 0 data
    cbz         w3, 3f              // no input: skip the MAC/store path, just write state back and return 0
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    add         x13, x12, x7, lsl #6 // X13 --> kernel of the current phase (64 bytes per phase)
1:
    ld1         {v24.8h, v25.8h, v26.8h, v27.8h}, [x13] // v24-v27: filter kernel for current phase
    add         w7, w7, #1
    sub         v28.4s, v28.4s, v28.4s  // clear the accumulator
    cmp         w7, w4
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    cmp         w5, w6
    add         x13, x12, x7, lsl #6 // X13 --> kernel of the next phase
    b.hs        2f                  // stuffIndex >= stuffCount: no new input sample this time
    ld1         {v30.h}[0], [x1], #2
    sub         w3, w3, #1
    nop
    ext         v16.16b, v16.16b, v17.16b, #2   // shift the delay line by one sample,
    ext         v17.16b, v17.16b, v18.16b, #2   // appending the new sample at the end
    ext         v18.16b, v18.16b, v19.16b, #2
    ext         v19.16b, v19.16b, v30.16b, #2
    prfm        pldl1strm, [x1, #64]
2:
    smlal       v28.4s, v16.4h, v24.4h
    prfm        pldl1keep, [x13, #192]
    smlal2      v28.4s, v16.8h, v24.8h
    prfm        pstl1strm, [x2, #64]
    smlal       v28.4s, v17.4h, v25.4h
    add         w9, w9, w8          // phase += ddaTerm
    smlal2      v28.4s, v17.8h, v25.8h
    mov         w15, #1
    smlal       v28.4s, v18.4h, v26.4h
    cmp         w9, w4              // flags consumed by both csel instructions below
    smlal2      v28.4s, v18.8h, v26.8h
    sub         w11, w9, w4
    smlal       v28.4s, v19.4h, v27.4h
    csel        w6, w15, wzr, hs    // stuffCount = (phase >= interpolationFactor) ? 1 : 0
    smlal2      v28.4s, v19.8h, v27.8h
    csel        w9, w11, w9, hs     // ... and in that case phase -= interpolationFactor
    addv        s28, v28.4s         // horizontal sum of the four partial sums
    add         x14, x14, #1
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    st1         {v28.h}[0], [x2], #2
    cbnz        w3, 1b
3:
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    mov         x0, x14
    st1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11]
    ret

    .size       interpolating1Ch32TapFilter, [.-interpolating1Ch32TapFilter]

    .align      6
    .type       decimating1Ch32TapFilter, %function

// decimating1Ch32TapFilter -- mono, 32-tap decimating FIR filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input
//        X2 -> int16_t* output
//        W3 =  number of input samples
// Out:   X0 =  number of output samples written (0 when W3 == 0; in that case
//              nothing is loaded or stored)
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15, W16,
//        V16-V19, V24-V28, V30, flags.  No stack, no callee-saved registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read / written back)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #6  u8   minimum stuff count        (read)
//   #7  u8   maximum stuff count        (read)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 64 bytes
//   #24 ptr  pDelayLines; pDelayLines[0] -> 32 x int16 delay line for channel 0
//
// Loop 2 shifts input samples into the delay line one at a time, counting them
// in stuffIndex.  When stuffIndex is no longer below stuffCount one output
// sample is computed (32-term dot product, rounded, >> 15, saturated to 16
// bits), stuffIndex is reset, interpolationIndex advances (wrapping to 0 at
// interpolationFactor) and phase += ddaTerm.  If that signed sum is negative
// (LT after ADDS) interpolationFactor is added back and stuffCount is set to
// the value from offset #6, otherwise to the value from offset #7.
// If the input runs out while stuffIndex != stuffCount the routine leaves via
// label 3 without producing an output; the state written back lets the next
// call continue from there.
decimating1Ch32TapFilter:
    mov         x14, #0             // X14 --> return value, number of output samples produced
    cbz         w3, 4f
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldrb        w15, [x0, #6]       // W15 --> minimum stuff count
    ldrb        w16, [x0, #7]       // W16 --> maximum stuff count
    ld1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11] // v16, v17, v18, v19: channel 0 data
    add         x13, x12, x7, lsl #6    // X13 --> kernel of the current phase (64 bytes per phase)
1:
    ld1         {v24.8h, v25.8h, v26.8h, v27.8h}, [x13] // v24-v27: filter kernel for current phase
2:
    ld1         {v30.h}[0], [x1], #2
    subs        w3, w3, #1
    add         w5, w5, #1
    ccmp        w5, w6, #0x4, eq        // if W3 == 0: sets w5 - w6; if W3 != 0, sets (nZcv) EQ
    ext         v16.16b, v16.16b, v17.16b, #2   // shift the delay line by one sample,
    ext         v17.16b, v17.16b, v18.16b, #2   // appending the new sample at the end
    ext         v18.16b, v18.16b, v19.16b, #2   // (EXT leaves the flags from CCMP intact)
    ext         v19.16b, v19.16b, v30.16b, #2
    b.ne        3f                  // if conditional compare was applied, and W5 != W6, then save state and exit
    cmp         w5, w6              // need to fetch more source samples?
    b.lo        2b                  // yes

    sub         v28.4s, v28.4s, v28.4s  // clear the accumulator
    add         w7, w7, #1
    mov         w5, #0              // reset stuff index
    smlal       v28.4s, v16.4h, v24.4h
    cmp         w7, w4
    prfm        pldl1strm, [x1, #64]
    smlal2      v28.4s, v16.8h, v24.8h
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    smlal       v28.4s, v17.4h, v25.4h
    prfm        pstl1strm, [x2, #64]
    add         x13, x12, x7, lsl #6    // X13 --> kernel of the next phase
    smlal2      v28.4s, v17.8h, v25.8h
    prfm        pldl1keep, [x13, #192]
    adds        w9, w9, w8          // phase += ddaTerm; flags consumed by both csel instructions below
    smlal       v28.4s, v18.4h, v26.4h
    add         w11, w9, w4
    smlal2      v28.4s, v18.8h, v26.8h
    csel        w6, w15, w16, lt    // next stuffCount: W15 if the sum is negative, else W16
    smlal       v28.4s, v19.4h, v27.4h
    csel        w9, w11, w9, lt     // negative sum: phase += interpolationFactor
    smlal2      v28.4s, v19.8h, v27.8h
    add         x14, x14, #1
    addv        s28, v28.4s         // horizontal sum of the four partial sums
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    st1         {v28.h}[0], [x2], #2
    cbnz        w3, 1b
3:
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w5, [x0, #2]        // write back stuffIndex
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    st1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11]
4:
    mov         x0, x14
    ret

    .size       decimating1Ch32TapFilter, [.-decimating1Ch32TapFilter]

    .align      6
    .type       interpolating1Ch16TapFilter, %function

// interpolating1Ch16TapFilter -- mono, 16-tap interpolating FIR filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input
//        X2 -> int16_t* output
//        W3 =  number of input samples
// Out:   X0 =  number of output samples written
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15,
//        V16, V17, V24, V25, V28, V30, flags.  No stack, no callee-saved
//        registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read only; never written here)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 32 bytes
//   #24 ptr  pDelayLines; pDelayLines[0] -> 16 x int16 delay line for channel 0
//
// Every pass through loop 1 emits exactly one output sample:
//   * the kernel for the current interpolationIndex is loaded and the index
//     advances, wrapping to 0 once it reaches interpolationFactor;
//   * if stuffIndex < stuffCount (unsigned) one input sample is consumed and
//     shifted into the delay line, otherwise the delay line is reused as is;
//   * the 16-term dot product is rounded, shifted right by 15 and saturated
//     to 16 bits before being stored;
//   * phase += ddaTerm; if phase >= interpolationFactor (unsigned compare) the
//     factor is subtracted and stuffCount becomes 1, otherwise it becomes 0.
// The loop ends when W3 reaches zero; the state is then written back so that
// the next call continues where this one stopped.
//
// A call with W3 == 0 writes the unchanged state back and returns 0 without
// touching the output buffer.
interpolating1Ch16TapFilter:
    prfm        pldl1strm, [x1]
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    sxtw        x7, w7
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    mov         x14, #0             // X14 --> return value, number of output samples produced
    ld1         {v16.8h, v17.8h}, [x11] // v16, v17: channel 0 data
    cbz         w3, 3f              // no input: skip the MAC/store path, just write state back and return 0
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    add         x13, x12, x7, lsl #5 // X13 --> kernel of the current phase (32 bytes per phase)
1:
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for current phase
    add         w7, w7, #1
    sub         v28.4s, v28.4s, v28.4s  // clear the accumulator
    cmp         w7, w4
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    cmp         w5, w6
    b.hs        2f                  // stuffIndex >= stuffCount: no new input sample this time
    ld1         {v30.h}[0], [x1], #2
    sub         w3, w3, #1
    ext         v16.16b, v16.16b, v17.16b, #2   // shift the delay line by one sample,
    ext         v17.16b, v17.16b, v30.16b, #2   // appending the new sample at the end
    prfm        pldl1strm, [x1, #64]
2:
    add         x13, x12, x7, lsl #5 // X13 --> kernel of the next phase
    smlal       v28.4s, v16.4h, v24.4h
    prfm        pstl1strm, [x2, #64]
    smlal2      v28.4s, v16.8h, v24.8h
    prfm        pldl1keep, [x13, #96]
    smlal       v28.4s, v17.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    add         w9, w9, w8          // phase += ddaTerm
    mov         w15, #1
    cmp         w9, w4              // flags consumed by both csel instructions below
    sub         w11, w9, w4
    addv        s28, v28.4s         // horizontal sum of the four partial sums
    csel        w6, w15, wzr, hs    // stuffCount = (phase >= interpolationFactor) ? 1 : 0
    csel        w9, w11, w9, hs     // ... and in that case phase -= interpolationFactor
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    add         x14, x14, #1
    st1         {v28.h}[0], [x2], #2
    cbnz        w3, 1b
3:
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    mov         x0, x14
    st1         {v16.8h, v17.8h}, [x11]
    ret

    .size       interpolating1Ch16TapFilter, [.-interpolating1Ch16TapFilter]

    .align      6
    .type       decimating1Ch16TapFilter, %function

// decimating1Ch16TapFilter -- mono, 16-tap decimating FIR filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input
//        X2 -> int16_t* output
//        W3 =  number of input samples
// Out:   X0 =  number of output samples written (0 when W3 == 0; in that case
//              nothing is loaded or stored)
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15, W16,
//        V16, V17, V24, V25, V28, V30, flags.  No stack, no callee-saved
//        registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read / written back)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #6  u8   minimum stuff count        (read)
//   #7  u8   maximum stuff count        (read)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 32 bytes
//   #24 ptr  pDelayLines; pDelayLines[0] -> 16 x int16 delay line for channel 0
//
// Loop 2 shifts input samples into the delay line one at a time, counting them
// in stuffIndex.  When stuffIndex is no longer below stuffCount one output
// sample is computed (16-term dot product, rounded, >> 15, saturated to 16
// bits), stuffIndex is reset, interpolationIndex advances (wrapping to 0 at
// interpolationFactor) and phase += ddaTerm.  If that signed sum is negative
// (LT after ADDS) interpolationFactor is added back and stuffCount is set to
// the value from offset #6, otherwise to the value from offset #7.
// If the input runs out while stuffIndex != stuffCount the routine leaves via
// label 3 without producing an output; the state written back lets the next
// call continue from there.
decimating1Ch16TapFilter:
    mov         x14, #0             // X14 --> return value, number of output samples produced
    cbz         w3, 4f
    ldp         x12, x10, [x0, #16] // X12 --> ptr to kernel ptrs, indexed by interpolationIndex,X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldrb        w15, [x0, #6]       // W15 --> minimum stuff count
    ldrb        w16, [x0, #7]       // W16 --> maximum stuff count
    sxtw        x7, w7
    ld1         {v16.8h, v17.8h}, [x11] // v16, v17: channel 0 data
    add         x13, x12, x7, lsl #5    // X13 --> kernel of the current phase (32 bytes per phase)
1:
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for current phase
    sub         v28.4s, v28.4s, v28.4s  // clear the accumulator once per output sample
2:
    ld1         {v30.h}[0], [x1], #2
    subs        w3, w3, #1
    add         w5, w5, #1
    ccmp        w5, w6, #0x4, eq        // if W3 == 0: sets w5 - w6; if W3 != 0, sets (nZcv) EQ
    ext         v16.16b, v16.16b, v17.16b, #2   // shift the delay line by one sample, appending the new one
    ext         v17.16b, v17.16b, v30.16b, #2   // (EXT leaves the flags from CCMP intact)
    b.ne        3f                  // if conditional compare was applied, and W5 != W6, then save state and exit
    cmp         w5, w6              // need to fetch more source samples?
    b.lo        2b                  // yes

    mov         w5, #0              // reset stuff index
    add         w7, w7, #1
    smlal       v28.4s, v16.4h, v24.4h
    prfm        pldl1strm, [x1, #64]
    cmp         w7, w4
    smlal2      v28.4s, v16.8h, v24.8h
    prfm        pstl1strm, [x2, #64]
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    smlal       v28.4s, v17.4h, v25.4h
    add         x13, x12, x7, lsl #5    // X13 --> kernel of the next phase
    smlal2      v28.4s, v17.8h, v25.8h
    prfm        pldl1keep, [x13, #96]
    adds        w9, w9, w8          // phase += ddaTerm; flags consumed by both csel instructions below
    add         w11, w9, w4
    addv        s28, v28.4s         // horizontal sum of the four partial sums
    csel        w6, w15, w16, lt    // next stuffCount: W15 if the sum is negative, else W16
    csel        w9, w11, w9, lt     // negative sum: phase += interpolationFactor
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    add         x14, x14, #1
    st1         {v28.h}[0], [x2], #2
    cbnz        w3, 1b
3:
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w5, [x0, #2]        // write back stuffIndex
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    st1         {v16.8h, v17.8h}, [x11]
4:
    mov         x0, x14
    ret

    .size       decimating1Ch16TapFilter, [.-decimating1Ch16TapFilter]

    .align      6
    .type       interpolating2Ch16TapFilter, %function

// interpolating2Ch16TapFilter -- stereo (interleaved), 16-tap interpolating
// FIR filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input  (two interleaved samples read per step)
//        X2 -> int16_t* output       (two interleaved samples written per step)
//        W3 =  number of input steps; decremented once per sample pair read
// Out:   X0 =  number of output steps; incremented once per sample pair written
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15,
//        V16, V17, V20, V21, V24, V25, V28-V31, flags.  No stack, no
//        callee-saved registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read only; never written here)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 32 bytes
//   #24 ptr  pDelayLines; pDelayLines[0] and pDelayLines[1] each point to a
//            16 x int16 delay line (channel 0 and channel 1)
//
// Every pass through loop 1 emits one output pair.  If stuffIndex < stuffCount
// (unsigned) one input pair is consumed and shifted into the two delay lines.
// Both channels are filtered with the same kernel; the kernel for the next
// phase is loaded inside the loop once the multiply-accumulates are issued.
// phase += ddaTerm; if phase >= interpolationFactor (unsigned compare) the
// factor is subtracted and stuffCount becomes 1, otherwise it becomes 0.
// With W3 == 0 the routine writes the unchanged state back and returns 0.
interpolating2Ch16TapFilter:
    ldp         x12, x10, [x0, #16] // X12 --> ptr to kernel ptrs, indexed by interpolationIndex,X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x12, [x12]          // X12 --> filter kernel, phase 0
    ldp         x11, x13, [x10]     // X11 --> delay line channel 0, initial vector location, X13 --> delay line channel 1, initial vector location
    sxtw        x7, w7
    ld1         {v16.8h, v17.8h}, [x11] // v16, v17: channel 0 data
    ld1         {v20.8h, v21.8h}, [x13] // v20, v21: channel 1 data
    mov         x14, #0             // X14 --> return value, number of output pairs produced
    add         x13, x12, x7, lsl #5    // X13 --> kernel of the current phase (32 bytes per phase)
    cbz         w3, 3f              // no input: only write the state back
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for current phase
1:
    cmp         w5, w6
    sub         v28.4s, v28.4s, v28.4s  // clear channel 0 accumulator
    sub         v29.4s, v29.4s, v29.4s  // clear channel 1 accumulator
    b.hs        2f                  // stuffIndex >= stuffCount: no new input pair this time
    ld2         {v30.h, v31.h}[0], [x1], #4 // de-interleave: v30 = channel 0 sample, v31 = channel 1 sample
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    sub         w3, w3, #1
    ext         v17.16b, v17.16b, v30.16b, #2
    ext         v21.16b, v21.16b, v31.16b, #2
    prfm        pldl1keep, [x1, #8]
2:
    smlal       v28.4s, v16.4h, v24.4h
    add         w7, w7, #1
    smlal       v29.4s, v20.4h, v24.4h
    cmp         w7, w4
    smlal2      v28.4s, v16.8h, v24.8h
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    smlal2      v29.4s, v20.8h, v24.8h
    add         x13, x12, x7, lsl #5    // X13 --> kernel of the next phase
    smlal       v28.4s, v17.4h, v25.4h
    add         w9, w9, w8          // phase += ddaTerm
    smlal       v29.4s, v21.4h, v25.4h
    prfm        pstl1strm, [x2, #8]
    smlal2      v28.4s, v17.8h, v25.8h
    mov         w15, #1
    smlal2      v29.4s, v21.8h, v25.8h
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for the next pass
    cmp         w9, w4              // flags consumed by both csel instructions below
    addv        s28, v28.4s
    sub         w11, w9, w4
    addv        s29, v29.4s
    prfm        pldl1keep, [x13, #96]
    csel        w6, w15, wzr, hs    // stuffCount = (phase >= interpolationFactor) ? 1 : 0
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    csel        w9, w11, w9, hs     // ... and in that case phase -= interpolationFactor
    sqrshrn     h29, s29, #15
    add         x14, x14, #1
    st2         {v28.h, v29.h}[0], [x2], #4 // interleave and store one output pair
    cbnz        w3, 1b
3:
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldr         x13, [x10, #8]      // X13 --> delay line channel 1, initial vector location
    strh        w9, [x0, #10]       // write back phase
    mov         x0, x14
    st1         {v16.8h, v17.8h}, [x11]
    st1         {v20.8h, v21.8h}, [x13]
    ret

    .size       interpolating2Ch16TapFilter, [.-interpolating2Ch16TapFilter]

    .align      6
    .type       decimating2Ch16TapFilter, %function

// decimating2Ch16TapFilter -- stereo (interleaved), 16-tap decimating FIR
// filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input  (two interleaved samples read per step)
//        X2 -> int16_t* output       (two interleaved samples written per step)
//        W3 =  number of input steps; decremented once per sample pair read
// Out:   X0 =  number of output steps; incremented once per sample pair
//              written (0 when W3 == 0; in that case nothing is loaded or
//              stored)
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15, W16,
//        V16, V17, V20, V21, V24, V25, V28-V31, flags.  No stack, no
//        callee-saved registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read / written back)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #6  u8   minimum stuff count        (read)
//   #7  u8   maximum stuff count        (read)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 32 bytes
//   #24 ptr  pDelayLines; pDelayLines[0] and pDelayLines[1] each point to a
//            16 x int16 delay line (channel 0 and channel 1)
//
// Loop 2 shifts input pairs into the two delay lines, counting them in
// stuffIndex.  When stuffIndex is no longer below stuffCount one output pair
// is computed with the same kernel for both channels, stuffIndex is reset,
// interpolationIndex advances (wrapping to 0 at interpolationFactor) and
// phase += ddaTerm.  If that signed sum is negative (LT after ADDS)
// interpolationFactor is added back and stuffCount is set to the value from
// offset #6, otherwise to the value from offset #7.
// If the input runs out while stuffIndex != stuffCount the routine leaves via
// label 3 without producing an output; the state written back lets the next
// call continue from there.
decimating2Ch16TapFilter:
    mov         x14, #0             // X14 --> return value, number of output samples produced
    cbz         w3, 4f
    ldp         x12, x10, [x0, #16] // X12 --> ptr to kernel ptrs, indexed by interpolationIndex,X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    prfm        pldl1keep, [x1]
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    nop
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldp         x11, x13, [x10]     // X11 --> delay line channel 0, initial vector location, X13 --> delay line channel 1, initial vector location
    ldr         x12, [x12]          // X12 --> filter kernel, phase 0
    sxtw        x7, w7
    ldrb        w15, [x0, #6]       // W15 --> minimum stuff count
    ldrb        w16, [x0, #7]       // W16 --> maximum stuff count
    ld1         {v16.8h, v17.8h}, [x11] // v16, v17: channel 0 data
    ld1         {v20.8h, v21.8h}, [x13] // v20, v21: channel 1 data
    prfm        pldl1keep, [x12]
    add         x13, x12, x7, lsl #5    // X13 --> kernel of the current phase (32 bytes per phase)
1:
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for current phase
    sub         v28.4s, v28.4s, v28.4s  // clear channel 0 accumulator
    sub         v29.4s, v29.4s, v29.4s  // clear channel 1 accumulator
    prfm        pldl1keep, [x13, #96]
2:
    ld2         {v30.h, v31.h}[0], [x1], #4 // de-interleave: v30 = channel 0 sample, v31 = channel 1 sample
    subs        w3, w3, #1
    add         w5, w5, #1
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    ccmp        w5, w6, #0x4, eq        // if W3 == 0: sets w5 - w6; if W3 != 0, sets (nZcv) EQ
    ext         v17.16b, v17.16b, v30.16b, #2   // (EXT leaves the flags from SUBS/CCMP intact)
    ext         v21.16b, v21.16b, v31.16b, #2
    b.ne        3f                  // if conditional compare was applied, and W5 != W6, then save state and exit
    cmp         w5, w6              // need to fetch more source samples?
    b.lo        2b                  // yes
    mov         w5, #0              // reset stuff index

    smlal       v28.4s, v16.4h, v24.4h
    prfm        pldl1keep, [x1, #16]
    smlal       v29.4s, v20.4h, v24.4h
    add         w7, w7, #1
    smlal2      v28.4s, v16.8h, v24.8h
    cmp         w7, w4
    smlal2      v29.4s, v20.8h, v24.8h
    smlal       v28.4s, v17.4h, v25.4h
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    smlal       v29.4s, v21.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    add         x13, x12, x7, lsl #5    // X13 --> kernel of the next phase
    smlal2      v29.4s, v21.8h, v25.8h
    prfm        pstl1strm, [x2, #8]
    adds        w9, w9, w8          // phase += ddaTerm; flags consumed by both csel instructions below
    addv        s28, v28.4s
    add         w11, w9, w4
    addv        s29, v29.4s
    csel        w6, w15, w16, lt    // next stuffCount: W15 if the sum is negative, else W16
    csel        w9, w11, w9, lt     // negative sum: phase += interpolationFactor
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    sqrshrn     h29, s29, #15
    add         x14, x14, #1
    st2         {v28.h, v29.h}[0], [x2], #4 // interleave and store one output pair
    cbnz        w3, 1b
3:
    strb        w5, [x0, #2]        // write back stuffIndex
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldr         x13, [x10, #8]      // X13 --> delay line channel 1, initial vector location
    strh        w9, [x0, #10]       // write back phase
    st1         {v16.8h, v17.8h}, [x11]
    st1         {v20.8h, v21.8h}, [x13]
4:
    mov         x0, x14
    ret

    .size       decimating2Ch16TapFilter, [.-decimating2Ch16TapFilter]

    .align      6
    .type       interpolating2Ch32TapFilter, %function

// interpolating2Ch32TapFilter -- stereo (interleaved), 32-tap interpolating
// FIR filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input  (two interleaved samples read per step)
//        X2 -> int16_t* output       (two interleaved samples written per step)
//        W3 =  number of input steps; decremented once per sample pair read
// Out:   X0 =  number of output steps; incremented once per sample pair written
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15,
//        V16-V31, flags.  No stack, no callee-saved registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read only; never written here)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 64 bytes
//   #24 ptr  pDelayLines; only pDelayLines[0] is fetched.  Channel 1's delay
//            line is accessed in the 64 bytes that directly follow channel 0's.
//            NOTE(review): the 16-tap stereo filters fetch a separate pointer
//            per channel -- confirm the delay lines are allocated contiguously.
//
// Every pass through loop 1 emits one output pair.  If stuffIndex < stuffCount
// (unsigned) one input pair is consumed and shifted into the two delay lines.
// Both channels are filtered with the same kernel.  phase += ddaTerm; if
// phase >= interpolationFactor (unsigned compare) the factor is subtracted and
// stuffCount becomes 1, otherwise it becomes 0.
//
// A call with W3 == 0 writes the unchanged state back and returns 0 without
// touching the output buffer.
interpolating2Ch32TapFilter:
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    sxtw        x7, w7
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    mov         x14, #0             // X14 --> return value, number of output pairs produced
    ld1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64 // v16, v17, v18, v19: channel 0 data
    ld1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11] // v20, v21, v22, v23: channel 1 data
    cbz         w3, 3f              // no input: skip the MAC/store path, just write state back and return 0
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    add         x13, x12, x7, lsl #6 // X13 --> kernel of the current phase (64 bytes per phase)
1:
    ld1         {v24.8h, v25.8h, v26.8h, v27.8h}, [x13] // v24-v27: filter kernel for current phase
    add         w7, w7, #1
    sub         v28.4s, v28.4s, v28.4s  // clear channel 0 accumulator
    cmp         w7, w4
    sub         v29.4s, v29.4s, v29.4s  // clear channel 1 accumulator
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    cmp         w5, w6
    add         x13, x12, x7, lsl #6 // X13 --> kernel of the next phase
    b.hs        2f                  // stuffIndex >= stuffCount: no new input pair this time
    ld2         {v30.h, v31.h}[0], [x1], #4 // de-interleave: v30 = channel 0 sample, v31 = channel 1 sample
    sub         w3, w3, #1
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    ext         v17.16b, v17.16b, v18.16b, #2
    ext         v21.16b, v21.16b, v22.16b, #2
    ext         v18.16b, v18.16b, v19.16b, #2
    ext         v22.16b, v22.16b, v23.16b, #2
    ext         v19.16b, v19.16b, v30.16b, #2
    ext         v23.16b, v23.16b, v31.16b, #2
    prfm        pldl1strm, [x1, #64]
2:
    smlal       v28.4s, v16.4h, v24.4h
    smlal       v29.4s, v20.4h, v24.4h
    prfm        pldl1keep, [x13, #192]
    smlal2      v28.4s, v16.8h, v24.8h
    smlal2      v29.4s, v20.8h, v24.8h
    prfm        pstl1strm, [x2, #64]
    smlal       v28.4s, v17.4h, v25.4h
    smlal       v29.4s, v21.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    smlal2      v29.4s, v21.8h, v25.8h
    smlal       v28.4s, v18.4h, v26.4h
    smlal       v29.4s, v22.4h, v26.4h
    smlal2      v28.4s, v18.8h, v26.8h
    smlal2      v29.4s, v22.8h, v26.8h
    smlal       v28.4s, v19.4h, v27.4h
    smlal       v29.4s, v23.4h, v27.4h
    smlal2      v28.4s, v19.8h, v27.8h
    smlal2      v29.4s, v23.8h, v27.8h
    add         w9, w9, w8          // phase += ddaTerm
    mov         w15, #1
    cmp         w9, w4              // flags consumed by both csel instructions below
    sub         w11, w9, w4
    addv        s28, v28.4s
    addv        s29, v29.4s
    csel        w6, w15, wzr, hs    // stuffCount = (phase >= interpolationFactor) ? 1 : 0
    csel        w9, w11, w9, hs     // ... and in that case phase -= interpolationFactor
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    sqrshrn     h29, s29, #15
    add         x14, x14, #1
    st2         {v28.h, v29.h}[0], [x2], #4 // interleave and store one output pair
    cbnz        w3, 1b
3:
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    mov         x0, x14
    st1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64
    st1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11]
    ret

    .size       interpolating2Ch32TapFilter, [.-interpolating2Ch32TapFilter]

    .align      6
    .type       decimating2Ch32TapFilter, %function

// decimating2Ch32TapFilter -- stereo (interleaved), 32-tap decimating FIR
// filter.
//
// In:    X0 -> struct _resampler_state
//        X1 -> const int16_t* input  (two interleaved samples read per step)
//        X2 -> int16_t* output       (two interleaved samples written per step)
//        W3 =  number of input steps; decremented once per sample pair read
// Out:   X0 =  number of output steps; incremented once per sample pair
//              written (0 when W3 == 0; in that case only the initial prefetch
//              of [X1] is issued)
// Uses:  X1-X3 (advanced / counted down), W4-W9, X10-X14, W15, W16,
//        V16-V31, flags.  No stack, no callee-saved registers.
//
// State fields accessed through X0 (byte offsets as used by the code):
//   #0  u8   interpolationFactor        (read)
//   #2  u8   stuffIndex                 (read / written back)
//   #3  u8   stuffCount                 (read / written back)
//   #5  u8   interpolationIndex         (read / written back)
//   #6  u8   minimum stuff count        (read)
//   #7  u8   maximum stuff count        (read)
//   #8  s16  ddaTerm                    (read)
//   #10 s16  phase                      (read / written back)
//   #16 ptr  -> pointer to the kernel for phase 0; each phase occupies 64 bytes
//   #24 ptr  pDelayLines; only pDelayLines[0] is fetched.  Channel 1's delay
//            line is accessed in the 64 bytes that directly follow channel 0's.
//            NOTE(review): the 16-tap stereo filters fetch a separate pointer
//            per channel -- confirm the delay lines are allocated contiguously.
//
// Loop 2 shifts input pairs into the two delay lines, counting them in
// stuffIndex.  When stuffIndex is no longer below stuffCount one output pair
// is computed with the same kernel for both channels, stuffIndex is reset,
// interpolationIndex advances (wrapping to 0 at interpolationFactor) and
// phase += ddaTerm.  If that signed sum is negative (LT after ADDS)
// interpolationFactor is added back and stuffCount is set to the value from
// offset #6, otherwise to the value from offset #7.
// If the input runs out while stuffIndex != stuffCount the routine leaves via
// label 3 without producing an output; the state written back lets the next
// call continue from there.
decimating2Ch32TapFilter:
    prfm        pldl1strm, [x1]
    mov         x14, #0             // X14 --> return value, number of output samples produced
    cbz         w3, 4f
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    nop
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    nop
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldrb        w15, [x0, #6]       // W15 --> minimum stuff count
    ldrb        w16, [x0, #7]       // W16 --> maximum stuff count
    ld1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64    // v16, v17, v18, v19: channel 0 data
    add         x13, x12, x7, lsl #6    // X13 --> kernel of the current phase (64 bytes per phase)
    ld1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11]         // v20, v21, v22, v23: channel 1 data
1:
    sub         v28.4s, v28.4s, v28.4s  // clear channel 0 accumulator
    ld1         {v24.8h, v25.8h, v26.8h, v27.8h}, [x13] // v24-v27: filter kernel for current phase
    sub         v29.4s, v29.4s, v29.4s  // clear channel 1 accumulator
    nop
2:
    ld2         {v30.h, v31.h}[0], [x1], #4 // de-interleave: v30 = channel 0 sample, v31 = channel 1 sample
    subs        w3, w3, #1
    add         w5, w5, #1
    ccmp        w5, w6, #0x4, eq        // if W3 == 0: sets w5 - w6; if W3 != 0, sets (nZcv) EQ
    ext         v16.16b, v16.16b, v17.16b, #2   // shift both delay lines by one sample,
    ext         v20.16b, v20.16b, v21.16b, #2   // appending the new samples at the end
    ext         v17.16b, v17.16b, v18.16b, #2   // (EXT leaves the flags from CCMP intact)
    ext         v21.16b, v21.16b, v22.16b, #2
    ext         v18.16b, v18.16b, v19.16b, #2
    ext         v22.16b, v22.16b, v23.16b, #2
    ext         v19.16b, v19.16b, v30.16b, #2
    ext         v23.16b, v23.16b, v31.16b, #2
    b.ne        3f                  // if conditional compare was applied, and W5 != W6, then save state and exit
    cmp         w5, w6              // need to fetch more source samples?
    b.lo        2b                  // yes

    smlal       v28.4s, v16.4h, v24.4h
    mov         w5, #0              // reset stuff index
    smlal       v29.4s, v20.4h, v24.4h
    prfm        pldl1strm, [x1, #64]
    smlal2      v28.4s, v16.8h, v24.8h
    smlal2      v29.4s, v20.8h, v24.8h
    add         w7, w7, #1
    prfm        pstl1strm, [x2, #64]
    smlal       v28.4s, v17.4h, v25.4h
    cmp         w7, w4
    smlal       v29.4s, v21.4h, v25.4h
    csel        w7, wzr, w7, hs     // wrap interpolationIndex to 0 at interpolationFactor
    smlal2      v28.4s, v17.8h, v25.8h
    add         x13, x12, x7, lsl #6    // X13 --> kernel of the next phase
    smlal2      v29.4s, v21.8h, v25.8h
    prfm        pldl1keep, [x13, #128]
    smlal       v28.4s, v18.4h, v26.4h
    smlal       v29.4s, v22.4h, v26.4h
    smlal2      v28.4s, v18.8h, v26.8h
    smlal2      v29.4s, v22.8h, v26.8h
    smlal       v28.4s, v19.4h, v27.4h
    smlal       v29.4s, v23.4h, v27.4h
    smlal2      v28.4s, v19.8h, v27.8h
    smlal2      v29.4s, v23.8h, v27.8h
    adds        w9, w9, w8          // phase += ddaTerm; flags consumed by both csel instructions below
    add         w11, w9, w4
    addv        s28, v28.4s
    addv        s29, v29.4s
    csel        w6, w15, w16, lt    // next stuffCount: W15 if the sum is negative, else W16
    csel        w9, w11, w9, lt     // negative sum: phase += interpolationFactor
    sqrshrn     h28, s28, #15       // round, >> 15, saturate to int16
    sqrshrn     h29, s29, #15
    add         x14, x14, #1
    st2         {v28.h, v29.h}[0], [x2], #4 // interleave and store one output pair
    cbnz        w3, 1b
3:
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w5, [x0, #2]        // write back stuffIndex
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    st1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64
    st1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11]
4:
    mov         x0, x14
    ret

    .size       decimating2Ch32TapFilter, [.-decimating2Ch32TapFilter]

    .align      6
    .type       interpolating6Ch32TapFilter, %function

interpolating6Ch32TapFilter:
    sub         x13, sp, #256
    sub         sp, sp, #256        // make room to save v0 through v15
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
    st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x13], #64
    st1         {v8.8h, v9.8h, v10.8h, v11.8h}, [x13], #64
    st1         {v12.8h, v13.8h, v14.8h, v15.8h}, [x13]
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    sxtw        x7, w7
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    ld1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64 // v16, v17, v18, v19: channel 0 data
    ld1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11], #64 // v20, v21, v22, v23: channel 1 data
    ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64    // v0, v1, v2, v3: channel 2 data
    ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64    // v4, v5, v6, v7: channel 3 data
    ld1         {v8.8h, v9.8h, v10.8h, v11.8h}, [x11], #64  // v8, v9, v10, v11: channel 4 data
    ld1         {v12.8h, v13.8h, v14.8h, v15.8h}, [x11]     // v12, v13, v14, v15: channel 5 data
    cbz         w3, 2f
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    mov         x14, #0
    prfm        pldl1strm, [x1]
    prfm        pstl1keep, [x2]
    add         x13, x12, x7, lsl #6
1:
    ld1         {v24.8h, v25.8h, v26.8h, v27.8h}, [x13] // v24, v25: filter kernel for current phase
    add         w7, w7, #1
    sub         v28.4s, v28.4s, v28.4s
    cmp         w7, w4
    sub         v29.4s, v29.4s, v29.4s
    csel        w7, wzr, w7, hs
    cmp         w5, w6
    add         x13, x12, x7, lsl #6
    b.hs        2f
    ld2         {v30.h, v31.h}[0], [x1], #4
    sub         w3, w3, #1
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    ext         v17.16b, v17.16b, v18.16b, #2
    ext         v21.16b, v21.16b, v22.16b, #2
    ext         v18.16b, v18.16b, v19.16b, #2
    ext         v22.16b, v22.16b, v23.16b, #2
    ext         v19.16b, v19.16b, v30.16b, #2
    ext         v23.16b, v23.16b, v31.16b, #2
    ld2         {v30.h, v31.h}[0], [x1], #4
    ext         v0.16b, v0.16b, v1.16b, #2
    ext         v4.16b, v4.16b, v5.16b, #2
    ext         v1.16b, v1.16b, v2.16b, #2
    ext         v5.16b, v5.16b, v6.16b, #2
    ext         v2.16b, v2.16b, v3.16b, #2
    ext         v6.16b, v6.16b, v7.16b, #2
    ext         v3.16b, v3.16b, v30.16b, #2
    ext         v7.16b, v7.16b, v31.16b, #2
    ld2         {v30.h, v31.h}[0], [x1], #4
    ext         v8.16b, v8.16b, v9.16b, #2
    ext         v12.16b, v12.16b, v13.16b, #2
    ext         v9.16b, v9.16b, v10.16b, #2
    ext         v13.16b, v13.16b, v14.16b, #2
    ext         v10.16b, v10.16b, v11.16b, #2
    ext         v14.16b, v14.16b, v15.16b, #2
    ext         v11.16b, v11.16b, v30.16b, #2
    ext         v15.16b, v15.16b, v31.16b, #2
    prfm        pldl1keep, [x1, #64]
2:
    sub         v30.4s, v30.4s, v30.4s
    sub         v31.4s, v31.4s, v31.4s
    smlal       v28.4s, v16.4h, v24.4h
    smlal       v29.4s, v20.4h, v24.4h
    prfm        pldl1keep, [x13, #128]
    smlal       v30.4s, v0.4h, v24.4h
    smlal       v31.4s, v4.4h, v24.4h
    prfm        pstl1keep, [x2, #64]
    smlal2      v28.4s, v16.8h, v24.8h
    smlal2      v29.4s, v20.8h, v24.8h
    smlal2      v30.4s, v0.8h, v24.8h
    smlal2      v31.4s, v4.8h, v24.8h
    smlal       v28.4s, v17.4h, v25.4h
    smlal       v29.4s, v21.4h, v25.4h
    smlal       v30.4s, v1.4h, v25.4h
    smlal       v31.4s, v5.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    smlal2      v29.4s, v21.8h, v25.8h
    smlal2      v30.4s, v1.8h, v25.8h
    smlal2      v31.4s, v5.8h, v25.8h
    smlal       v28.4s, v18.4h, v26.4h
    smlal       v29.4s, v22.4h, v26.4h
    smlal       v30.4s, v2.4h, v26.4h
    smlal       v31.4s, v6.4h, v26.4h
    smlal2      v28.4s, v18.8h, v26.8h
    smlal2      v29.4s, v22.8h, v26.8h
    smlal2      v30.4s, v2.8h, v26.8h
    smlal2      v31.4s, v6.8h, v26.8h
    smlal       v28.4s, v19.4h, v27.4h
    smlal       v29.4s, v23.4h, v27.4h
    smlal       v30.4s, v3.4h, v27.4h
    smlal       v31.4s, v7.4h, v27.4h
    smlal2      v28.4s, v19.8h, v27.8h
    smlal2      v29.4s, v23.8h, v27.8h
    smlal2      v30.4s, v3.8h, v27.8h
    smlal2      v31.4s, v7.8h, v27.8h
    addv        s28, v28.4s
    addv        s29, v29.4s
    addv        s30, v30.4s
    addv        s31, v31.4s
    sqrshrn     h28, s28, #15
    sqrshrn     h29, s29, #15
    sqrshrn     h30, s30, #15
    sqrshrn     h31, s31, #15
    st2         {v28.h, v29.h}[0], [x2], #4
    sub         v28.4s, v28.4s, v28.4s
    sub         v29.4s, v29.4s, v29.4s
    st2         {v30.h, v31.h}[0], [x2], #4
    smlal       v28.4s, v8.4h, v24.4h
    smlal       v29.4s, v12.4h, v24.4h
    smlal2      v28.4s, v8.8h, v24.8h
    smlal2      v29.4s, v12.8h, v24.8h
    smlal       v28.4s, v9.4h, v25.4h
    smlal       v29.4s, v13.4h, v25.4h
    smlal2      v28.4s, v9.8h, v25.8h
    smlal2      v29.4s, v13.8h, v25.8h
    smlal       v28.4s, v10.4h, v26.4h
    smlal       v29.4s, v14.4h, v26.4h
    smlal2      v28.4s, v10.8h, v26.8h
    smlal2      v29.4s, v14.8h, v26.8h
    smlal       v28.4s, v11.4h, v27.4h
    smlal       v29.4s, v15.4h, v27.4h
    smlal2      v28.4s, v11.8h, v27.8h
    smlal2      v29.4s, v15.8h, v27.8h
    add         w9, w9, w8
    mov         w15, #1
    cmp         w9, w4
    sub         w11, w9, w4
    addv        s28, v28.4s
    addv        s29, v29.4s
    csel        w6, w15, wzr, hs
    csel        w9, w11, w9, hs
    sqrshrn     h28, s28, #15
    sqrshrn     h29, s29, #15
    add         x14, x14, #1
    st2         {v28.h, v29.h}[0], [x2], #4
    cbnz        w3, 1b
3:
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strh        w9, [x0, #10]
    mov         x0, x14
    st1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64
    st1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11], #64
    st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1         {v8.8h, v9.8h, v10.8h, v11.8h}, [x11], #64
    st1         {v12.8h, v13.8h, v14.8h, v15.8h}, [x11]
    ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [sp], #64
    ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [sp], #64
    ld1         {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
    ld1         {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
    ret

    .size       interpolating6Ch32TapFilter, [.-interpolating6Ch32TapFilter]

    .align      6
    .type       decimating6Ch32TapFilter, %function

// decimating6Ch32TapFilter
//
// Six-channel, 32-tap decimating polyphase filter.
//
// On entry (file-wide convention):
//   X0 -> resampler state, X1 -> interleaved int16 input (6 halfwords per frame),
//   X2 -> interleaved int16 output (6 halfwords per frame), W3 = input frame count.
// On return:
//   X0 = number of output frames written.
//
// Each pass of loop "2" consumes one input frame and shifts it into the six
// 32-sample delay lines. Once stuffIndex reaches stuffCount, one output frame is
// produced as the dot product of every delay line with the current kernel phase,
// then the kernel phase and the DDA phase accumulator are advanced.
//
// Register roles:
//   v16-v19 ch0, v20-v23 ch1, v0-v3 ch2, v4-v7 ch3, v8-v11 ch4, v12-v15 ch5 (delay lines)
//   v24-v27 kernel for the current phase, v28-v31 incoming samples / accumulators
//   X12 kernel base, X13 current kernel pointer, X14 output frame counter
//
// v0-v15 are saved to and restored from a 256-byte stack area.
// Clobbers X1-X16, v16-v31 and the condition flags.
decimating6Ch32TapFilter:
    mov         x14, #0             // X14 --> return value, number of output samples produced
    cbz         w3, 4f              // no input: return 0 without touching the state or the stack
    sub         x13, sp, #256       // X13 walks the save area so SP itself can move in one step
    sub         sp, sp, #256        // make room to save v0 through v15
    st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
    st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x13], #64
    st1         {v8.8h, v9.8h, v10.8h, v11.8h}, [x13], #64
    st1         {v12.8h, v13.8h, v14.8h, v15.8h}, [x13]
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldrb        w15, [x0, #6]       // W15 --> minimum stuff count
    ldrb        w16, [x0, #7]       // W16 --> maximum stuff count
    sxtw        x7, w7
    ld1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64    // v16, v17, v18, v19: channel 0 data
    add         x13, x12, x7, lsl #6                            // X13 --> kernel of current phase (64 bytes per phase)
    ld1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11], #64 // v20, v21, v22, v23: channel 1 data
    ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64    // v0, v1, v2, v3: channel 2 data
    ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64    // v4, v5, v6, v7: channel 3 data
    ld1         {v8.8h, v9.8h, v10.8h, v11.8h}, [x11], #64  // v8, v9, v10, v11: channel 4 data
    ld1         {v12.8h, v13.8h, v14.8h, v15.8h}, [x11]     // v12, v13, v14, v15: channel 5 data
1:
    ld1         {v24.8h, v25.8h, v26.8h, v27.8h}, [x13] // v24, v25, v26, v27: filter kernel for current phase
2:
    // Consume one input frame: every delay line shifts down by one halfword and
    // the new sample enters the top lane of its last vector.
    ld4         {v28.h, v29.h, v30.h, v31.h}[0], [x1], #8       // channels 0-3 of the next input frame
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    ext         v0.16b, v0.16b, v1.16b, #2
    ext         v4.16b, v4.16b, v5.16b, #2
    ext         v17.16b, v17.16b, v18.16b, #2
    ext         v21.16b, v21.16b, v22.16b, #2
    ext         v1.16b, v1.16b, v2.16b, #2
    ext         v5.16b, v5.16b, v6.16b, #2
    ext         v18.16b, v18.16b, v19.16b, #2
    ext         v22.16b, v22.16b, v23.16b, #2
    ext         v2.16b, v2.16b, v3.16b, #2
    ext         v6.16b, v6.16b, v7.16b, #2
    ext         v19.16b, v19.16b, v28.16b, #2
    ext         v23.16b, v23.16b, v29.16b, #2
    ext         v3.16b, v3.16b, v30.16b, #2
    ext         v7.16b, v7.16b, v31.16b, #2
    ld2         {v30.h, v31.h}[0], [x1], #4                     // channels 4-5; v30/v31 are free again after the two ext above
    ext         v8.16b, v8.16b, v9.16b, #2
    ext         v12.16b, v12.16b, v13.16b, #2
    subs        w3, w3, #1                                      // one input frame consumed; Z set when input is exhausted
    ext         v9.16b, v9.16b, v10.16b, #2
    ext         v13.16b, v13.16b, v14.16b, #2
    add         w5, w5, #1                                      // stuffIndex++ (does not alter the flags from subs)
    ext         v10.16b, v10.16b, v11.16b, #2
    ext         v14.16b, v14.16b, v15.16b, #2
    ccmp        w5, w6, #0x4, eq        // if W3 == 0: sets w5 - w6; if W3 != 0, sets (nZcv) EQ
    ext         v11.16b, v11.16b, v30.16b, #2
    ext         v15.16b, v15.16b, v31.16b, #2
    prfm        pldl1keep, [x1, #64]

    b.ne        3f                  // if conditional compare was applied, and W5 != W6, then save state and exit
    cmp         w5, w6              // need to fetch more source samples?
    b.lo        2b                  // yes

    // Enough input has been shifted in: produce one output frame.
    sub         v28.4s, v28.4s, v28.4s                          // clear the accumulators for channels 0-3
    sub         v29.4s, v29.4s, v29.4s
    mov         w5, #0              // reset stuff index
    sub         v30.4s, v30.4s, v30.4s
    sub         v31.4s, v31.4s, v31.4s
    prfm        pldl1strm, [x1, #64]

    smlal       v28.4s, v16.4h, v24.4h
    add         w7, w7, #1                                      // advance interpolationIndex ...
    smlal       v29.4s, v20.4h, v24.4h
    cmp         w7, w4
    smlal       v30.4s, v0.4h, v24.4h
    csel        w7, wzr, w7, hs                                 // ... wrapping to 0 once it reaches interpolationFactor
    smlal       v31.4s, v4.4h, v24.4h
    add         x13, x12, x7, lsl #6                            // X13 --> kernel for the next output (loaded at label 1)
    smlal2      v28.4s, v16.8h, v24.8h
    prfm        pldl1keep, [x13, #128]
    smlal2      v29.4s, v20.8h, v24.8h
    smlal2      v30.4s, v0.8h, v24.8h
    smlal2      v31.4s, v4.8h, v24.8h
    prfm        pstl1strm, [x2, #64]
    smlal       v28.4s, v17.4h, v25.4h
    smlal       v29.4s, v21.4h, v25.4h
    smlal       v30.4s, v1.4h, v25.4h
    smlal       v31.4s, v5.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    smlal2      v29.4s, v21.8h, v25.8h
    smlal2      v30.4s, v1.8h, v25.8h
    smlal2      v31.4s, v5.8h, v25.8h
    smlal       v28.4s, v18.4h, v26.4h
    smlal       v29.4s, v22.4h, v26.4h
    smlal       v30.4s, v2.4h, v26.4h
    smlal       v31.4s, v6.4h, v26.4h
    smlal2      v28.4s, v18.8h, v26.8h
    smlal2      v29.4s, v22.8h, v26.8h
    smlal2      v30.4s, v2.8h, v26.8h
    smlal2      v31.4s, v6.8h, v26.8h
    smlal       v28.4s, v19.4h, v27.4h
    smlal       v29.4s, v23.4h, v27.4h
    smlal       v30.4s, v3.4h, v27.4h
    smlal       v31.4s, v7.4h, v27.4h
    smlal2      v28.4s, v19.8h, v27.8h
    smlal2      v29.4s, v23.8h, v27.8h
    smlal2      v30.4s, v3.8h, v27.8h
    smlal2      v31.4s, v7.8h, v27.8h
    addv        s28, v28.4s                                     // horizontal sum of the four partial sums per channel
    addv        s29, v29.4s
    addv        s30, v30.4s
    addv        s31, v31.4s
    sqrshrn     h28, s28, #15                                   // shift right by 15 with rounding, saturate to int16
    sqrshrn     h29, s29, #15
    sqrshrn     h30, s30, #15
    sqrshrn     h31, s31, #15
    st2         {v28.h, v29.h}[0], [x2], #4                     // output channels 0 and 1
    sub         v28.4s, v28.4s, v28.4s                          // reuse v28/v29 as accumulators for channels 4 and 5
    sub         v29.4s, v29.4s, v29.4s
    st2         {v30.h, v31.h}[0], [x2], #4                     // output channels 2 and 3
    smlal       v28.4s, v8.4h, v24.4h
    smlal       v29.4s, v12.4h, v24.4h
    smlal2      v28.4s, v8.8h, v24.8h
    smlal2      v29.4s, v12.8h, v24.8h
    smlal       v28.4s, v9.4h, v25.4h
    smlal       v29.4s, v13.4h, v25.4h
    smlal2      v28.4s, v9.8h, v25.8h
    smlal2      v29.4s, v13.8h, v25.8h
    smlal       v28.4s, v10.4h, v26.4h
    smlal       v29.4s, v14.4h, v26.4h
    smlal2      v28.4s, v10.8h, v26.8h
    smlal2      v29.4s, v14.8h, v26.8h
    smlal       v28.4s, v11.4h, v27.4h
    smlal       v29.4s, v15.4h, v27.4h
    smlal2      v28.4s, v11.8h, v27.8h
    smlal2      v29.4s, v15.8h, v27.8h
    adds        w9, w9, w8                                      // phase += ddaTerm, setting flags for the selects below
    add         w11, w9, w4                                     // candidate phase + interpolationFactor
    addv        s28, v28.4s
    addv        s29, v29.4s
    csel        w6, w15, w16, lt                                // phase negative: next stuffCount = W15, otherwise W16
    csel        w9, w11, w9, lt                                 // phase negative: phase += interpolationFactor
    sqrshrn     h28, s28, #15
    sqrshrn     h29, s29, #15
    add         x14, x14, #1                                    // one more output frame produced
    st2         {v28.h, v29.h}[0], [x2], #4                     // output channels 4 and 5
    cbnz        w3, 1b                                          // more input: reload the kernel and continue
3:
    // Common exit: persist the scalar state and the delay lines, then restore v0-v15.
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w5, [x0, #2]        // write back stuffIndex
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    st1         {v16.8h, v17.8h, v18.8h, v19.8h}, [x11], #64
    st1         {v20.8h, v21.8h, v22.8h, v23.8h}, [x11], #64
    st1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1         {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1         {v8.8h, v9.8h, v10.8h, v11.8h}, [x11], #64
    st1         {v12.8h, v13.8h, v14.8h, v15.8h}, [x11]
    ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [sp], #64
    ld1         {v4.8h, v5.8h, v6.8h, v7.8h}, [sp], #64
    ld1         {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
    ld1         {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
4:
    mov         x0, x14             // return the number of output frames produced
    ret

    .size       decimating6Ch32TapFilter, [.-decimating6Ch32TapFilter]

    .align      6
    .type       interpolating6Ch16TapFilter, %function

// interpolating6Ch16TapFilter
//
// Six-channel, 16-tap interpolating polyphase filter.
//
// On entry (file-wide convention):
//   X0 -> resampler state, X1 -> interleaved int16 input (6 halfwords per frame),
//   X2 -> interleaved int16 output (6 halfwords per frame), W3 = input frame count.
// On return:
//   X0 = number of output frames written (0 when W3 is 0 on entry).
//
// Every pass of loop "1" emits one output frame. An input frame is shifted into
// the six 16-sample delay lines first only when stuffIndex < stuffCount; the
// phase accumulator then decides whether the next output needs fresh input
// (stuffCount = 1) or reuses the current delay-line contents (stuffCount = 0).
// The loop ends once all input frames have been consumed.
//
// Register roles:
//   v16/v17 ch0, v20/v21 ch1, v0/v1 ch2, v4/v5 ch3, v8/v9 ch4, v12/v13 ch5 (delay lines)
//   v24/v25 kernel for the current phase, v26-v31 incoming samples / accumulators
//   X12 kernel base, X13 current kernel pointer, X14 output frame counter
//
// v0, v1, v4, v5, v8, v9, v12 and v13 are saved to and restored from a 128-byte
// stack area. Clobbers X1-X15, v16-v31 and the condition flags.
interpolating6Ch16TapFilter:
    sub         x13, sp, #128       // X13 walks the save area so SP itself can move in one step
    sub         sp, sp, #128        // make room to save v0, v1, v4, v5, v8, v9, v12, v13
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    st1         {v0.8h, v1.8h}, [x13], #32
    st1         {v4.8h, v5.8h}, [x13], #32
    st1         {v8.8h, v9.8h}, [x13], #32
    st1         {v12.8h, v13.8h}, [x13]
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    sxtw        x7, w7
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    ld1         {v16.8h, v17.8h}, [x11], #32    // v16, v17: channel 0 data
    ld1         {v20.8h, v21.8h}, [x11], #32    // v20, v21: channel 1 data
    mov         x14, #0             // X14 --> return value, number of output samples produced
    ld1         {v0.8h, v1.8h}, [x11], #32      // v0, v1: channel 2 data
    ld1         {v4.8h, v5.8h}, [x11], #32      // v4, v5: channel 3 data
    ld1         {v8.8h, v9.8h}, [x11], #32      // v8, v9: channel 4 data
    ld1         {v12.8h, v13.8h}, [x11]         // v12, v13: channel 5 data
    // No input: go straight to the epilogue at 3. The state and delay lines are
    // written back unchanged and 0 is returned. Branching to 2 here would run the
    // multiply-accumulate with v24/v25 never loaded and X12 not yet dereferenced,
    // emitting one garbage output frame and advancing the phase.
    cbz         w3, 3f
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    prfm        pldl1strm, [x1]
    prfm        pstl1keep, [x2]
    add         x13, x12, x7, lsl #5            // X13 --> kernel of current phase (32 bytes per phase)
1:
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for current phase
    add         w7, w7, #1                      // advance interpolationIndex (wrapped below)
    cmp         w5, w6                          // need a fresh input frame for this output?
    b.hs        2f                              // no: reuse the current delay-line contents
    // Consume one input frame: every delay line shifts down by one halfword and
    // the new sample enters the top lane of its last vector.
    ld4         {v28.h, v29.h, v30.h, v31.h}[0], [x1], #8      // channels 0-3 of the next input frame
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    ld2         {v26.h, v27.h}[0], [x1], #4                    // channels 4-5 of the next input frame
    ext         v0.16b, v0.16b, v1.16b, #2
    ext         v4.16b, v4.16b, v5.16b, #2
    ext         v17.16b, v17.16b, v28.16b, #2
    ext         v21.16b, v21.16b, v29.16b, #2
    prfm        pldl1keep, [x1, #72]
    ext         v1.16b, v1.16b, v30.16b, #2
    ext         v5.16b, v5.16b, v31.16b, #2
    ext         v8.16b, v8.16b, v9.16b, #2
    ext         v12.16b, v12.16b, v13.16b, #2
    sub         w3, w3, #1                      // one input frame consumed
    ext         v9.16b, v9.16b, v26.16b, #2
    ext         v13.16b, v13.16b, v27.16b, #2
2:
    // Produce one output frame: dot product of each delay line with the kernel.
    sub         v28.4s, v28.4s, v28.4s          // clear the six accumulators
    sub         v29.4s, v29.4s, v29.4s
    sub         v30.4s, v30.4s, v30.4s
    sub         v31.4s, v31.4s, v31.4s
    smlal       v28.4s, v16.4h, v24.4h
    sub         v26.4s, v26.4s, v26.4s
    smlal       v29.4s, v20.4h, v24.4h
    cmp         w7, w4
    smlal       v30.4s, v0.4h, v24.4h
    sub         v27.4s, v27.4s, v27.4s
    smlal       v31.4s, v4.4h, v24.4h
    csel        w7, wzr, w7, hs                 // wrap interpolationIndex to 0 at interpolationFactor
    smlal       v26.4s, v8.4h, v24.4h
    add         x13, x12, x7, lsl #5            // X13 --> kernel for the next output (loaded at label 1)
    smlal       v27.4s, v12.4h, v24.4h
    smlal2      v28.4s, v16.8h, v24.8h
    prfm        pldl1keep, [x13, #192]
    smlal2      v29.4s, v20.8h, v24.8h
    add         w9, w9, w8                      // phase += ddaTerm
    smlal2      v30.4s, v0.8h, v24.8h
    mov         w15, #1
    prfm        pstl1keep, [x2, #72]
    smlal2      v31.4s, v4.8h, v24.8h
    cmp         w9, w4
    smlal2      v26.4s, v8.8h, v24.8h
    sub         w11, w9, w4                     // candidate phase - interpolationFactor
    smlal2      v27.4s, v12.8h, v24.8h
    csel        w6, w15, wzr, hs                // phase >= factor: next output consumes input (stuffCount = 1), else 0
    smlal       v28.4s, v17.4h, v25.4h
    csel        w9, w11, w9, hs                 // phase >= factor: phase -= interpolationFactor
    smlal       v29.4s, v21.4h, v25.4h
    smlal       v30.4s, v1.4h, v25.4h
    smlal       v31.4s, v5.4h, v25.4h
    smlal       v26.4s, v9.4h, v25.4h
    smlal       v27.4s, v13.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    smlal2      v29.4s, v21.8h, v25.8h
    smlal2      v30.4s, v1.8h, v25.8h
    smlal2      v31.4s, v5.8h, v25.8h
    smlal2      v26.4s, v9.8h, v25.8h
    smlal2      v27.4s, v13.8h, v25.8h
    addv        s28, v28.4s                     // horizontal sum of the four partial sums per channel
    addv        s29, v29.4s
    addv        s30, v30.4s
    addv        s31, v31.4s
    addv        s26, v26.4s
    addv        s27, v27.4s
    sqrshrn     h28, s28, #15                   // shift right by 15 with rounding, saturate to int16
    sqrshrn     h29, s29, #15
    sqrshrn     h30, s30, #15
    sqrshrn     h31, s31, #15
    sqrshrn     h26, s26, #15
    sqrshrn     h27, s27, #15
    add         x14, x14, #1                    // one more output frame produced
    st4         {v28.h, v29.h, v30.h, v31.h}[0], [x2], #8      // output channels 0-3
    st2         {v26.h, v27.h}[0], [x2], #4                    // output channels 4-5
    cbnz        w3, 1b                          // more input: reload the kernel and continue
3:
    // Common exit: persist the scalar state and the delay lines, then restore the saved registers.
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    mov         x0, x14             // return the number of output frames produced
    st1         {v16.8h, v17.8h}, [x11], #32
    st1         {v20.8h, v21.8h}, [x11], #32
    st1         {v0.8h, v1.8h}, [x11], #32
    st1         {v4.8h, v5.8h}, [x11], #32
    st1         {v8.8h, v9.8h}, [x11], #32
    st1         {v12.8h, v13.8h}, [x11]
    ld1         {v0.8h, v1.8h}, [sp], #32
    ld1         {v4.8h, v5.8h}, [sp], #32
    ld1         {v8.8h, v9.8h}, [sp], #32
    ld1         {v12.8h, v13.8h}, [sp], #32
    ret

    .size       interpolating6Ch16TapFilter, [.-interpolating6Ch16TapFilter]

    .align      6
    .type       decimating6Ch16TapFilter, %function

// decimating6Ch16TapFilter
//
// Six-channel, 16-tap decimating polyphase filter.
//
// On entry (file-wide convention):
//   X0 -> resampler state, X1 -> interleaved int16 input (6 halfwords per frame),
//   X2 -> interleaved int16 output (6 halfwords per frame), W3 = input frame count.
// On return:
//   X0 = number of output frames written.
//
// Each pass of loop "2" consumes one input frame and shifts it into the six
// 16-sample delay lines. Once stuffIndex reaches stuffCount, one output frame is
// produced as the dot product of every delay line with the current kernel phase,
// then the kernel phase and the DDA phase accumulator are advanced.
//
// Register roles:
//   v16/v17 ch0, v20/v21 ch1, v0/v1 ch2, v4/v5 ch3, v8/v9 ch4, v12/v13 ch5 (delay lines)
//   v24/v25 kernel for the current phase, v26-v31 incoming samples / accumulators
//   X12 kernel base, X13 current kernel pointer, X14 output frame counter
//
// v0, v1, v4, v5, v8, v9, v12 and v13 are saved to and restored from a 128-byte
// stack area. Clobbers X1-X16, v16-v31 and the condition flags.
decimating6Ch16TapFilter:
    mov         x14, #0             // X14 --> return value, number of output samples produced
    cbz         w3, 4f              // no input: return 0 without touching the state or the stack
    sub         x13, sp, #128       // X13 walks the save area so SP itself can move in one step
    sub         sp, sp, #128        // make room to save v0, v1, v4, v5, v8, v9, v12, v13
    st1         {v0.8h, v1.8h}, [x13], #32
    st1         {v4.8h, v5.8h}, [x13], #32
    st1         {v8.8h, v9.8h}, [x13], #32
    st1         {v12.8h, v13.8h}, [x13]
    prfm        pldl1strm, [x1]
    ldr         x12, [x0, #16]      // X12 --> ptr to kernel ptrs, indexed by interpolationIndex
    ldr         x10, [x0, #24]      // X10 --> pDelayLines
    ldrb        w4, [x0, #0]        // W4 --> interpolationFactor
    ldrb        w5, [x0, #2]        // W5 --> stuffIndex
    ldrb        w6, [x0, #3]        // W6 --> stuffCount
    ldrb        w7, [x0, #5]        // W7 --> interpolationIndex
    ldrsh       w8, [x0, #8]        // W8 --> ddaTerm
    ldrsh       w9, [x0, #10]       // W9 --> phase
    ldr         x12, [x12]          // X12 --> filter elements of phase 0
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    ldrb        w15, [x0, #6]       // W15 --> minimum stuff count
    ldrb        w16, [x0, #7]       // W16 --> maximum stuff count
    ld1         {v16.8h, v17.8h}, [x11], #32    // v16, v17: channel 0 data
    add         x13, x12, x7, lsl #5            // X13 --> kernel of current phase (32 bytes per phase)
    ld1         {v20.8h, v21.8h}, [x11], #32    // v20, v21: channel 1 data
    ld1         {v0.8h, v1.8h}, [x11], #32      // v0, v1: channel 2 data
    ld1         {v4.8h, v5.8h}, [x11], #32      // v4, v5: channel 3 data
    ld1         {v8.8h, v9.8h}, [x11], #32      // v8, v9: channel 4 data
    ld1         {v12.8h, v13.8h}, [x11]         // v12, v13: channel 5 data
1:
    ld1         {v24.8h, v25.8h}, [x13] // v24, v25: filter kernel for current phase
2:
    // Consume one input frame: every delay line shifts down by one halfword and
    // the new sample enters the top lane of its last vector.
    ld4         {v28.h, v29.h, v30.h, v31.h}[0], [x1], #8      // channels 0-3 of the next input frame
    subs        w3, w3, #1                                     // one input frame consumed; Z set when input is exhausted
    ld2         {v26.h, v27.h}[0], [x1], #4                    // channels 4-5 of the next input frame
    ext         v16.16b, v16.16b, v17.16b, #2
    ext         v20.16b, v20.16b, v21.16b, #2
    add         w5, w5, #1                                     // stuffIndex++ (does not alter the flags from subs)
    ext         v0.16b, v0.16b, v1.16b, #2
    ext         v4.16b, v4.16b, v5.16b, #2
    ccmp        w5, w6, #0x4, eq        // if W3 == 0: sets w5 - w6; if W3 != 0, sets (nZcv) EQ
    ext         v17.16b, v17.16b, v28.16b, #2
    ext         v21.16b, v21.16b, v29.16b, #2
    ext         v1.16b, v1.16b, v30.16b, #2
    ext         v5.16b, v5.16b, v31.16b, #2
    ext         v8.16b, v8.16b, v9.16b, #2
    ext         v12.16b, v12.16b, v13.16b, #2
    ext         v9.16b, v9.16b, v26.16b, #2
    ext         v13.16b, v13.16b, v27.16b, #2
    prfm        pldl1keep, [x1, #128]

    b.ne        3f                  // if conditional compare was applied, and W5 != W6, then save state and exit
    cmp         w5, w6              // need to fetch more source samples?
    b.lo        2b                  // yes

    // Enough input has been shifted in: produce one output frame.
    mov         w5, #0              // reset stuff index
    sub         v26.4s, v26.4s, v26.4s          // clear the six accumulators
    sub         v27.4s, v27.4s, v27.4s
    sub         v28.4s, v28.4s, v28.4s
    sub         v29.4s, v29.4s, v29.4s
    prfm        pstl1strm, [x2, #128]
    sub         v30.4s, v30.4s, v30.4s
    sub         v31.4s, v31.4s, v31.4s

    smlal       v28.4s, v16.4h, v24.4h
    adds        w9, w9, w8                      // phase += ddaTerm, setting flags for the selects below
    smlal       v29.4s, v20.4h, v24.4h
    add         w7, w7, #1                      // advance interpolationIndex (wrapped below)
    smlal       v30.4s, v0.4h, v24.4h
    add         w11, w9, w4                     // candidate phase + interpolationFactor
    smlal       v31.4s, v4.4h, v24.4h
    csel        w6, w15, w16, lt                // phase negative: next stuffCount = W15, otherwise W16
    smlal       v26.4s, v8.4h, v24.4h
    csel        w9, w11, w9, lt                 // phase negative: phase += interpolationFactor
    smlal       v27.4s, v12.4h, v24.4h
    cmp         w7, w4
    smlal2      v28.4s, v16.8h, v24.8h
    csel        w7, wzr, w7, hs                 // wrap interpolationIndex to 0 at interpolationFactor
    smlal2      v29.4s, v20.8h, v24.8h
    add         x13, x12, x7, lsl #5            // X13 --> kernel for the next output (loaded at label 1)
    smlal2      v30.4s, v0.8h, v24.8h
    prfm        pldl1keep, [x13, #128]
    smlal2      v31.4s, v4.8h, v24.8h
    smlal2      v26.4s, v8.8h, v24.8h
    smlal2      v27.4s, v12.8h, v24.8h
    smlal       v28.4s, v17.4h, v25.4h
    smlal       v29.4s, v21.4h, v25.4h
    smlal       v30.4s, v1.4h, v25.4h
    smlal       v31.4s, v5.4h, v25.4h
    smlal       v26.4s, v9.4h, v25.4h
    smlal       v27.4s, v13.4h, v25.4h
    smlal2      v28.4s, v17.8h, v25.8h
    smlal2      v29.4s, v21.8h, v25.8h
    smlal2      v30.4s, v1.8h, v25.8h
    smlal2      v31.4s, v5.8h, v25.8h
    smlal2      v26.4s, v9.8h, v25.8h
    smlal2      v27.4s, v13.8h, v25.8h
    addv        s28, v28.4s                     // horizontal sum of the four partial sums per channel
    addv        s29, v29.4s
    addv        s30, v30.4s
    addv        s31, v31.4s
    addv        s26, v26.4s
    addv        s27, v27.4s
    sqrshrn     h28, s28, #15                   // shift right by 15 with rounding, saturate to int16
    sqrshrn     h29, s29, #15
    sqrshrn     h30, s30, #15
    sqrshrn     h31, s31, #15
    sqrshrn     h26, s26, #15
    sqrshrn     h27, s27, #15
    add         x14, x14, #1                    // one more output frame produced
    st4         {v28.h, v29.h, v30.h, v31.h}[0], [x2], #8      // output channels 0-3
    st2         {v26.h, v27.h}[0], [x2], #4                    // output channels 4-5
    cbnz        w3, 1b                          // more input: reload the kernel and continue
3:
    // Common exit: persist the scalar state and the delay lines, then restore the saved registers.
    ldr         x11, [x10, #0]      // X11 --> delay line channel 0, initial vector location
    strb        w5, [x0, #2]        // write back stuffIndex
    strb        w6, [x0, #3]        // write back stuffCount
    strb        w7, [x0, #5]        // write back interpolationIndex
    strh        w9, [x0, #10]       // write back phase
    st1         {v16.8h, v17.8h}, [x11], #32
    st1         {v20.8h, v21.8h}, [x11], #32
    st1         {v0.8h, v1.8h}, [x11], #32
    st1         {v4.8h, v5.8h}, [x11], #32
    st1         {v8.8h, v9.8h}, [x11], #32
    st1         {v12.8h, v13.8h}, [x11]
    ld1         {v0.8h, v1.8h}, [sp], #32
    ld1         {v4.8h, v5.8h}, [sp], #32
    ld1         {v8.8h, v9.8h}, [sp], #32
    ld1         {v12.8h, v13.8h}, [sp], #32
4:
    mov         x0, x14             // return the number of output frames produced
    ret

    .size       decimating6Ch16TapFilter, [.-decimating6Ch16TapFilter]
