/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

/* Stack helper macros.
 *
 * AArch64 requires sp to be 16-byte aligned whenever it is used as the base
 * of a memory access, so every helper moves sp by exactly 16 bytes. The
 * single-register forms therefore occupy a full 16-byte slot (the upper
 * 8 bytes are padding). push/pop and push2/pop2 must be used in matching
 * pairs, in reverse order.
 */

/* Store one X register in a fresh 16-byte stack slot. */
.macro push xreg
    str \xreg, [sp, #(-16)]!
.endm

/* Store a pair of X registers in a fresh 16-byte stack slot. */
.macro push2 xreg1, xreg2
    stp \xreg1, \xreg2, [sp, #(-16)]!
.endm

/* Reload one X register saved by `push` and release its 16-byte slot. */
.macro pop xreg
    ldr \xreg, [sp], #16
.endm

/* Reload a pair of X registers saved by `push2` and release the slot. */
.macro pop2 xreg1, xreg2
    ldp \xreg1, \xreg2, [sp], #16
.endm

/* BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 *
 * For each i in [0, num): rp[i] = low32(rp[i] + ap[i] * w + carry), where the
 * carry is chained from one word to the next. Words are 32 bits wide (every
 * load and store of a word uses a w register).
 *
 * x0: rp   (post-incremented past the words written)
 * x1: ap   (post-incremented past the words read)
 * x2: num  (an int: only w2 is defined on entry, x2[63:32] is unspecified)
 * x3: w    (only w3 is read)
 *
 * Returns:  x0 = carry out of the last word (0 when num <= 0).
 * Clobbers: x1, x2, x4-x15, NZCV. The stack is not used.
 *
 * x15 always holds the most recent 64-bit partial sum; its upper 32 bits are
 * the carry into the next word. A partial sum is at most
 * (2^32-1)^2 + 2*(2^32-1) = 2^64-1, so it never overflows 64 bits.
 */
.section ".text.nndetailCryptoBignumMulAddWords"
.global nndetailCryptoBignumMulAddWords
nndetailCryptoBignumMulAddWords:
    mov                     x15, xzr                    // carry = 0
    cmp                     w2, #0
    csel                    w2, w2, wzr, gt             // num <= 0 -> 0; the w write also clears x2[63:32]
    cbz                     x2, MulAddEnd

    lsr                     x6, x2, #2                  // x6 = number of 4-word groups
    cbz                     x6, MulAddLess4

    sub                     x2, x2, x6, lsl #2          // x2 = num % 4

MulAddLoop4:

    ldp                     w4, w5, [x1], #8            // ap[0], ap[1]
    ldp                     w14, w7, [x1], #8           // ap[2], ap[3]
    ldp                     w8, w9, [x0]                // rp[0], rp[1]
    ldp                     w10, w11, [x0,#8]           // rp[2], rp[3]

    umaddl                  x4, w3, w4, x8              // w*ap[i] + rp[i]
    umaddl                  x5, w3, w5, x9
    umaddl                  x14, w3, w14, x10
    umaddl                  x7, w3, w7, x11

    add                     x12, x4, x15, lsr #32       // + carry from the previous word
    add                     x13, x5, x12, lsr #32
    stp                     w12, w13, [x0], #8
    add                     x14, x14, x13, lsr #32
    add                     x15, x7, x14, lsr #32       // x15 carries into the next group
    stp                     w14, w15, [x0], #8

    sub                     x6, x6, #1
    cbnz                    x6, MulAddLoop4

MulAddLess4:

    lsr                     x6, x2, #1                  // x6 = number of remaining 2-word groups (0 or 1)
    cbz                     x6, MulAddLess2

    sub                     x2, x2, x6, lsl #1          // x2 = num % 2

MulAddLoop2:

    ldp                     w4, w5, [x1], #8
    ldp                     w8, w9, [x0]

    umaddl                  x4, w3, w4, x8
    umaddl                  x5, w3, w5, x9

    sub                     x6, x6, #1

    add                     x14, x4, x15, lsr #32
    add                     x15, x5, x14, lsr #32

    stp                     w14, w15, [x0], #8

    cbnz                    x6, MulAddLoop2

MulAddLess2:

    cbz                     x2, MulAddEnd

    ldr                     w4, [x1], #4                // final odd word
    ldr                     w8, [x0]

    umaddl                  x4, w3, w4, x8
    add                     x15, x4, x15, lsr #32

    str                     w15, [x0], #4

MulAddEnd:

    lsr                     x0, x15, #32                // return the outgoing carry

    ret
.size nndetailCryptoBignumMulAddWords,.-nndetailCryptoBignumMulAddWords

/* BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 *
 * For each i in [0, num): rp[i] = low32(ap[i] * w + carry), where the carry is
 * chained from one word to the next. Words are 32 bits wide (every load and
 * store of a word uses a w register).
 *
 * x0: rp   (post-incremented past the words written)
 * x1: ap   (post-incremented past the words read)
 * x2: num  (an int: only w2 is defined on entry, x2[63:32] is unspecified)
 * x3: w    (only w3 is read)
 *
 * Returns:  x0 = carry out of the last word (0 when num <= 0).
 * Clobbers: x1, x2, x4-x8, x12-x15, NZCV. The stack is not used.
 *
 * x15 always holds the most recent 64-bit partial sum; its upper 32 bits are
 * the carry into the next word.
 */
.section ".text.nndetailCryptoBignumMulWords"
.global nndetailCryptoBignumMulWords
nndetailCryptoBignumMulWords:
    mov                     x15, xzr                    // carry = 0
    cmp                     w2, #0
    csel                    w2, w2, wzr, gt             // num <= 0 -> 0; the w write also clears x2[63:32]
    cbz                     x2, MulEnd

    lsr                     x8, x2, #2                  // x8 = number of 4-word groups
    cbz                     x8, MulLess4

    sub                     x2, x2, x8, lsl #2          // x2 = num % 4

MulLoop4:

    ldp                     w4, w5, [x1], #8            // ap[0], ap[1]
    ldp                     w6, w7, [x1], #8            // ap[2], ap[3]

    umull                   x4, w3, w4                  // w*ap[i]
    umull                   x5, w3, w5
    umull                   x6, w3, w6
    umull                   x7, w3, w7

    add                     x12, x4, x15, lsr #32       // + carry from the previous word
    add                     x13, x5, x12, lsr #32
    add                     x14, x6, x13, lsr #32
    add                     x15, x7, x14, lsr #32       // x15 carries into the next group

    sub                     x8, x8, #1

    stp                     w12, w13, [x0], #8
    stp                     w14, w15, [x0], #8

    cbnz                    x8, MulLoop4

MulLess4:

    lsr                     x6, x2, #1                  // x6 = number of remaining 2-word groups (0 or 1)
    cbz                     x6, MulLess2

    sub                     x2, x2, x6, lsl #1          // x2 = num % 2

MulLoop2:

    ldp                     w4, w5, [x1], #8

    umull                   x4, w3, w4
    umull                   x5, w3, w5

    sub                     x6, x6, #1

    add                     x14, x4, x15, lsr #32
    add                     x15, x5, x14, lsr #32

    stp                     w14, w15, [x0], #8

    cbnz                    x6, MulLoop2

MulLess2:

    cbz                     x2, MulEnd

    ldr                     w4, [x1], #4                // final odd word

    umull                   x4, w3, w4
    add                     x15, x4, x15, lsr #32

    str                     w15, [x0], #4

MulEnd:

    lsr                     x0, x15, #32                // return the outgoing carry

    ret
.size nndetailCryptoBignumMulWords,.-nndetailCryptoBignumMulWords

/* BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 *
 * r[i] = a[i] + b[i] + carry for i in [0, n), on 32-bit words stored
 * least-significant word first. The bulk of the work is done on 64-bit
 * register pairs (two words per X register), which gives the same result
 * as word-by-word addition on a little-endian target.
 *
 * x0: r   (post-incremented)
 * x1: a   (post-incremented)
 * x2: b   (post-incremented)
 * x3: n   (an int: only w3 is defined on entry, x3[63:32] is unspecified)
 *
 * Returns:  x0 = carry out of the last word (0 or 1; 0 when n <= 0).
 * Clobbers: x1-x17, NZCV. x19-x22 are saved on the stack and restored.
 *
 * The carry lives in the C flag for the whole function: between the initial
 * `msr` and the final `adc`, only adcs may modify the flags. Loop counters
 * are therefore updated with non-flag-setting sub + cbz/cbnz.
 */
.section ".text.nndetailCryptoBignumAddWords"
.global nndetailCryptoBignumAddWords
nndetailCryptoBignumAddWords:
    cmp                     w3, #0
    csel                    w3, w3, wzr, gt             // n <= 0 -> 0; the w write also clears x3[63:32]
    msr                     NZCV, xzr                   // C = 0: no carry into the first word
    cbz                     x3, AddEnd

    stp                     x19, x20, [sp, #-32]!       // callee-saved scratch; sp stays 16-byte aligned
    stp                     x21, x22, [sp, #16]

    lsr                     x20, x3, #4                 // x20 = number of 16-word (64-byte) groups
    cbz                     x20, AddLess16

    sub                     x3, x3, x20, lsl #4         // x3 = n % 16

AddLoop16:

    ldp                     x4,   x5, [x1], #16
    ldp                     x12, x13, [x2], #16
    ldp                     x6,   x7, [x1], #16
    ldp                     x14, x15, [x2], #16
    ldp                     x8,   x9, [x1], #16
    ldp                     x16, x17, [x2], #16
    ldp                     x10, x11, [x1], #16
    ldp                     x21, x19, [x2], #16

    adcs                    x4,  x4,  x12
    adcs                    x5,  x5,  x13

    stp                     x4,  x5,  [x0], #16

    adcs                    x6,  x6,  x14
    adcs                    x7,  x7,  x15

    stp                     x6,  x7,  [x0], #16

    adcs                    x8,  x8,  x16
    adcs                    x9,  x9,  x17

    stp                     x8,  x9,  [x0], #16

    adcs                    x10, x10, x21
    adcs                    x11, x11, x19

    stp                     x10, x11, [x0], #16

    sub                     x20, x20, #1
    cbnz                    x20, AddLoop16

AddLess16:

    lsr                     x15, x3, #2                 // x15 = number of remaining 4-word groups
    cbz                     x15, AddLess4

    sub                     x3, x3, x15, lsl #2         // x3 = n % 4

AddLoop4:

    ldp                     x4,  x5,  [x1], #16
    ldp                     x8,  x9,  [x2], #16

    sub                     x15, x15, #1

    adcs                    x4,  x4,  x8
    adcs                    x5,  x5,  x9

    stp                     x4,  x5,  [x0], #16

    cbnz                    x15, AddLoop4

AddLess4:

    cbz                     x3, AddLoopEnd

AddLoop:

    ldr                     w4, [x1], #4                // trailing words, one at a time
    ldr                     w8, [x2], #4
    adcs                    w4,  w4,  w8                // 32-bit form: C is the carry out of bit 31
    str                     w4, [x0], #4

    sub                     x3, x3, #1
    cbnz                    x3, AddLoop

AddLoopEnd:

    ldp                     x21, x22, [sp, #16]
    ldp                     x19, x20, [sp], #32

AddEnd:

    adc                     x0, xzr, xzr                // return C as 0 or 1

    ret
.size nndetailCryptoBignumAddWords,.-nndetailCryptoBignumAddWords

/* BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 *
 * r[i] = a[i] - b[i] - borrow for i in [0, n), on 32-bit words stored
 * least-significant word first. The bulk of the work is done on 64-bit
 * register pairs (two words per X register), which gives the same result
 * as word-by-word subtraction on a little-endian target.
 *
 * x0: r   (post-incremented)
 * x1: a   (post-incremented)
 * x2: b   (post-incremented)
 * x3: n   (an int: only w3 is defined on entry, x3[63:32] is unspecified)
 *
 * Returns:  x0 = borrow out of the last word (0 or 1; 0 when n <= 0).
 * Clobbers: x1-x17, NZCV. x19-x22 are saved on the stack and restored.
 *
 * The borrow lives in the C flag for the whole function, using the AArch64
 * convention C = 1 for "no borrow". Between the initial `msr` and the final
 * `cinc`, only sbcs may modify the flags. Loop counters are therefore updated
 * with non-flag-setting sub + cbz/cbnz.
 */
.section ".text.nndetailCryptoBignumSubWords"
.global nndetailCryptoBignumSubWords
nndetailCryptoBignumSubWords:

    cmp                     w3, #0
    csel                    w3, w3, wzr, gt             // n <= 0 -> 0; the w write also clears x3[63:32]
    mov                     x4, #0x20000000             // NZCV bit 29 = C
    msr                     NZCV, x4                    // C = 1: no borrow into the first word
    cbz                     x3, SubEnd

    stp                     x19, x20, [sp, #-32]!       // callee-saved scratch; sp stays 16-byte aligned
    stp                     x21, x22, [sp, #16]

    lsr                     x20, x3, #4                 // x20 = number of 16-word (64-byte) groups
    cbz                     x20, SubLess16

    sub                     x3, x3, x20, lsl #4         // x3 = n % 16

SubLoop16:

    ldp                     x4,   x5, [x1], #16
    ldp                     x12, x13, [x2], #16
    ldp                     x6,   x7, [x1], #16
    ldp                     x14, x15, [x2], #16
    ldp                     x8,   x9, [x1], #16
    ldp                     x16, x17, [x2], #16
    ldp                     x10, x11, [x1], #16
    ldp                     x21, x19, [x2], #16

    sbcs                    x4,  x4,  x12
    sbcs                    x5,  x5,  x13

    stp                     x4,  x5,  [x0], #16

    sbcs                    x6,  x6,  x14
    sbcs                    x7,  x7,  x15

    stp                     x6,  x7,  [x0], #16

    sbcs                    x8,  x8,  x16
    sbcs                    x9,  x9,  x17

    stp                     x8,  x9,  [x0], #16

    sbcs                    x10, x10, x21
    sbcs                    x11, x11, x19

    stp                     x10, x11, [x0], #16

    sub                     x20, x20, #1
    cbnz                    x20, SubLoop16

SubLess16:

    lsr                     x15, x3, #2                 // x15 = number of remaining 4-word groups
    cbz                     x15, SubLess4

    sub                     x3, x3, x15, lsl #2         // x3 = n % 4

SubLoop4:

    ldp                     x4,  x5,  [x1], #16
    ldp                     x8,  x9,  [x2], #16

    sub                     x15, x15, #1

    sbcs                    x4,  x4,  x8
    sbcs                    x5,  x5,  x9

    stp                     x4,  x5,  [x0], #16

    cbnz                    x15, SubLoop4

SubLess4:

    cbz                     x3, SubLoopEnd

SubLoop:

    ldr                     w4, [x1], #4                // trailing words, one at a time
    ldr                     w8, [x2], #4
    sbcs                    w4,  w4,  w8                // 32-bit form: C reflects the borrow out of bit 31
    str                     w4, [x0], #4

    sub                     x3, x3, #1
    cbnz                    x3, SubLoop

SubLoopEnd:

    ldp                     x21, x22, [sp, #16]
    ldp                     x19, x20, [sp], #32

SubEnd:

    cinc                    x0, xzr, cc                 // C clear means a borrow occurred: return 1, else 0

    ret
.size nndetailCryptoBignumSubWords,.-nndetailCryptoBignumSubWords

/* void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 *
 * Column-wise (comba) multiplication of two 8-word operands into a 16-word
 * result, on 32-bit words stored least-significant word first.
 *
 * x0: r  (16 words written, offsets #0..#60)
 * x1: a  (8 words read)
 * x2: b  (8 words read)
 *
 * All of a and b is loaded into registers before the first store to r.
 * Clobbers: x1-x17, NZCV. x16-x23 are pushed on entry and popped on exit.
 * NOTE(review): x18 is used as scratch (and saved/restored); x18 is the
 * AAPCS64 platform register - confirm the target platform allows this.
 *
 * Carry scheme:
 *   x21      - 64-bit running sum for the current column. Its low word is
 *              stored as r[k]; `x21, lsr #32` seeds the next column.
 *   x22/x23  - used alternately; each counts how many times x21 wrapped
 *              past 2^64 while summing one column. A wrap in column k is
 *              worth 1 unit in column k+2, so the count is folded into
 *              column k+2 through the addend operand of the first umaddl.
 */
.section ".text.nndetailCryptoBignum_mul_comba8"
.global nndetailCryptoBignum_mul_comba8
nndetailCryptoBignum_mul_comba8:

    push2                   x16, x17
    push2                   x18, x19
    push2                   x20, x21
    push2                   x22, x23

    ldp                     w4,   w5, [x1]              // w4  = a0, w5  = a1
    ldp                     w12,  w13, [x2]             // w12 = b0, w13 = b1
    ldp                     w6,   w7, [x1, #8]          // w6  = a2, w7  = a3
    ldp                     w14,  w15, [x2, #8]         // w14 = b2, w15 = b3
    ldp                     w8,   w9, [x1, #16]         // w8  = a4, w9  = a5
    ldp                     w16,  w17, [x2, #16]        // w16 = b4, w17 = b5
    ldp                     w10,  w11, [x1, #24]        // w10 = a6, w11 = a7
    ldp                     w18,  w19, [x2, #24]        // w18 = b6, w19 = b7

    //// r[0] = low(a0*b0); r[1] = a0*b1 + a1*b0 (x22 counts wraps)
    umull                   x1, w4, w12                 // a0*b0
    umull                   x2, w4, w13                 // a0*b1
    umull                   x3, w5, w12                 // a1*b0

    adds                    x21, x2, x1, lsr #32        // a0*b1 + high word of a0*b0
    adc                     x22, xzr, xzr
    adds                    x21, x21, x3                // a0*b1 + a1*b0
    adc                     x22, x22, xzr

    stp                     w1, w21, [x0]

    //// r[2] = a2*b0 + a1*b1 + a0*b2 (x23 counts wraps)
    umull                   x1, w6, w12                 // a2*b0
    umull                   x2, w5, w13                 // a1*b1
    umull                   x3, w4, w14                 // a0*b2

    adds                    x21, x1, x21, lsr #32
    adc                     x23, xzr, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr

    str                     w21, [x0, #8]

    //// r[3] = a3*b0 + a2*b1 + a1*b2 + a0*b3 (x22 counts wraps)
    umaddl                  x1,  w7, w12, x22           // a3*b0 + wraps from r[1]
    umull                   x2,  w6, w13                // a2*b1
    umull                   x3,  w5, w14                // a1*b2
    umull                   x20, w4, w15                // a0*b3

    adds                    x21, x1, x21, lsr #32
    adc                     x22, xzr, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr
    adds                    x21, x21, x3
    adc                     x22, x22, xzr
    adds                    x21, x21, x20
    adc                     x22, x22, xzr

    str                     w21, [x0, #12]

    //// r[4] = a4*b0 + a3*b1 + a2*b2 + a1*b3 + a0*b4 (x23 counts wraps)
    umaddl                  x1,  w8, w12, x23           // a4*b0 + wraps from r[2]
    umull                   x2,  w7, w13                // a3*b1
    umull                   x3,  w6, w14                // a2*b2
    umull                   x20, w5, w15                // a1*b3

    adds                    x21, x1, x21, lsr #32
    adc                     x23, xzr, xzr
    umull                   x1,  w4, w16                // a0*b4
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr
    adds                    x21, x21, x20
    adc                     x23, x23, xzr
    adds                    x21, x21, x1
    adc                     x23, x23, xzr

    str                     w21, [x0, #16]

    //// r[5] = a5*b0 + a4*b1 + a3*b2 + a2*b3 + a1*b4 + a0*b5 (x22 counts wraps)
    umaddl                  x1,  w9, w12, x22           // a5*b0 + wraps from r[3]
    umull                   x2,  w8, w13                // a4*b1
    umull                   x3,  w7, w14                // a3*b2
    umull                   x20, w6, w15                // a2*b3

    adds                    x21, x1, x21, lsr #32
    adc                     x22, xzr, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr
    umull                   x1,  w5, w16                // a1*b4
    umull                   x2,  w4, w17                // a0*b5
    adds                    x21, x21, x3
    adc                     x22, x22, xzr
    adds                    x21, x21, x20
    adc                     x22, x22, xzr
    adds                    x21, x21, x1
    adc                     x22, x22, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr

    str                     w21, [x0, #20]

    //// r[6] = a6*b0 + a5*b1 + a4*b2 + a3*b3 + a2*b4 + a1*b5 + a0*b6 (x23 counts wraps)
    umaddl                  x1, w10, w12, x23           // a6*b0 + wraps from r[4]
    umull                   x2,  w9, w13                // a5*b1
    umull                   x3,  w8, w14                // a4*b2
    umull                   x20, w7, w15                // a3*b3

    adds                    x21, x1, x21, lsr #32
    adc                     x23, xzr, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr
    umull                   x1,  w6, w16                // a2*b4
    umull                   x2,  w5, w17                // a1*b5
    umull                   x3,  w4, w18                // a0*b6
    adds                    x21, x21, x20
    adc                     x23, x23, xzr
    adds                    x21, x21, x1
    adc                     x23, x23, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr

    str                     w21, [x0, #24]

    //// r[7] = a7*b0 + a6*b1 + ... + a1*b6 + a0*b7 (x22 counts wraps)
    umaddl                  x1, w11, w12, x22           // a7*b0 + wraps from r[5]
    umull                   x2, w10, w13                // a6*b1
    umull                   x3,  w9, w14                // a5*b2
    umull                   x20, w8, w15                // a4*b3

    adds                    x21, x1, x21, lsr #32
    adc                     x22, xzr, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr
    adds                    x21, x21, x3
    adc                     x22, x22, xzr
    adds                    x21, x21, x20
    adc                     x22, x22, xzr
    umull                   x1,  w7, w16                // a3*b4
    umull                   x2,  w6, w17                // a2*b5
    umull                   x3,  w5, w18                // a1*b6
    umull                   x20, w4, w19                // a0*b7
    adds                    x21, x21, x1
    adc                     x22, x22, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr
    adds                    x21, x21, x3
    adc                     x22, x22, xzr
    adds                    x21, x21, x20
    adc                     x22, x22, xzr

    str                     w21, [x0, #28]

    //// r[8] = a7*b1 + a6*b2 + a5*b3 + a4*b4 + a3*b5 + a2*b6 + a1*b7 (x23 counts wraps)
    umaddl                  x1, w11, w13, x23           // a7*b1 + wraps from r[6]
    umull                   x2, w10, w14                // a6*b2
    umull                   x3,  w9, w15                // a5*b3
    umull                   x20, w8, w16                // a4*b4

    adds                    x21, x1, x21, lsr #32
    adc                     x23, xzr, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr
    umull                   x1,  w7, w17                // a3*b5
    umull                   x2,  w6, w18                // a2*b6
    umull                   x3,  w5, w19                // a1*b7
    adds                    x21, x21, x20
    adc                     x23, x23, xzr
    adds                    x21, x21, x1
    adc                     x23, x23, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr

    str                     w21, [x0, #32]

    //// r[9] = a7*b2 + a6*b3 + a5*b4 + a4*b5 + a3*b6 + a2*b7 (x22 counts wraps)
    umaddl                  x1, w11, w14, x22           // a7*b2 + wraps from r[7]
    umull                   x2, w10, w15                // a6*b3
    umull                   x3,  w9, w16                // a5*b4
    umull                   x20, w8, w17                // a4*b5

    adds                    x21, x1, x21, lsr #32
    adc                     x22, xzr, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr
    umull                   x1,  w7, w18                // a3*b6
    umull                   x2,  w6, w19                // a2*b7
    adds                    x21, x21, x3
    adc                     x22, x22, xzr
    adds                    x21, x21, x20
    adc                     x22, x22, xzr
    adds                    x21, x21, x1
    adc                     x22, x22, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr

    str                     w21, [x0, #36]

    //// r[10] = a7*b3 + a6*b4 + a5*b5 + a4*b6 + a3*b7 (x23 counts wraps)
    umaddl                  x1, w11, w15, x23           // a7*b3 + wraps from r[8]
    umull                   x2, w10, w16                // a6*b4
    umull                   x3,  w9, w17                // a5*b5
    umull                   x20, w8, w18                // a4*b6

    adds                    x21, x1, x21, lsr #32
    adc                     x23, xzr, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    umull                   x1,  w7, w19                // a3*b7
    adds                    x21, x21, x3
    adc                     x23, x23, xzr
    adds                    x21, x21, x20
    adc                     x23, x23, xzr
    adds                    x21, x21, x1
    adc                     x23, x23, xzr

    str                     w21, [x0, #40]

    //// r[11] = a7*b4 + a6*b5 + a5*b6 + a4*b7 (x22 counts wraps)
    umaddl                  x1, w11, w16, x22           // a7*b4 + wraps from r[9]
    umull                   x2, w10, w17                // a6*b5
    umull                   x3,  w9, w18                // a5*b6
    umull                   x20, w8, w19                // a4*b7

    adds                    x21, x1, x21, lsr #32
    adc                     x22, xzr, xzr
    adds                    x21, x21, x2
    adc                     x22, x22, xzr
    adds                    x21, x21, x3
    adc                     x22, x22, xzr
    adds                    x21, x21, x20
    adc                     x22, x22, xzr

    str                     w21, [x0, #44]

    //// r[12] = a7*b5 + a6*b6 + a5*b7 (x23 counts wraps)
    umaddl                  x1, w11, w17, x23           // a7*b5 + wraps from r[10]
    umull                   x2, w10, w18                // a6*b6
    umull                   x3,  w9, w19                // a5*b7

    adds                    x21, x1, x21, lsr #32
    adc                     x23, xzr, xzr
    adds                    x21, x21, x2
    adc                     x23, x23, xzr
    adds                    x21, x21, x3
    adc                     x23, x23, xzr

    str                     w21, [x0, #48]

    //// r[13] = a7*b6 + a6*b7; r[14] = a7*b7; r[15] = final carry
    umaddl                  x1, w11, w18, x22           // a7*b6 + wraps from r[11]
    umull                   x2, w10, w19                // a6*b7
    umaddl                  x3, w11, w19, x23           // a7*b7 + wraps from r[12]

    adds                    x1, x1, x21, lsr #32
    adc                     x22, xzr, xzr
    adds                    x1, x1, x2
    adc                     x22, x22, xzr               // x22 = wraps from r[13], worth 1 unit in r[15]

    adds                    x2, x3, x1, lsr #32         // column 14 sum; no wrap count is taken here

    stp                     w1, w2, [x0, #52]           // r[13], r[14]

    add                     x1, x22, x2, lsr #32        // r[15] = high word of column 14 + wraps from r[13]
    str                     w1, [x0, #60]

    pop2                    x22, x23
    pop2                    x20, x21
    pop2                    x18, x19
    pop2                    x16, x17

    ret
.size nndetailCryptoBignum_mul_comba8,.-nndetailCryptoBignum_mul_comba8

/* void bn_sqr_comba8(BN_ULONG *r,const BN_ULONG *a)
 *
 * Column-wise (comba) squaring of an 8-word operand into a 16-word result,
 * on 32-bit words stored least-significant word first.
 *
 * x0: r  (16 words written, offsets #0..#60)
 * x1: a  (8 words read)
 *
 * All of a is loaded into registers before the first store to r.
 * Clobbers: x1-x14, NZCV. The stack is not used.
 *
 * Carry scheme:
 *   x14      - 64-bit running sum for the current column. Its low word is
 *              stored as r[k]; `x14, lsr #32` seeds the next column.
 *   x12/x13  - used alternately; each counts how many times x14 wrapped
 *              past 2^64 while summing one column. A wrap in column k is
 *              worth 1 unit in column k+2, so the count is folded into
 *              column k+2 (x12 for odd columns, x13 for even columns, the
 *              latter through the addend of the ai*ai umaddl).
 *   Each cross product ai*aj (i != j) is computed once and added twice, so
 *   that a wrap caused by the doubling is counted like any other.
 */
.section ".text.nndetailCryptoBignum_sqr_comba8"
.global nndetailCryptoBignum_sqr_comba8
nndetailCryptoBignum_sqr_comba8:

    ldp                     w4,   w5, [x1]              // w4  = a0, w5  = a1
    ldp                     w6,   w7, [x1, #8]          // w6  = a2, w7  = a3
    ldp                     w8,   w9, [x1, #16]         // w8  = a4, w9  = a5
    ldp                     w10,  w11, [x1, #24]        // w10 = a6, w11 = a7

    //// r[0] = low(a0*a0); r[1] = a0*a1 + a1*a0 (x12 counts wraps)
    umull                   x1,   w4,  w4               // a0*a0
    umull                   x2,   w4,  w5               // a0*a1
    umull                   x3,   w4,  w6               // a0*a2 (used by the next column)

    adds                    x14, x2, x1, lsr #32        // a0*a1 + high word of a0*a0
    adc                     x12, xzr, xzr
    adds                    x14, x2, x14                // a0*a1 + a1*a0
    adc                     x12, x12, xzr

    stp                     w1, w14, [x0]

    //// r[2] = a0*a2 + a1*a1 + a2*a0 (x13 counts wraps)
    umull                   x2,  w5,  w5                // a1*a1
    umull                   x1,  w4,  w7                // a0*a3 (used by the next column)

    adds                    x14, x3, x14, lsr #32
    adc                     x13, xzr, xzr
    adds                    x14, x3, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr

    str                     w14, [x0, #8]

    //// r[3] = a3*a0 + a2*a1 + a1*a2 + a0*a3 (x12 counts wraps)
    umull                   x2,   w5,  w6               // a1*a2
    umull                   x3,   w4,  w8               // a0*a4 (used by the next column)

    adds                    x14, x12, x14, lsr #32      // incoming carry + wraps from r[1]
    adc                     x12, xzr, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr

    str                     w14, [x0, #12]

    //// r[4] = a4*a0 + a3*a1 + a2*a2 + a1*a3 + a0*a4 (x13 counts wraps)
    umull                   x2, w5, w7                  // a1*a3
    umaddl                  x1, w6, w6, x13             // a2*a2 + wraps from r[2]

    adds                    x14, x3, x14, lsr #32
    adc                     x13, xzr, xzr
    adds                    x14, x3, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr

    str                     w14, [x0, #16]

    //// r[5] = a5*a0 + a4*a1 + a3*a2 + a2*a3 + a1*a4 + a0*a5 (x12 counts wraps)
    umull                   x1, w9, w4                  // a5*a0
    umull                   x2, w8, w5                  // a4*a1
    umull                   x3, w7, w6                  // a3*a2

    adds                    x14, x12, x14, lsr #32      // incoming carry + wraps from r[3]
    adc                     x12, xzr, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x3, x14
    adc                     x12, x12, xzr
    adds                    x14, x3, x14
    adc                     x12, x12, xzr

    str                     w14, [x0, #20]

    //// r[6] = a6*a0 + a5*a1 + a4*a2 + a3*a3 + a2*a4 + a1*a5 + a0*a6 (x13 counts wraps)
    umaddl                  x1, w7, w7, x13             // a3*a3 + wraps from r[4]
    umull                   x2, w9, w5                  // a5*a1
    umull                   x3, w8, w6                  // a4*a2

    adds                    x14, x1, x14, lsr #32
    adc                     x13, xzr, xzr
    umull                   x1, w10, w4                 // a6*a0
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x3, x14
    adc                     x13, x13, xzr
    adds                    x14, x3, x14
    adc                     x13, x13, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr

    str                     w14, [x0, #24]

    //// r[7] = a7*a0 + a6*a1 + a5*a2 + a4*a3 + a3*a4 + a2*a5 + a1*a6 + a0*a7 (x12 counts wraps)
    umull                   x1, w11, w4                 // a7*a0
    umull                   x2, w10, w5                 // a6*a1
    umull                   x3, w9, w6                  // a5*a2

    adds                    x14, x12, x14, lsr #32      // incoming carry + wraps from r[5]
    adc                     x12, xzr, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    umull                   x1, w7, w8                  // a3*a4
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x3, x14
    adc                     x12, x12, xzr
    adds                    x14, x3, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr

    str                     w14, [x0, #28]

    //// r[8] = a7*a1 + a6*a2 + a5*a3 + a4*a4 + a3*a5 + a2*a6 + a1*a7 (x13 counts wraps)
    umaddl                  x1, w8, w8, x13             // a4*a4 + wraps from r[6]
    umull                   x2, w10, w6                 // a6*a2
    umull                   x3, w9, w7                  // a5*a3


    adds                    x14, x1, x14, lsr #32
    adc                     x13, xzr, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    umull                   x1, w11, w5                 // a7*a1
    adds                    x14, x3, x14
    adc                     x13, x13, xzr
    adds                    x14, x3, x14
    adc                     x13, x13, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr

    str                     w14, [x0, #32]

    //// r[9] = a7*a2 + a6*a3 + a5*a4 + a4*a5 + a3*a6 + a2*a7 (x12 counts wraps)
    umull                   x1, w11, w6                 // a7*a2
    umull                   x2, w10, w7                 // a6*a3
    umull                   x3, w9, w8                  // a5*a4

    adds                    x14, x12, x14, lsr #32      // incoming carry + wraps from r[7]
    adc                     x12, xzr, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x3, x14
    adc                     x12, x12, xzr
    adds                    x14, x3, x14
    adc                     x12, x12, xzr

    str                     w14, [x0, #36]

    //// r[10] = a7*a3 + a6*a4 + a5*a5 + a4*a6 + a3*a7 (x13 counts wraps)
    umull                   x1, w11, w7                 // a7*a3
    umull                   x2, w10, w8                 // a6*a4
    umaddl                  x3,  w9, w9, x13            // a5*a5 + wraps from r[8]

    adds                    x14, x1, x14, lsr #32
    adc                     x13, xzr, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x2, x14
    adc                     x13, x13, xzr
    adds                    x14, x3, x14
    adc                     x13, x13, xzr

    str                     w14, [x0, #40]

    //// r[11] = a7*a4 + a6*a5 + a5*a6 + a4*a7 (x12 counts wraps)
    umull                   x1, w11, w8                 // a7*a4
    umull                   x2, w10, w9                 // a6*a5
    umaddl                  x3, w10, w10, x13           // a6*a6 + wraps from r[10] (used by the next column)

    adds                    x14, x12, x14, lsr #32      // incoming carry + wraps from r[9]
    adc                     x12, xzr, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x1, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr

    str                     w14, [x0, #44]

    //// r[12] = a7*a5 + a6*a6 + a5*a7 (x13 counts wraps)
    umull                   x1, w11, w9                 // a7*a5
    umull                   x2, w11, w10                // a7*a6 (used by the next column)

    adds                    x14, x3, x14, lsr #32       // x3 = a6*a6 term prepared in the previous column
    adc                     x13, xzr, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr
    adds                    x14, x1, x14
    adc                     x13, x13, xzr

    str                     w14, [x0, #48]

    //// r[13] = a7*a6 + a6*a7 (x12 counts wraps)
    umaddl                  x1, w11, w11, x13           // a7*a7 + wraps from r[12] (used by the last column)

    adds                    x14, x12, x14, lsr #32      // incoming carry + wraps from r[11]
    adc                     x12, xzr, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr
    adds                    x14, x2, x14
    adc                     x12, x12, xzr

    str                     w14, [x0, #52]

    //// r[14] = a7*a7; r[15] = final carry
    add                     x1, x1, x14, lsr #32        // column 14 sum; no wrap count is taken here
    add                     x2, x12, x1, lsr #32        // r[15] = high word of column 14 + wraps from r[13]

    stp                     w1, w2, [x0, #56]

    ret
.size nndetailCryptoBignum_sqr_comba8,.-nndetailCryptoBignum_sqr_comba8

