﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include <nn/TargetConfigs/build_Cpu.h>
#include "kern_AssemblyOffset.h"
#include "../kern_KPageTableDefinition.h"
#include "../../ARM64/kern_RegisterDefinition.h"

/*
 * Materialize a 64-bit constant in `reg` (an X register), one 16-bit slice
 * per instruction. The first instruction writes bits [15:0] and clears the
 * rest of the register; each movk then patches one higher slice in place.
 * The expansion is always four instructions long, whatever the value is.
 */
#define LOAD_IMMEDIATE_VALUE(reg, value)                    \
    movz    reg, #(((value) >>  0) & 0xffff), lsl #0;       \
    movk    reg, #(((value) >> 16) & 0xffff), lsl #16;      \
    movk    reg, #(((value) >> 32) & 0xffff), lsl #32;      \
    movk    reg, #(((value) >> 48) & 0xffff), lsl #48

/*
 * Two-instruction variant of LOAD_IMMEDIATE_VALUE for 32-bit constants:
 * writes bits [15:0] (clearing the rest), then patches bits [31:16].
 * The expansion deliberately ends with a statement separator.
 */
#define LOAD_IMMEDIATE_VALUE_32(reg, value)                 \
    movz    reg, #(((value) >>  0) & 0xffff), lsl #0;       \
    movk    reg, #(((value) >> 16) & 0xffff), lsl #16;

/*
 * Load the value stored at `label` into `reg`. `reg` first receives the
 * PC-relative address of `label` (adr) and is then overwritten with the
 * value read from that address, so no extra scratch register is needed.
 */
#define LOAD_VALUE_FROM_LABEL(reg, label)                   \
    adr     reg, label;                                     \
    ldr     reg, [reg]

/*
 * Descriptor attribute templates OR-ed with an output address while the
 * initial translation tables are built. The trailing literal is the
 * descriptor type in bits [1:0]: 3 for table and page descriptors, 1 for
 * block descriptors.
 * NOTE(review): the "R_X" / "RWX" templates still include HW_MMU_XN and only
 * leave out HW_MMU_PXN; presumably HW_MMU_XN is the unprivileged
 * execute-never bit -- confirm against the HW_MMU_* definitions.
 */
#define PAGE_TABLE_ATTR \
        (HW_MMU_APTABLE_U_NA | HW_MMU_UXNTABLE | 3)
#define PAGE_R_X_ATTR \
        (HW_MMU_XN |              HW_MMU_AF | HW_MMU_SH_ISHARED | HW_MMU_AP_S_RO_U_NA | HW_MMU_ATTR_NORMAL | 3)
#define PAGE_R___ATTR \
        (HW_MMU_XN | HW_MMU_PXN | HW_MMU_AF | HW_MMU_SH_ISHARED | HW_MMU_AP_S_RO_U_NA | HW_MMU_ATTR_NORMAL | 3)
#define PAGE_RW__ATTR \
        (HW_MMU_XN | HW_MMU_PXN | HW_MMU_AF | HW_MMU_SH_ISHARED | HW_MMU_AP_S_RW_U_NA | HW_MMU_ATTR_NORMAL | 3)
#define PAGE_RWX_ATTR \
        (HW_MMU_XN |              HW_MMU_AF | HW_MMU_SH_ISHARED | HW_MMU_AP_S_RW_U_NA | HW_MMU_ATTR_NORMAL | 3)
#define BLOCK_RWX_ATTR \
        (HW_MMU_XN |              HW_MMU_AF | HW_MMU_SH_ISHARED | HW_MMU_AP_S_RW_U_NA | HW_MMU_ATTR_NORMAL | 1)

/*
 * Value written to mair_el1 by _start. Each memory-attribute encoding is
 * placed in the byte slot selected by its attribute index (index * 8).
 */
#define MAIR_VALUE ( \
        (HW_MAIR_ATTR_DEVICE_NGNRNE    << (HW_MMU_ATTRINDEX_DEVICE_NGNRNE * 8)) |   \
        (HW_MAIR_ATTR_DEVICE_NGNRE     << (HW_MMU_ATTRINDEX_DEVICE_NGNRE  * 8)) |   \
        (HW_MAIR_ATTR_NORMAL_ICRW_OCRW << (HW_MMU_ATTRINDEX_NORMAL        * 8)) |   \
        (HW_MAIR_ATTR_NORMAL_INC_ONC   << (HW_MMU_ATTRINDEX_NORMAL_NC     * 8)) |   \
        0)

/*
 * Value written to tcr_el1 by _start. The first group of fields configures
 * the TTBR1 side (TG1/SH1/ORGN1/IRGN1/T1SZ), the second group the TTBR0 side
 * (TG0/SH0/ORGN0/IRGN0/T0SZ).
 * NOTE(review): T0SZ is derived from NN_KERN_T1_SPACE_SHIFT as well, i.e.
 * both halves get the same size; presumably intentional -- confirm.
 */
#define TCR_VALUE (0 \
        | HW_TCR_AS_16BIT                           \
        | HW_TCR_IPS_64G                            \
        | HW_TCR_TG1_4K                             \
        | HW_TCR_SH1_INNER_SHARE                    \
        | HW_TCR_ORGN1_WBWA_CACHE                   \
        | HW_TCR_IRGN1_WBWA_CACHE                   \
        | HW_TCR_A1_TTBR0                           \
        | HW_TCR_T1SZ(64 - NN_KERN_T1_SPACE_SHIFT)  \
        | HW_TCR_TG0_4K                             \
        | HW_TCR_SH0_INNER_SHARE                    \
        | HW_TCR_ORGN0_WBWA_CACHE                   \
        | HW_TCR_IRGN0_WBWA_CACHE                   \
        | HW_TCR_T0SZ(64 - NN_KERN_T1_SPACE_SHIFT)  \
        )

/*
 * Per-core-type values for the implementation-defined CPUECTLR and CPUACTLR
 * registers; _start writes them to s3_1_c15_c2_1 and s3_1_c15_c2_0
 * respectively. Exactly one CPU configuration must be selected at build
 * time, otherwise the build fails.
 */
#if defined(NN_BUILD_CONFIG_CPU_CORTEX_A53_AARCH64)
// Cortex-A53 settings
#define CPUECTLR_VALUE (0                                   \
        | HW_CPUECTLR_SMPEN                                 \
        | HW_CPUECTLR_FP_RETENTION_0                        \
        | HW_CPUECTLR_CPU_RETENTION_0                       \
        )
#define CPUACTLR_VALUE (0       \
        | HW_CPUACTLR_RADIS_128 \
        | HW_CPUACTLR_L1RADIS_4 \
        | HW_CPUACTLR_NPFSTRM_2 \
        | HW_CPUACTLR_DSTDIS    \
        | HW_CPUACTLR_STRIDE_2  \
        | HW_CPUACTLR_L1PCTL_5  \
        )
#elif defined(NN_BUILD_CONFIG_CPU_CORTEX_A57_AARCH64)
// Cortex-A57 settings
#define CPUECTLR_VALUE (0                                   \
        | HW_CPUECTLR_SMPEN                                 \
        | HW_CPUECTLR_CPU_RETENTION_0                       \
        | HW_CPUECTLR_L2_DATA_PREFETCH_DISTANCE_8           \
        | HW_CPUECTLR_L2_INST_PREFETCH_DISTANCE_3           \
        )

#define CPUACTLR_VALUE (0                                   \
        | HW_CPUACTLR_RADIS_12                              \
        | HW_CPUACTLR_L1RADIS_4                             \
        | HW_CPUACTLR_NON_CACHEABLE_STREAMING_ENHANCEMENT   \
        )
#else
#error not defined NN_BUILD_CONFIG_CPU
#endif

/*
 * Value written to sctlr_el1 by _start to turn the MMU on. Besides the MMU
 * enable (M) it sets the data cache (C) and instruction cache (I) enables
 * together with the other control bits listed below.
 */
#define SCTLR_VALUE (0          \
        | HW_SCTLR_UCI          \
        | HW_SCTLR_EE_LITTLE    \
        | HW_SCTLR_E0E_LITTLE   \
        | HW_SCTLR_NTWE         \
        | HW_SCTLR_NTWI         \
        | HW_SCTLR_UCT          \
        | HW_SCTLR_DZE          \
        | HW_SCTLR_I            \
        | HW_SCTLR_SED          \
        | HW_SCTLR_CP15BEN      \
        | HW_SCTLR_C            \
        | HW_SCTLR_M            \
        | HW_SCTLR_RES1         \
        )

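/*
 * 1. Copy the kernel code to its physical location and jump to it
 * 2. Invalidate the caches and the TLB
 * 3. Set up the L1 page table and the MMU
 * 4. Clear bss
 * 5. Execute the kernel code
 * 6. Switch stacks and run again
 */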
    // -----------------------------------------------------------------------
    // _start: boot entry point.
    //   - Masks D/A/I/F, then dispatches on CurrentEL: the value for EL1
    //     continues at 3:, the value for EL2 first calls .Lfrom_el2_to_el1
    //     (defined outside this view), any other value spins forever.
    //   - Cores whose MPIDR_EL1 Aff0 field is non-zero leave through
    //     .LSlaveCoreEntry; the remaining core runs the boot sequence.
    //   - If the image is not executing at NN_KERN_P_ADDR_CODE, the kernel
    //     and the initial process image are copied into place and execution
    //     restarts from the copy.
    // -----------------------------------------------------------------------
    .section ".start","xa"
    .align  2
    .globl  _start
    .type   _start, @function
_start:
    msr     daifset, #0xF

    mrs     x1, CurrentEL
    cmp     x1, #0x4
    b.eq    3f

    cmp     x1, #0x8
    b.eq    2f

    // EL3
1:  b       1b

    // EL2
2:  bl      .Lfrom_el2_to_el1

    // EL1
3:
    mrs     x0, mpidr_el1
    and     x0, x0, #(HW_MPIDR_EL1_AFF0_MASK)
    cbnz    x0, .LSlaveCoreEntry

    // Already running at the intended physical address? Then skip the copy.
    adr     x1, _start
    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)

    cmp     x0, x1
    b.eq    .L_start_nocopy

    // copy kernel
    //   x0 = dst, x1 = src (current location), x2 = dst end
    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)
    adr     x1, _start
    LOAD_IMMEDIATE_VALUE(x2, NN_KERN_P_ADDR_CODE_END)
    bl      .LCodeCopy

    // copy initial process
    //   x0 = dst = NN_KERN_P_ADDR_CODE + [.L__slab_pt_end]
    //              - NN_KERN_P_ADDR_INITIAL_PROCESS_SIZE
    //   x1 = src = current _start + NN_KERN_RAMDISK_OFFSET
    //   x5 = NN_KERN_P_ADDR_INITIAL_PROCESS_SIZE
    LOAD_VALUE_FROM_LABEL(x0, .L__slab_pt_end)
    LOAD_IMMEDIATE_VALUE(x1, NN_KERN_P_ADDR_CODE)
    add     x0, x0, x1
    LOAD_IMMEDIATE_VALUE(x5, NN_KERN_P_ADDR_INITIAL_PROCESS_SIZE)
    sub     x0, x0, x5
    LOAD_IMMEDIATE_VALUE(x2, NN_KERN_RAMDISK_OFFSET)
    adr     x1, _start
    add     x1, x2, x1

    // x2 = total length and w3 = signature, both read from the image header
    ldr     w2, [x1, #INITIAL_PROCESS_IMAGE_HEADER_TOTALLENGTH]
    ldr     w3, [x1, #INITIAL_PROCESS_IMAGE_HEADER_SIGNATURE]
    LOAD_IMMEDIATE_VALUE(x4, INITIAL_PROCESS_IMAGE_HEADER_VALIDSIGNATURE)
    cmp     w3, w4
    b.ne    1f
    cmp     x5, x2
    b.cs    2f          // total length fits: copy the whole image

    // If the signature is wrong (or the total length exceeds x5), copy only
    // the header; a later check is expected to PANIC on it.
1:  mov     x2, #INITIAL_PROCESS_IMAGE_HEADER_SIZE
2:  add     x2, x0, x2  // x2 = dst end
    bl      .LCodeCopy

    // Make the copied code visible to instruction fetch before jumping to it
    dsb     sy
    ic      ialluis
    isb

    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)
    br      x0
2:  b       2b

.L_start_nocopy:
    // Check the running CPU against the CPU type this image was built for.
    // Only ARM-implemented Cortex-A57 / Cortex-A53 parts are examined; any
    // other implementer or part number goes straight to .LCheckCpuEnd.
    // Running on an A57 (A53) without the matching NN_BUILD_CONFIG_CPU_*
    // macro defined spins forever.
    mrs     x1, midr_el1
    ubfx    x2, x1, #24, #8 // Implementer
    cmp     x2, #0x41       // is ARM?
    b.ne    .LCheckCpuEnd
    ubfx    x2, x1, #4, #12 // PartNum
    LOAD_IMMEDIATE_VALUE(x3, 0xD07)
    cmp     x2, x3          // is Cortex-A57 ?
    b.eq    .LCheckCpuCortexA57
    LOAD_IMMEDIATE_VALUE(x3, 0xD03)
    cmp     x2, x3          // is Cortex-A53 ?
    b.eq    .LCheckCpuCortexA53
    b       .LCheckCpuEnd

.LCheckCpuCortexA57:
#ifndef NN_BUILD_CONFIG_CPU_CORTEX_A57
1:  b       1b              // built for a different core: halt here
#endif
    b       .LCheckCpuEnd
.LCheckCpuCortexA53:
#ifndef NN_BUILD_CONFIG_CPU_CORTEX_A53
1:  b       1b              // built for a different core: halt here
#endif
    b       .LCheckCpuEnd
.LCheckCpuEnd:

    // Clear the caches and the TLB.
    // The local invalidation routine is run both before and after the shared
    // one, with a full-system barrier after each step; then all EL1 TLB
    // entries are invalidated (inner-shareable broadcast).
    bl      .LInvalidateCacheLocal
    dsb     sy
    bl      .LInvalidateCacheShare
    dsb     sy
    bl      .LInvalidateCacheLocal
    dsb     sy
    tlbi    vmalle1is
    dsb     sy
    isb

    // Check that the physical address range where DRAM is mapped is
    // accessible (Develop/Debug builds on non-NX hardware only).
    //
    // One 64-bit word is probed every 4 KB in
    //   [NN_KERN_P_ADDR_MAIN_MEMORY + NN_KERN_P_ADDR_RESERVED_LO_SIZE,
    //    NN_KERN_P_ADDR_MAIN_MEMORY_END - NN_KERN_P_ADDR_RESERVED_HI_SIZE).
    //   - A word that reads as all-zeros or all-ones is tested by writing its
    //     bitwise complement, reading it back, then restoring and re-reading
    //     the original value. Any mismatch hangs at label 5.
    //   - Any other word is simply written back unchanged.
    //
    // x0: probe address, x1: end address, x2: complemented value,
    // x3: original value, x4: value last loaded.
.LCheckMemoryAccess:
#if !defined(NN_BUILD_CONFIG_HARDWARE_NX)
#if defined NN_SDK_BUILD_DEVELOP || defined NN_SDK_BUILD_DEBUG
    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_MAIN_MEMORY + NN_KERN_P_ADDR_RESERVED_LO_SIZE)
    LOAD_IMMEDIATE_VALUE(x1, NN_KERN_P_ADDR_MAIN_MEMORY_END - NN_KERN_P_ADDR_RESERVED_HI_SIZE)
1:
    // Load from the address under test
    ldr     x4, [x0]
    cmp     x4, #0x0000000000000000
    b.eq    2f
    cmp     x4, #0xffffffffffffffff
    b.eq    2f
    b       3f
2:
    // Loaded 0x0000... or 0xffff...:
    // mvn (bitwise NOT) is required here. neg would map 0 to 0, so the
    // write/read-back test below could never fail for an all-zero word.
    mvn     x2, x4
    mov     x3, x4
    //  1. Store the bit-inverted value
    str     x2, [x0]
    //  2. Load it back and check that it matches
    ldr     x4, [x0]
    cmp     x2, x4
    b.ne    5f
    //  3. Store the original value
    str     x3, [x0]
    //  4. Load it back and check that it matches
    ldr     x4, [x0]
    cmp     x3, x4
    b.ne    5f
    b       4f
3:
    // Loaded any other value:
    //  1. Write the same value back
    str     x4, [x0]
4:
    add     x0, x0, #0x1000 // advance the address 4 KB at a time
    cmp     x0, x1
    b.lo    1b              // addresses are unsigned

    b       .LCheckMemoryAccessEnd
5:
    b       5b // spin forever when the check fails
#endif // #if defined NN_SDK_BUILD_DEVELOP || defined NN_SDK_BUILD_DEBUG
#endif // #if !defined(NN_BUILD_CONFIG_HARDWARE_NX)
.LCheckMemoryAccessEnd:

    // Memory after NN_KERN_P_ADDR_PT_HEAP_END is used dynamically.
    // (The code takes the start of that area from [.L__slab_pt_end] + _start,
    //  rounded up to a page boundary.)
    LOAD_VALUE_FROM_LABEL(x19, .L__slab_pt_end)
    adr     x0, _start
    add     x19, x19, x0

    add     x19, x19, #(HW_MMU_PAGE_SIZE - 1)
    and     x19, x19, #(0 - HW_MMU_PAGE_SIZE)
                            // x19: start of the free area; moved forward by .LAllocPage.

    // Compute the ASLR offset
    // (random  % (NN_KERN_V_ADDR_KERNEL_SIZE / NN_KERN_ASLR_ALIGN)) * NN_KERN_ASLR_ALIGN
    // The range actually used is NN_KERN_V_ADDR_KERNEL_SIZE minus
    // ([.L__slab_pt_end] + 2 L2 blocks), in units of 1 << NN_KERN_ASLR_SHIFT.
    // NOTE(review): if that unit count (x1) were 0, udiv would return 0 and
    // x0 would stay unreduced; presumably the configuration guarantees a
    // non-zero count -- confirm.
    bl      .LGetRandom
    LOAD_IMMEDIATE_VALUE(x1, NN_KERN_V_ADDR_KERNEL_SIZE)
    LOAD_VALUE_FROM_LABEL(x2, .L__slab_pt_end)
    add     x2, x2, #(HW_MMU_L2_BLOCK_SIZE * 2)     // buffer area for allocations
    sub     x1, x1, x2
    lsr     x1, x1, #(NN_KERN_ASLR_SHIFT)
    udiv    x2, x0, x1
    msub    x0, x2, x1, x0  // x0 = x0 % x1
    lsl     x0, x0, #(NN_KERN_ASLR_SHIFT)

    // Keep the load address's offset within an L2 block equal to that of
    // NN_KERN_P_ADDR_CODE.
    LOAD_IMMEDIATE_VALUE(x1, (NN_KERN_P_ADDR_CODE & (HW_MMU_L2_BLOCK_SIZE - 1)))
    add     x0, x0, x1
    LOAD_IMMEDIATE_VALUE(x23, NN_KERN_V_ADDR_KERNEL)
    add     x23, x23, x0    // x23: kernel load address

    mov     x0, #HW_MMU_PAGE_SIZE
    bl      .LAllocPage
    mov     x20, x0         // x20: L1 page physical address


    // Compute the L2 page table size:
    // one page per L1-block-sized region touched by [x23, x23 + [.L__bss_end])
    LOAD_VALUE_FROM_LABEL(x2, .L__bss_end)
    mov     x1, x23
    add     x2, x2, x1
    LOAD_IMMEDIATE_VALUE(x3, (HW_MMU_L1_BLOCK_SIZE - 1))
    add     x2, x2, x3
    lsr     x2, x2, #HW_MMU_L1_BLOCK_SHIFT
    lsr     x1, x1, #HW_MMU_L1_BLOCK_SHIFT
    sub     x0, x2, x1
    lsl     x0, x0, #HW_MMU_L1_BLOCK_SHIFT
    lsr     x0, x0, #HW_MMU_L1_BLOCK_SHIFT
    lsl     x0, x0, #HW_MMU_PAGE_SHIFT
    bl      .LAllocPage
    mov     x21, x0         // x21: L2 page physical address


    // Compute the L3 page table size:
    // one page per L2-block-sized region touched by [x23, x23 + [.L__bss_end])
    LOAD_VALUE_FROM_LABEL(x2, .L__bss_end)
    mov     x1, x23
    add     x2, x2, x1
    LOAD_IMMEDIATE_VALUE(x3, (HW_MMU_L2_BLOCK_SIZE - 1))
    add     x2, x2, x3
    lsr     x2, x2, #HW_MMU_L2_BLOCK_SHIFT
    lsr     x1, x1, #HW_MMU_L2_BLOCK_SHIFT
    sub     x0, x2, x1
    lsl     x0, x0, #HW_MMU_L2_BLOCK_SHIFT
    lsr     x0, x0, #HW_MMU_L2_BLOCK_SHIFT
    lsl     x0, x0, #HW_MMU_PAGE_SHIFT
    bl      .LAllocPage
    mov     x22, x0         // x22: L3 page physical address


    // -----------------------------------------------------------------------
    // Set up the L3 page table (ttbr1)
    // -----------------------------------------------------------------------
    // Three consecutive physical ranges are mapped page by page, each with
    // its own attribute template:
    //   EX: [.L__ex_start, .L__ro_start)  PAGE_R_X_ATTR
    //   RO: [.L__ro_start, .L__rw_start)  PAGE_R___ATTR
    //   RW: [.L__rw_start, .L__bss_end)   PAGE_RW__ATTR
    // (label values are offsets added to the current _start address).
    // Within each range, pages lying inside the part that is aligned to
    // 16 pages at both ends additionally get descriptor bit 52 set.
    //
    // x0: running L3 entry index (starts at the L3 index of x23)
    // x1: attribute template     x2: current physical page
    // x3: end of current range   x5: descriptor being built
    // x6/x7: (16 pages - 1) and its complement, used for rounding
    // x8/x9: range start rounded up / range end rounded down to 16 pages
    ubfx    x0, x23, #(HW_MMU_PAGE_SHIFT), #HW_MMU_NUM_PTE_SHIFT

    LOAD_IMMEDIATE_VALUE(x6, HW_MMU_PAGE_SIZE * 16 - 1)
    LOAD_IMMEDIATE_VALUE(x7, ~(HW_MMU_PAGE_SIZE * 16 - 1))

    // EX
    LOAD_IMMEDIATE_VALUE(x1, PAGE_R_X_ATTR)
    LOAD_VALUE_FROM_LABEL(x2, .L__ex_start)
    LOAD_VALUE_FROM_LABEL(x3, .L__ro_start)
    adr     x4, _start
    add     x2, x2, x4
    add     x3, x3, x4

    add     x8, x2, x6
    and     x8, x8, x7  // x8: RoundUp(x2, HW_MMU_PAGE_SIZE * 16)
    and     x9, x3, x7  // x9: RoundDown(x3, HW_MMU_PAGE_SIZE * 16)
1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x2

    cmp     x8, x2
    cset    w10, ls     // w10 = (x8 <= x2)
    cmp     x2, x9
    cset    w11, cc     // w11 = (x2 < x9)
    and     w10, w11, w10
    lsl     x10, x10, #52
    orr     x5, x5, x10
    str     x5, [x22, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, #(HW_MMU_PAGE_SIZE)
    b       1b
2:

    // RO (x2 continues from the end of the EX range)
    LOAD_IMMEDIATE_VALUE(x1, PAGE_R___ATTR)
    LOAD_VALUE_FROM_LABEL(x3, .L__rw_start)
    adr     x4, _start
    add     x3, x3, x4

    add     x8, x2, x6
    and     x8, x8, x7  // x8: RoundUp(x2, HW_MMU_PAGE_SIZE * 16)
    and     x9, x3, x7  // x9: RoundDown(x3, HW_MMU_PAGE_SIZE * 16)
1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x2

    cmp     x8, x2
    cset    w10, ls     // w10 = (x8 <= x2)
    cmp     x2, x9
    cset    w11, cc     // w11 = (x2 < x9)
    and     w10, w11, w10
    lsl     x10, x10, #52
    orr     x5, x5, x10
    str     x5, [x22, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, #(HW_MMU_PAGE_SIZE)
    b       1b
2:

    // RW (x2 continues from the end of the RO range)
    LOAD_IMMEDIATE_VALUE(x1, PAGE_RW__ATTR)
    LOAD_VALUE_FROM_LABEL(x3, .L__bss_end)
    adr     x4, _start
    add     x3, x3, x4

    add     x8, x2, x6
    and     x8, x8, x7  // x8: RoundUp(x2, HW_MMU_PAGE_SIZE * 16)
    and     x9, x3, x7  // x9: RoundDown(x3, HW_MMU_PAGE_SIZE * 16)
1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x2

    cmp     x8, x2
    cset    w10, ls     // w10 = (x8 <= x2)
    cmp     x2, x9
    cset    w11, cc     // w11 = (x2 < x9)
    and     w10, w11, w10
    lsl     x10, x10, #52
    orr     x5, x5, x10
    str     x5, [x22, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, #(HW_MMU_PAGE_SIZE)
    b       1b
2:

    // -----------------------------------------------------------------------
    // Set up the L2 page table (ttbr1)
    // -----------------------------------------------------------------------
    // ttbr1
    // One table descriptor per L2 block covering
    // [RoundDown(x23), RoundUp(x23 + [.L__bss_end])); entry n points at the
    // n-th page of the L3 table area (x22).
    //   x0: L2 entry index   x2/x3: current/end virtual address
    //   x6: physical address of the L3 page to link
    ubfx    x0, x23, #HW_MMU_L2_BLOCK_SHIFT, #HW_MMU_NUM_PTE_SHIFT
    LOAD_IMMEDIATE_VALUE(x1, PAGE_TABLE_ATTR)
    LOAD_VALUE_FROM_LABEL(x3, .L__bss_end)
    mov     x2, x23
    add     x3, x3, x2
    LOAD_IMMEDIATE_VALUE(x4, (HW_MMU_L2_BLOCK_SIZE - 1))
    add     x3, x3, x4
    lsr     x2, x2, #HW_MMU_L2_BLOCK_SHIFT
    lsl     x2, x2, #HW_MMU_L2_BLOCK_SHIFT
    lsr     x3, x3, #HW_MMU_L2_BLOCK_SHIFT
    lsl     x3, x3, #HW_MMU_L2_BLOCK_SHIFT
    mov     x6, x22

1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x6
    str     x5, [x21, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, #(HW_MMU_L2_BLOCK_SIZE)
    add     x6, x6, #(HW_MMU_PAGE_SIZE)
    b       1b
2:


    // -----------------------------------------------------------------------
    // Set up the L1 page table (ttbr1)
    // -----------------------------------------------------------------------
    // Same scheme one level up: one table descriptor per L1 block, entry n
    // pointing at the n-th page of the L2 table area (x21).
    ubfx    x0, x23, #HW_MMU_L1_BLOCK_SHIFT, #HW_MMU_NUM_PTE_SHIFT
    LOAD_IMMEDIATE_VALUE(x1, PAGE_TABLE_ATTR)
    LOAD_VALUE_FROM_LABEL(x3, .L__bss_end)
    mov     x2, x23
    add     x3, x3, x2
    LOAD_IMMEDIATE_VALUE(x4, (HW_MMU_L1_BLOCK_SIZE - 1))
    add     x3, x3, x4
    lsr     x2, x2, #HW_MMU_L1_BLOCK_SHIFT
    lsl     x2, x2, #HW_MMU_L1_BLOCK_SHIFT
    lsr     x3, x3, #HW_MMU_L1_BLOCK_SHIFT
    lsl     x3, x3, #HW_MMU_L1_BLOCK_SHIFT
    mov     x6, x21
    mov     x7, #(HW_MMU_L1_BLOCK_SIZE)

1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x6
    str     x5, [x20, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, x7
    add     x6, x6, #(HW_MMU_PAGE_SIZE)
    b       1b
2:


    // Build the ttbr0 tables: a mapping of the physical kernel image (plus
    // the allocation buffer behind it) at addresses equal to the physical
    // ones, used until execution moves to the kernel's virtual address.
    mov     x0, #HW_MMU_PAGE_SIZE
    bl      .LAllocPage
    mov     x24, x0         // x24: L1 page physical address (ttbr0)

    // Compute the L2 page table size:
    // one page per L1-block-sized region touched by
    // [_start, _start + [.L__slab_pt_end] + 2 L2 blocks)
    LOAD_VALUE_FROM_LABEL(x2, .L__slab_pt_end)
    adr     x1, _start
    add     x2, x2, x1
    add     x2, x2, #(HW_MMU_L2_BLOCK_SIZE * 2)     // buffer area for allocations
    LOAD_IMMEDIATE_VALUE(x3, (HW_MMU_L1_BLOCK_SIZE - 1))
    add     x2, x2, x3
    lsr     x2, x2, #HW_MMU_L1_BLOCK_SHIFT
    lsr     x1, x1, #HW_MMU_L1_BLOCK_SHIFT
    sub     x0, x2, x1
    lsl     x0, x0, #HW_MMU_L1_BLOCK_SHIFT
    lsr     x0, x0, #HW_MMU_L1_BLOCK_SHIFT
    lsl     x0, x0, #HW_MMU_PAGE_SHIFT
    bl      .LAllocPage
    mov     x25, x0         // x25: L2 page physical address (ttbr0)

#if (NN_KERN_P_ADDR_CODE & (HW_MMU_L2_BLOCK_SIZE - 1))
    // NN_KERN_P_ADDR_CODE is not L2-block aligned: the first, partial L2
    // block is mapped with 4 KB pages through an extra L3 table.
    mov     x0, #HW_MMU_PAGE_SIZE
    bl      .LAllocPage
    mov     x26, x0         // x26: L3 page physical address (ttbr0)
    // -----------------------------------------------------------------------
    // Set up the L3 page table (ttbr0)
    // -----------------------------------------------------------------------
    // ttbr0
    // Pages from NN_KERN_P_ADDR_CODE up to the next L2 block boundary.

    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)

    ubfx    x0, x0, #HW_MMU_PAGE_SHIFT, #HW_MMU_NUM_PTE_SHIFT

    LOAD_IMMEDIATE_VALUE(x1, PAGE_RWX_ATTR)
    LOAD_IMMEDIATE_VALUE(x2, NN_KERN_P_ADDR_CODE)
    LOAD_IMMEDIATE_VALUE(x3, NN_KERN_P_ADDR_CODE)
    LOAD_IMMEDIATE_VALUE(x4, (HW_MMU_L2_BLOCK_SIZE - 1))
    add     x3, x3, x4
    lsr     x3, x3, #HW_MMU_L2_BLOCK_SHIFT
    lsl     x3, x3, #HW_MMU_L2_BLOCK_SHIFT
1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x2
    str     x5, [x26, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, #(HW_MMU_PAGE_SIZE)
    b       1b
2:

    // -----------------------------------------------------------------------
    // Set up the L2 page table (ttbr0)
    // -----------------------------------------------------------------------
    // ttbr0
    // First L2 entry: table descriptor pointing at the L3 table above.
    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)
    ubfx    x0, x0, #HW_MMU_L2_BLOCK_SHIFT, #HW_MMU_NUM_PTE_SHIFT
    LOAD_IMMEDIATE_VALUE(x1, PAGE_TABLE_ATTR)
    orr     x5, x1, x26
    str     x5, [x25, x0, lsl #3]
#else
    // -----------------------------------------------------------------------
    // Set up the L2 page table (ttbr0)
    // -----------------------------------------------------------------------
    // ttbr0
    // First L2 entry: block descriptor for NN_KERN_P_ADDR_CODE itself.
    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)
    ubfx    x0, x0, #HW_MMU_L2_BLOCK_SHIFT, #HW_MMU_NUM_PTE_SHIFT
    LOAD_IMMEDIATE_VALUE(x2, NN_KERN_P_ADDR_CODE)
    LOAD_IMMEDIATE_VALUE(x1, BLOCK_RWX_ATTR)
    orr     x5, x1, x2
    str     x5, [x25, x0, lsl #3]
#endif

    // Remaining L2 entries: block descriptors from the L2 block following
    // NN_KERN_P_ADDR_CODE up to
    // RoundDown(_start + [.L__slab_pt_end] + 2 L2 blocks).
    add     x0, x0, #1

    LOAD_IMMEDIATE_VALUE(x1, BLOCK_RWX_ATTR)
    LOAD_IMMEDIATE_VALUE(x2, NN_KERN_P_ADDR_CODE + HW_MMU_L2_BLOCK_SIZE)
    lsr     x2, x2, #HW_MMU_L2_BLOCK_SHIFT
    lsl     x2, x2, #HW_MMU_L2_BLOCK_SHIFT
    LOAD_VALUE_FROM_LABEL(x3, .L__slab_pt_end)
    adr     x4, _start
    add     x3, x3, x4
    add     x3, x3, #(HW_MMU_L2_BLOCK_SIZE * 2)     // buffer area for allocations
    lsr     x3, x3, #HW_MMU_L2_BLOCK_SHIFT
    lsl     x3, x3, #HW_MMU_L2_BLOCK_SHIFT
    mov     x7, #(HW_MMU_L2_BLOCK_SIZE)
1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x2
    str     x5, [x25, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, x7
    b       1b
2:


    // -----------------------------------------------------------------------
    // Set up the L1 page table (ttbr0)
    // -----------------------------------------------------------------------
    // One table descriptor per L1 block in
    // [RoundDown(NN_KERN_P_ADDR_CODE),
    //  RoundUp(_start + [.L__slab_pt_end] + 2 L2 blocks));
    // entry n points at the n-th page of the L2 table area (x25).
    LOAD_IMMEDIATE_VALUE(x0, NN_KERN_P_ADDR_CODE)
    ubfx    x0, x0, #HW_MMU_L1_BLOCK_SHIFT, #HW_MMU_NUM_PTE_SHIFT
    LOAD_IMMEDIATE_VALUE(x1, PAGE_TABLE_ATTR)
    LOAD_IMMEDIATE_VALUE(x2, NN_KERN_P_ADDR_CODE)
    lsr     x2, x2, #HW_MMU_L1_BLOCK_SHIFT
    lsl     x2, x2, #HW_MMU_L1_BLOCK_SHIFT
    LOAD_VALUE_FROM_LABEL(x3, .L__slab_pt_end)
    adr     x4, _start
    add     x3, x3, x4
    add     x3, x3, #(HW_MMU_L2_BLOCK_SIZE * 2)     // buffer area for allocations
    LOAD_IMMEDIATE_VALUE(x4, (HW_MMU_L1_BLOCK_SIZE - 1))
    add     x3, x3, x4
    lsr     x3, x3, #HW_MMU_L1_BLOCK_SHIFT
    lsl     x3, x3, #HW_MMU_L1_BLOCK_SHIFT
    mov     x6, x25
    mov     x7, #(HW_MMU_L1_BLOCK_SIZE)
1:  cmp     x2, x3
    b.eq    2f
    orr     x5, x1, x6
    str     x5, [x24, x0, lsl #3]
    add     x0, x0, #1
    add     x2, x2, x7
    add     x6, x6, #(HW_MMU_PAGE_SIZE)
    b       1b
2:


    // -----------------------------------------------------------------------
    // Set up the MMU registers
    // -----------------------------------------------------------------------
    // ttbr0 <- x24 (L1 table for the ttbr0 mapping),
    // ttbr1 <- x20 (L1 table for the kernel's virtual mapping)
    msr     ttbr0_el1, x24
    msr     ttbr1_el1, x20
    LOAD_IMMEDIATE_VALUE(x0, TCR_VALUE)
    msr     tcr_el1,  x0
    LOAD_IMMEDIATE_VALUE(x0, MAIR_VALUE)
    msr     mair_el1,  x0

    // ImplementationDefined
    // CPUACTLR/CPUECTLR are written only on ARM-implemented Cortex-A57 and
    // Cortex-A53 parts; every other CPU skips this section.
    mrs     x1, midr_el1
    ubfx    x2, x1, #24, #8 // Implementer
    cmp     x2, #0x41       // is ARM?
    b.ne    .LImplementationDefinedEnd
    ubfx    x2, x1, #4, #12 // PartNum
    LOAD_IMMEDIATE_VALUE(x3, 0xD07)
    cmp     x2, x3          // is Cortex-A57 ?
    b.eq    .LImplementationDefinedCortexA57
    LOAD_IMMEDIATE_VALUE(x3, 0xD03)
    cmp     x2, x3          // is Cortex-A53 ?
    b.eq    .LImplementationDefinedCortexA53
    b       .LImplementationDefinedEnd

.LImplementationDefinedCortexA57:
#ifdef NN_BUILD_CONFIG_CPU_CORTEX_A57
    // CPUACTLR
    // Bit 59 is additionally set when Variant == 0, or when Variant == 1
    // and Rev is 0 or 1.
    // NOTE(review): presumably an erratum workaround for those revisions --
    // confirm which one.
    LOAD_IMMEDIATE_VALUE(x0, CPUACTLR_VALUE)
    ubfx    x2, x1, #20, #4 // Variant
    cbz     x2, 1f
    cmp     x2, #1
    b.ne    2f
    ubfx    x2, x1, #0, #4 // Rev
    cbz     x2, 1f
    cmp     x2, #1
    b.ne    2f
1:  orr     x0, x0, #(1 << 59)
2:  msr     s3_1_c15_c2_0,  x0

    // CPUECTLR
    LOAD_IMMEDIATE_VALUE(x0, CPUECTLR_VALUE)
    msr     s3_1_c15_c2_1,  x0
    b       .LImplementationDefinedEnd
#endif

.LImplementationDefinedCortexA53:
#ifdef NN_BUILD_CONFIG_CPU_CORTEX_A53
    // CPUACTLR
    LOAD_IMMEDIATE_VALUE(x0, CPUACTLR_VALUE)
    msr     s3_1_c15_c2_0,  x0

    // CPUECTLR
    LOAD_IMMEDIATE_VALUE(x0, CPUECTLR_VALUE)
    msr     s3_1_c15_c2_1,  x0
    b       .LImplementationDefinedEnd
#endif

.LImplementationDefinedEnd:
    // Make all register writes above take effect before the MMU is enabled
    dsb     sy
    isb

    // Enable the MMU
    LOAD_IMMEDIATE_VALUE(x0, SCTLR_VALUE)
    msr     sctlr_el1,  x0
    dsb     sy
    isb

    // Set sp (for now in the mapping where virtual == physical):
    // a freshly allocated page, with sp pointing at its end.
    mov     x0, #HW_MMU_PAGE_SIZE
    bl      .LAllocPage
    add     x0, x0, #HW_MMU_PAGE_SIZE
    mov     sp, x0

    // Jump to the kernel's virtual address:
    // target = (address of 1f - NN_KERN_P_ADDR_CODE) + x23
    LOAD_IMMEDIATE_VALUE(x1, NN_KERN_P_ADDR_CODE)
    adr     x0, 1f
    sub     x0, x0, x1
    add     x0, x0, x23
    br      x0
1:

    // Clear bss: zero [.L__bss_start, .L__bss_end) 8 bytes at a time
    // (label values are offsets added to the current _start address)
    adr     x0, _start
    LOAD_VALUE_FROM_LABEL(x1, .L__bss_start)
    LOAD_VALUE_FROM_LABEL(x2, .L__bss_end)
    add     x1, x1, x0
    add     x2, x2, x0

1:  cmp     x1, x2
    b.cs    2f
    str     xzr, [x1], #8
    b       1b
2:

    // Install the exception vector table
    adr     x2, ExceptionVector
    msr     vbar_el1, x2


    // Relocation
    //  x0 <- load base address
    //  x1 <- .dynamic address
    // Calls nn::kern::init::Elf::Setup(unsigned long, const Elf64::Dyn*)
    adr     x0, _start
    LOAD_VALUE_FROM_LABEL(x1, .L_DYNAMIC)
    add     x1, x1, x0
    bl      _ZN2nn4kern4init3Elf5SetupEmPKNS2_5Elf643DynE

    // Initialization
    // Calls nn::kern::init::ARMv8A::Step0(unsigned long) with x19, the
    // current start of the free area.
    mov     x0, x19
    bl      _ZN2nn4kern4init6ARMv8A5Step0Em

    // x19 = address of the init-arguments slot, x20 = its first 8 bytes;
    // both are handed to .LJmpEntry (defined outside this view).
    LOAD_VALUE_FROM_LABEL(x19, .Lg_InitArgumentsAddress)
    adr     x1, _start
    add     x19, x19, x1
    ldr     x20, [x19, #0]

    b       .LJmpEntry

.LCodeCopy:
    // Copy memory 16 bytes at a time until the destination pointer reaches
    // dst_end, cleaning the data cache line of every chunk just written.
    // Nothing is copied when dst >= dst_end on entry.
    //   x0: dst      (advanced past the last chunk written)
    //   x1: src      (advanced by the same amount)
    //   x2: dst_end
    // clobber: x0, x1, x3, x4, flags (and x30 via the caller's bl)
    b       2f
1:  ldp     x3, x4, [x1], #16   // fetch one chunk, step src
    stp     x3, x4, [x0]
    dc      cvac, x0            // clean the line holding the chunk
    add     x0, x0, #16
2:  cmp     x0, x2
    b.lo    1b                  // continue while dst < dst_end
    ret

.LAllocPage:
    // Bump allocator on x19 (start of the free area).
    //     x0: size in bytes
    // return: x0 = previous x19, i.e. the start of the allocated region
    // x19 becomes RoundUp(previous x19 + size, HW_MMU_PAGE_SIZE) and the
    // whole range [previous x19, new x19) is zero-filled.
    // clobber: x1, flags (and x30 via the caller's bl)
    mov     x1, x0              // x1 = requested size
    mov     x0, x19             // result: old start of the free area
    add     x19, x19, x1
    add     x19, x19, #(HW_MMU_PAGE_SIZE - 1)
    and     x19, x19, #(0 - HW_MMU_PAGE_SIZE)

    mov     x1, x0              // x1 = fill cursor
    b       2f
1:  str     xzr, [x1], #8
2:  cmp     x1, x19
    b.lo    1b                  // continue while cursor < new x19
    ret

.LGetRandom:
// Produce a random value for the ASLR offset computation.
// return: x0 random value
// clobber: x0, x1, x2, x3, x4, x5, x6, x7
//          x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18
#ifdef NN_BUILD_CONFIG_HARDWARE_NX
    // SMC GenerateRandom : 0xC3000005
    // x1 = 8 on entry (presumably the number of bytes requested -- confirm);
    // a non-zero x0 on return spins forever, otherwise the result is taken
    // from x1.
    LOAD_IMMEDIATE_VALUE(x0, 0xC3000005)
    mov     x1, #8
    smc     #1
1:  cbnz    x0, 1b
    mov     x0, x1
#else
    // Return the SHA1 hash of the tick counter as a pseudo-random value.
    // The SHA1 routine is simplified by relying on the input being a short,
    // fixed-length value.
    // For the SHA1 variables (A,B,C,D,E,f,w), refer to any general
    // description of SHA1.

    // x0-x3: work
    // x4 (w4): A
    // x5 (w5): B
    // x6 (w6): C
    // x7 (w7): D
    // x8 (w8): E
    // x9 (w9): f
    // x10: round counter
    // x11 - x18: w (always access through SHA1_GET_W and SHA1_SET_W)

    /* Macros private to .LGetRandom */

/*
 * Accessors for the w array used by the SHA1 computation in .LGetRandom
 *
 * Only a limited window of w is needed at any one time, so registers are
 * saved by storing w[0] - w[79] in x11 - x18 (index modulo 16):
 *      x11 L: w[0],  w[16], w[32], w[48], w[64]
 *      x11 H: w[1],  w[17], w[33], w[49], w[65]
 *      x12 L: w[2],  w[18], w[34], w[50], w[66]
 *      x12 H: w[3],  w[19], w[35], w[51], w[67]
 *      x13 L: w[4],  w[20], w[36], w[52], w[68]
 *      x13 H: w[5],  w[21], w[37], w[53], w[69]
 *      x14 L: w[6],  w[22], w[38], w[54], w[70]
 *      x14 H: w[7],  w[23], w[39], w[55], w[71]
 *      x15 L: w[8],  w[24], w[40], w[56], w[72]
 *      x15 H: w[9],  w[25], w[41], w[57], w[73]
 *      x16 L: w[10], w[26], w[42], w[58], w[74]
 *      x16 H: w[11], w[27], w[43], w[59], w[75]
 *      x17 L: w[12], w[28], w[44], w[60], w[76]
 *      x17 H: w[13], w[29], w[45], w[61], w[77]
 *      x18 L: w[14], w[30], w[46], w[62], w[78]
 *      x18 H: w[15], w[31], w[47], w[63], w[79]
 */

/*
 * Load the value of w[index].
 *   regOut:   X register receiving w[regIndex & 0xf], zero-extended
 *   regIndex: X register holding the index (may be the same as regOut)
 * Uses numeric local labels 0-16 and changes the flags.
 */
#define SHA1_GET_W(regOut, regIndex) \
    /* mask the index (held temporarily in regOut) */ \
    and regOut, regIndex, #0xf;   \
                                    \
    /* pick the half of x11-x18 that holds the requested word */ \
    cmp regOut, #1;                 \
    b.eq 1f;                        \
    cmp regOut, #2;                 \
    b.eq 2f;                        \
    cmp regOut, #3;                 \
    b.eq 3f;                        \
    cmp regOut, #4;                 \
    b.eq 4f;                        \
    cmp regOut, #5;                 \
    b.eq 5f;                        \
    cmp regOut, #6;                 \
    b.eq 6f;                        \
    cmp regOut, #7;                 \
    b.eq 7f;                        \
    cmp regOut, #8;                 \
    b.eq 8f;                        \
    cmp regOut, #9;                 \
    b.eq 9f;                        \
    cmp regOut, #10;                \
    b.eq 10f;                       \
    cmp regOut, #11;                \
    b.eq 11f;                       \
    cmp regOut, #12;                \
    b.eq 12f;                       \
    cmp regOut, #13;                \
    b.eq 13f;                       \
    cmp regOut, #14;                \
    b.eq 14f;                       \
    cmp regOut, #15;                \
    b.eq 15f;                       \
0:  and regOut, x11, #0xffffffff;   \
    b 16f;                          \
1:  lsr regOut, x11, #32;           \
    b 16f;                          \
2:  and regOut, x12, #0xffffffff;   \
    b 16f;                          \
3:  lsr regOut, x12, #32;           \
    b 16f;                          \
4:  and regOut, x13, #0xffffffff;   \
    b 16f;                          \
5:  lsr regOut, x13, #32;           \
    b 16f;                          \
6:  and regOut, x14, #0xffffffff;   \
    b 16f;                          \
7:  lsr regOut, x14, #32;           \
    b 16f;                          \
8:  and regOut, x15, #0xffffffff;   \
    b 16f;                          \
9:  lsr regOut, x15, #32;           \
    b 16f;                          \
10: and regOut, x16, #0xffffffff;   \
    b 16f;                          \
11: lsr regOut, x16, #32;           \
    b 16f;                          \
12: and regOut, x17, #0xffffffff;   \
    b 16f;                          \
13: lsr regOut, x17, #32;           \
    b 16f;                          \
14: and regOut, x18, #0xffffffff;   \
    b 16f;                          \
15: lsr regOut, x18, #32;           \
16:                                 \

/*
 * Store a register value into w[index].
 *   regValue: X register holding the 32-bit value; modified in place
 *             (masked to 32 bits for an even index, shifted left by 32
 *             for an odd index)
 *   regIndex: X register holding the index; masked to 0-15 in place
 * Uses numeric local labels 0-16 and changes the flags.
 */
#define SHA1_SET_W(regValue, regIndex)  \
    /* mask the index */ \
    and regIndex, regIndex, #0xf;       \
                                        \
    /* shift/mask regValue up front so it can be OR-ed into x11-x18 */ \
    tst regIndex, #0x1;                 \
    b.ne 1f;                            \
    and regValue, regValue, #0xffffffff; \
    b 2f;                               \
1:  lsl regValue, regValue, #32;        \
    /* update the matching register among x11-x18 */ \
2:  cmp regIndex, #1;                   \
    b.eq 1f;                            \
    cmp regIndex, #2;                   \
    b.eq 2f;                            \
    cmp regIndex, #3;                   \
    b.eq 3f;                            \
    cmp regIndex, #4;                   \
    b.eq 4f;                            \
    cmp regIndex, #5;                   \
    b.eq 5f;                            \
    cmp regIndex, #6;                   \
    b.eq 6f;                            \
    cmp regIndex, #7;                   \
    b.eq 7f;                            \
    cmp regIndex, #8;                   \
    b.eq 8f;                            \
    cmp regIndex, #9;                   \
    b.eq 9f;                            \
    cmp regIndex, #10;                  \
    b.eq 10f;                           \
    cmp regIndex, #11;                  \
    b.eq 11f;                           \
    cmp regIndex, #12;                  \
    b.eq 12f;                           \
    cmp regIndex, #13;                  \
    b.eq 13f;                           \
    cmp regIndex, #14;                  \
    b.eq 14f;                           \
    cmp regIndex, #15;                  \
    b.eq 15f;                           \
0:  and x11, x11, #0xffffffff00000000;  \
    orr x11, x11, regValue;             \
    b 16f;                              \
1:  and x11, x11, #0x00000000ffffffff;  \
    orr x11, x11, regValue;             \
    b 16f;                              \
2:  and x12, x12, #0xffffffff00000000;  \
    orr x12, x12, regValue;             \
    b 16f;                              \
3:  and x12, x12, #0x00000000ffffffff;  \
    orr x12, x12, regValue;             \
    b 16f;                              \
4:  and x13, x13, #0xffffffff00000000;  \
    orr x13, x13, regValue;             \
    b 16f;                              \
5:  and x13, x13, #0x00000000ffffffff;  \
    orr x13, x13, regValue;             \
    b 16f;                              \
6:  and x14, x14, #0xffffffff00000000;  \
    orr x14, x14, regValue;             \
    b 16f;                              \
7:  and x14, x14, #0x00000000ffffffff;  \
    orr x14, x14, regValue;             \
    b 16f;                              \
8:  and x15, x15, #0xffffffff00000000;  \
    orr x15, x15, regValue;             \
    b 16f;                              \
9:  and x15, x15, #0x00000000ffffffff;  \
    orr x15, x15, regValue;             \
    b 16f;                              \
10: and x16, x16, #0xffffffff00000000;  \
    orr x16, x16, regValue;             \
    b 16f;                              \
11: and x16, x16, #0x00000000ffffffff;  \
    orr x16, x16, regValue;             \
    b 16f;                              \
12: and x17, x17, #0xffffffff00000000;  \
    orr x17, x17, regValue;             \
    b 16f;                              \
13: and x17, x17, #0x00000000ffffffff;  \
    orr x17, x17, regValue;             \
    b 16f;                              \
14: and x18, x18, #0xffffffff00000000;  \
    orr x18, x18, regValue;             \
    b 16f;                              \
15: and x18, x18, #0x00000000ffffffff;  \
    orr x18, x18, regValue;             \
16:                                     \

    /* Implementation */

    // Init w[0]-w[15] (chunk)
    // w[0]/w[1] = low/high half of the tick, w[2] = stop bit,
    // w[15] = message length in bits, everything else zero.
    mrs x11, cntpct_el0 // tick
    mov x12, #0x0000000080000000 // stop bit
    mov x13, #0
    mov x14, #0
    mov x15, #0
    mov x16, #0
    mov x17, #0
    mov x18, #0x0000004000000000 // data size in bits (64)

    // Init A,B,C,D,E
    LOAD_IMMEDIATE_VALUE_32(w4, 0x67452301)
    LOAD_IMMEDIATE_VALUE_32(w5, 0xEFCDAB89)
    LOAD_IMMEDIATE_VALUE_32(w6, 0x98BADCFE)
    LOAD_IMMEDIATE_VALUE_32(w7, 0x10325476)
    LOAD_IMMEDIATE_VALUE_32(w8, 0xC3D2E1F0)

    // Loop for chunk
    mov x10, #0
.LGetRandom_StartOfRound:
    // Rounds 0-15 use the initial w values directly
    cmp x10, #16
    b.lt .LGetRandom_CalculateF

    // Calculate w[x10] (x1: accumulator)
    //   w[i] = rotl(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1)
    mov x1, #0

    add x0, x10, #-3
    SHA1_GET_W(x0, x0)
    eor w1, w1, w0

    add x0, x10, #-8
    SHA1_GET_W(x0, x0)
    eor w1, w1, w0

    add x0, x10, #-14
    SHA1_GET_W(x0, x0)
    eor w1, w1, w0

    add x0, x10, #-16
    SHA1_GET_W(x0, x0)
    eor w1, w1, w0

    ror w1, w1, #31 // 1 bit left rotate

    mov x0, x10
    SHA1_SET_W(x1, x0)

.LGetRandom_CalculateF:
    // Select f (w9) and the round constant (w2) by round number
    cmp x10, #20
    b.lt 1f
    cmp x10, #40
    b.lt 2f
    cmp x10, #60
    b.lt 3f
    b 4f
1:
    // round 0-19: f = ((C ^ D) & B) ^ D
    eor w9, w6, w7
    and w9, w9, w5
    eor w9, w9, w7
    LOAD_IMMEDIATE_VALUE_32(w2, 0x5A827999)
    b 5f
2:
    // round 20-39: f = B ^ C ^ D
    eor w9, w5, w6
    eor w9, w9, w7
    LOAD_IMMEDIATE_VALUE_32(w2, 0x6ED9EBA1)
    b 5f
3:
    // round 40-59: f = ((B ^ C) & D) + (B & C)
    // (the two terms never share a set bit, so the add acts as an OR)
    eor w9, w5, w6
    and w9, w9, w7
    and w2, w5, w6
    add w9, w9, w2
    LOAD_IMMEDIATE_VALUE_32(w2, 0x8F1BBCDC)
    b 5f
4:
    // round 60-79: f = B ^ C ^ D
    eor w9, w5, w6
    eor w9, w9, w7
    LOAD_IMMEDIATE_VALUE_32(w2, 0xCA62C1D6)
    b 5f
5:
    // w2 (temp) += w[x10] + leftrotate(w4, 5) + w9 + w8
    SHA1_GET_W(x0, x10)
    add w2, w2, w0
    ror w0, w4, #27 // 5 bit left rotate
    add w2, w2, w0
    add w2, w2, w9
    add w2, w2, w8

    // rotate A, B, C, D, E
    mov w8, w7
    mov w7, w6
    ror w6, w5, #2 // 30 bit left rotate
    mov w5, w4
    mov w4, w2

    // End of round (finishes after 80 rounds)
    adds x10, x10, #1
    cmp x10, #80
    b.lt .LGetRandom_StartOfRound

    // Return the final A (initial A + A after the last round) as the result
    LOAD_IMMEDIATE_VALUE_32(w0, 0x67452301)
    add w0, w4, w0

    // Erase the traces of the random number generation
    mov x1, #0
    mov x2, #0
    mov x3, #0
    mov x4, #0
    mov x5, #0
    mov x6, #0
    mov x7, #0
    mov x8, #0
    mov x9, #0
    mov x10, #0
    mov x11, #0
    mov x12, #0
    mov x13, #0
    mov x14, #0
    mov x15, #0
    mov x16, #0
    mov x17, #0
    mov x18, #0
#endif
    ret

// .LInvalidateCache
//   Invalidates caches by set/way (DC ISW) for cache level indices
//   0 .. LoC-1, where LoC is CLIDR_EL1[26:24]. Returns immediately if LoC is 0.
//   A "dsb sy; isb" pair is issued after each level.
//   In:       nothing
//   Out:      nothing
//   Clobbers: x8-x18, condition flags, CSSELR_EL1
.LInvalidateCache:
    mrs     x8, clidr_el1
    ubfx    x8, x8, #24, #3             // x8 = CLIDR_EL1.LoC = number of levels to process
    cbz     x8, 4f
    mov     x9, xzr                     // x9 = current level index
    // ---- per-level loop ----
1:  lsl     x11, x9, #1                 // x11 = level << 1 (Level field position in CSSELR_EL1 and in the DC ISW operand)
    msr     csselr_el1, x11
    isb                                 // synchronise the CSSELR_EL1 write before reading CCSIDR_EL1
    mrs     x13, ccsidr_el1
    ubfx    w15, w13, #13, #15          // w15 = CCSIDR_EL1[27:13] = NumSets - 1
    ubfx    w12, w13, #3, #10           // w12 = CCSIDR_EL1[12:3]  = Associativity - 1 (last way index)
    and     w13, w13, #7                // w13 = CCSIDR_EL1[2:0]   = LineSize
    mov     w10, wzr                    // w10 = current way
    add     w13, w13, #4                // w13 = log2(line size in bytes) = bit position of the set index
    clz     w14, w12                    // w14 = shift that places the way index in the top bits of a 32-bit operand
    add     w15, w15, #1                // w15 = NumSets
    // ---- per-way loop ----
2:  mov     w16, wzr                    // w16 = current set
    lsl     w17, w10, w14               // w17 = way field
    // ---- per-set loop ----
3:  lsl     w18, w16, w13               // w18 = set field
    add     w16, w16, #1
    orr     w18, w18, w17               // combine way and set
    sxtw    x18, w18
    orr     x18, x18, x11               // add the level field
    dc      isw, x18                    // invalidate this set/way at this level
    cmp     w15, w16
    b.ne    3b                          // until set == NumSets
    cmp     w10, w12                    // was this the last way? (flags consumed by b.ne below)
    add     w10, w10, #1                // add does not alter the flags
    b.ne    2b
    dsb     sy
    isb
    add     w9, w9, #1
    cmp     x9, x8
    b.cc    1b                          // next level while level < LoC
4:  ret


// .LInvalidateCacheShare
//   Invalidates by set/way (DC ISW) the cache levels lying between the
//   Level of Unification Inner Shareable and the Level of Coherence, i.e.
//   level indices LoUIS .. LoC-1 (LoUIS = CLIDR_EL1[23:21],
//   LoC = CLIDR_EL1[26:24]). Returns immediately when that range is empty.
//
//   The upper bound is exclusive, matching .LInvalidateCache (0 .. LoC-1) and
//   .LInvalidateCacheLocal (0 .. LoUIS-1). Index LoC itself is beyond the
//   Level of Coherence and need not be an implemented cache level; selecting
//   it through CSSELR_EL1 and issuing DC ISW on it is not architecturally
//   guaranteed to behave predictably, so it must not be visited.
//
//   NOTE(review): unlike .LInvalidateCache, no dsb/isb is issued after the
//   maintenance operations here; the caller is presumably expected to do so.
//
//   In:       nothing
//   Out:      nothing
//   Clobbers: x8-x18, condition flags, CSSELR_EL1
.LInvalidateCacheShare:
    mrs     x9, clidr_el1
    ubfx    w10, w9, #21, #3            // w10 = LoUIS (first level index to process)
    ubfx    w8, w9, #24, #3             // x8  = LoC   (exclusive upper bound; upper 32 bits zeroed)
    cmp     w10, w8
    b.hs    4f                          // nothing to do when LoUIS >= LoC
    ubfx    x9, x9, #21, #3             // x9 = current level index, starting at LoUIS
    // ---- per-level loop ----
1:  lsl     w11, w9, #1                 // w11 = level << 1 (Level field of CSSELR_EL1 / DC ISW operand)
    sxtw    x12, w11
    msr     csselr_el1, x12
    isb                                 // synchronise the CSSELR_EL1 write before reading CCSIDR_EL1
    mrs     x13, ccsidr_el1
    ubfx    w15, w13, #13, #15          // w15 = CCSIDR_EL1[27:13] = NumSets - 1
    ubfx    w12, w13, #3, #10           // w12 = CCSIDR_EL1[12:3]  = Associativity - 1 (last way index)
    and     w13, w13, #0x7              // w13 = CCSIDR_EL1[2:0]   = LineSize
    mov     w10, wzr                    // w10 = current way
    add     w13, w13, #0x4              // w13 = log2(line size in bytes) = bit position of the set index
    clz     w14, w12                    // w14 = shift that places the way index in the top bits
    add     w15, w15, #0x1              // w15 = NumSets
    // ---- per-way loop ----
2:  lsl     w17, w10, w14               // w17 = way field
    mov     w16, wzr                    // w16 = current set
    orr     w17, w17, w11               // w17 = way field | level field
    // ---- per-set loop ----
3:  lsl     w18, w16, w13               // w18 = set field
    add     w16, w16, #0x1
    orr     w18, w17, w18
    sxtw    x18, w18
    dc      isw, x18                    // invalidate this set/way at this level
    cmp     w15, w16
    b.ne    3b                          // until set == NumSets
    cmp     w10, w12                    // was this the last way? (flags consumed by b.ne below)
    add     w10, w10, #0x1              // add does not alter the flags
    b.ne    2b
    add     x9, x9, #0x1                // advance first, then test, so that index LoC is never processed
    cmp     x9, x8
    b.cc    1b                          // next level while level < LoC
4:  ret


// .LInvalidateCacheLocal
//   Invalidates caches by set/way (DC ISW) for cache level indices
//   0 .. LoUIS-1, where LoUIS is CLIDR_EL1[23:21]. Returns immediately if
//   LoUIS is 0.
//   No dsb/isb is issued after the maintenance operations; the visible caller
//   (.LSlaveCoreEntry) executes "dsb sy" right after the call.
//   In:       nothing
//   Out:      nothing
//   Clobbers: x8-x18, condition flags, CSSELR_EL1
.LInvalidateCacheLocal:
    mrs     x8, clidr_el1
    ubfx    w8, w8, #21, #3             // x8 = CLIDR_EL1.LoUIS = number of levels to process (upper 32 bits zeroed)
    cbz     w8, 4f
    mov     x9, xzr                     // x9 = current level index
    // ---- per-level loop ----
1:  lsl     x11, x9, #1                 // x11 = level << 1 (Level field of CSSELR_EL1 / DC ISW operand)
    msr     csselr_el1, x11
    isb                                 // synchronise the CSSELR_EL1 write before reading CCSIDR_EL1
    mrs     x13, ccsidr_el1
    ubfx    w15, w13, #13, #15          // w15 = CCSIDR_EL1[27:13] = NumSets - 1
    ubfx    w12, w13, #3, #10           // w12 = CCSIDR_EL1[12:3]  = Associativity - 1 (last way index)
    and     w13, w13, #0x7              // w13 = CCSIDR_EL1[2:0]   = LineSize
    mov     w10, wzr                    // w10 = current way
    add     w13, w13, #0x4              // w13 = log2(line size in bytes) = bit position of the set index
    clz     w14, w12                    // w14 = shift that places the way index in the top bits
    add     w15, w15, #0x1              // w15 = NumSets
    // ---- per-way loop ----
2:  lsl     w17, w10, w14               // w17 = way field
    mov     w16, wzr                    // w16 = current set
    orr     w17, w17, w11               // w17 = way field | level field
    // ---- per-set loop ----
3:  lsl     w18, w16, w13               // w18 = set field
    add     w16, w16, #0x1
    orr     w18, w17, w18
    sxtw    x18, w18
    dc      isw, x18                    // invalidate this set/way at this level
    cmp     w15, w16
    b.ne    3b                          // until set == NumSets
    cmp     w10, w12                    // was this the last way? (flags consumed by b.ne below)
    add     w10, w10, #0x1              // add does not alter the flags
    b.ne    2b
    add x9, x9, #0x1
    cmp x9, x8
    b.ne    1b                          // next level until level == LoUIS
4:  ret

// .LSlaveCoreEntry
//   Entry path for a secondary core. Checks the CPU type, invalidates the
//   local cache levels and the TLB, waits until the init-arguments pointer for
//   this core becomes non-zero, programs the translation and control registers
//   from it, and then falls through into .LJmpEntry.
//
//   In:  x0 = coreNo (index into the per-core init-arguments pointer array)
//   Uses x19 (address of the pointer array) and x20 (this core's
//   init-arguments block); x20 is still live when falling into .LJmpEntry.
//   Does not return.
//
//   Init-arguments layout as read here (8-byte slots):
//     [0] TTBR0_EL1   [1] TTBR1_EL1   [2] TCR_EL1   [3] MAIR_EL1
//     [4] S3_1_C15_C2_0   [5] S3_1_C15_C2_1   (Cortex-A57/A53 only)
//     [6] SCTLR_EL1   [7] VBAR_EL1
// x0: coreNo
.LSlaveCoreEntry:
    // Identify the CPU from MIDR_EL1.
    mrs     x1, midr_el1
    ubfx    x2, x1, #24, #8 // Implementer
    cmp     x2, #0x41       // is ARM?
    b.ne    .LCheckSlaveCpuEnd
    ubfx    x2, x1, #4, #12 // PartNum
    LOAD_IMMEDIATE_VALUE(x3, 0xD07)
    cmp     x2, x3          // is Cortex-A57 ?
    b.eq    .LCheckSlaveCpuCortexA57
    LOAD_IMMEDIATE_VALUE(x3, 0xD03)
    cmp     x2, x3          // is Cortex-A53 ?
    b.eq    .LCheckSlaveCpuCortexA53
    b       .LCheckSlaveCpuEnd

    // If the detected core type is one this build was not configured for,
    // spin forever instead of continuing.
.LCheckSlaveCpuCortexA57:
#ifndef NN_BUILD_CONFIG_CPU_CORTEX_A57
1:  b       1b
#endif
    b       .LCheckSlaveCpuEnd
.LCheckSlaveCpuCortexA53:
#ifndef NN_BUILD_CONFIG_CPU_CORTEX_A53
1:  b       1b
#endif
    b       .LCheckSlaveCpuEnd
.LCheckSlaveCpuEnd:

    // Invalidate the caches (local levels only) and the TLB.
    // The call clobbers x8-x18 and x30; x0 (coreNo) is preserved.
    bl      .LInvalidateCacheLocal
    dsb     sy
    tlbi    vmalle1is
    dsb     sy
    isb

    // x19 = run-time address of g_InitArgumentsAddress: the table entry holds
    // the offset from _start, to which the current address of _start is added.
    LOAD_VALUE_FROM_LABEL(x19, .Lg_InitArgumentsAddress)
    adr     x1, _start
    add     x19, x19, x1

    // Spin until the 8-byte entry for this core (index coreNo) is non-zero;
    // x20 = pointer to this core's init-arguments block.
1:  ldr     x20, [x19, w0, uxtw #3]
    cbz     x20, 1b

    // Translation configuration.
    ldr     x1, [x20, #(0 * 8)]
    msr     ttbr0_el1, x1
    ldr     x1, [x20, #(1 * 8)]
    msr     ttbr1_el1, x1
    ldr     x1, [x20, #(2 * 8)]
    msr     tcr_el1, x1
    ldr     x1, [x20, #(3 * 8)]
    msr     mair_el1, x1

    // ImplementationDefined
    // The registers below are written only on ARM Cortex-A57 / Cortex-A53.
    mrs     x1, midr_el1
    ubfx    x2, x1, #24, #8 // Implementer
    cmp     x2, #0x41       // is ARM?
    b.ne    .LSlaveImplementationDefinedEnd
    ubfx    x2, x1, #4, #12 // PartNum
    LOAD_IMMEDIATE_VALUE(x3, 0xD07)
    cmp     x2, x3          // is Cortex-A57 ?
    b.eq    .LSlaveImplementationDefinedCortexA57
    LOAD_IMMEDIATE_VALUE(x3, 0xD03)
    cmp     x2, x3          // is Cortex-A53 ?
    b.eq    .LSlaveImplementationDefinedCortexA53
    b       .LSlaveImplementationDefinedEnd

.LSlaveImplementationDefinedCortexA57:
.LSlaveImplementationDefinedCortexA53:
    // S3_1_C15_C2_0 / S3_1_C15_C2_1 are the CPUACTLR_EL1 / CPUECTLR_EL1
    // encodings on these cores.
    ldr     x1, [x20, #(4 * 8)]
    msr     s3_1_c15_c2_0,  x1
    ldr     x1, [x20, #(5 * 8)]
    msr     s3_1_c15_c2_1,  x1
.LSlaveImplementationDefinedEnd:
    // Make all of the above take effect before SCTLR_EL1 is written.
    dsb     sy
    isb

    // System control register (value supplied by the init arguments).
    ldr     x1, [x20, #(6 * 8)]
    msr     sctlr_el1,  x1

    dsb     sy
    isb

    // Exception vector base.
    ldr     x1, [x20, #(7 * 8)]
    msr     vbar_el1, x1
    dsb     sy
    isb

// .LJmpEntry
//   Final hand-off to the entry function described by the init-arguments
//   block. Reached by fall-through from .LSlaveCoreEntry.
//
//   In:  x20 = pointer to the init-arguments block (8-byte slots):
//          [8]  initial stack pointer
//          [9]  address of the entry function
//          [10] argument passed to the entry function in x0
//   Does not return.
.LJmpEntry:
    // Disable the FPU (CPACR_EL1 = 0).
    msr     cpacr_el1, xzr
    isb

    // Establish the stack before calling out to compiled code: nothing on the
    // visible secondary-core path sets SP before this point, and a callee is
    // entitled to assume a valid stack pointer.
    ldr     x1, [x20, #(8 * 8)]
    mov     sp, x1

    // x20 is callee-saved, so it survives this call.
    bl      _ZN2nn4kern4init6ARMv8A23InitializeDebugRegisterEv

    // Jump to the entry function with its argument in x0.
    ldr     x1, [x20, #(9 * 8)]
    ldr     x0, [x20, #(10 * 8)]
    br      x1
    // Not reached.
1:  b       1b

// .Lfrom_el2_to_el1
//   Drops from EL2 to EL1: configures the EL2 control registers, then
//   executes ERET with ELR_EL2 = x30, so execution resumes at the caller's
//   return address with the processor state taken from SPSR_EL2.
//   In:       x30 = address to continue at after the exception return
//   Clobbers: x0
.Lfrom_el2_to_el1:
    // ACTLR_EL2 bits set below (bit names as used for the Cortex-A57/A53
    // auxiliary control register access enables):
    //   bit 6: L2ACTLR_EL1 access
    //   bit 5: L2ECTLR_EL1 access
    //   bit 4: L2CTLR_EL1 access
    //   bit 1: CPUECTLR access
    //   bit 0: CPUACTLR access
    mov     x0, #((1<<6) | (1<<5) | (1<<4) | (1<<1) | (1<<0))
    msr     actlr_el2, x0

    // HCR_EL2: only bit 31 (RW) is set, i.e. EL1 executes in AArch64 state.
    LOAD_IMMEDIATE_VALUE(x0, 0x0000000080000000)
    msr     hcr_el2, x0

    // Initial SCTLR_EL1: bits 23, 22 and 11 set; MMU and cache enable bits clear.
    LOAD_IMMEDIATE_VALUE(x0, 0x00c00800)
    msr     sctlr_el1, x0

    // DACR32_EL2: all bits set.
    LOAD_IMMEDIATE_VALUE(x0, 0xffffffff)
    msr     dacr32_el2, x0

    // Return to the caller in the mode given by HW_PSR_EL1H_MODE, with the
    // HW_PSR_FIQ_DISABLE and HW_PSR_IRQ_DISABLE bits set in the restored PSTATE.
    msr     elr_el2, x30
    mov     x0, #(HW_PSR_EL1H_MODE | HW_PSR_FIQ_DISABLE | HW_PSR_IRQ_DISABLE)
    msr     spsr_el2, x0
    eret

    // Table of symbol offsets relative to _start, one 8-byte entry per symbol.
    // Each entry holds (symbol - _start); code reads an entry with
    // LOAD_VALUE_FROM_LABEL and adds the run-time address of _start to obtain
    // the address of the symbol where the image is currently running.
    .balign 8
.L__ex_start:
    .quad   __ex_start  - _start
.L__ex_end:
    .quad   __ex_end    - _start
.L__ro_start:
    .quad   __ro_start  - _start
.L__ro_end:
    .quad   __ro_end    - _start
.L__rw_start:
    .quad   __rw_start  - _start
.L__rw_end:
    .quad   __rw_end    - _start
.L__bss_start:
    .quad   __bss_start - _start
.L__bss_end:
    .quad   __bss_end   - _start
.L__slab_pt_end:
    .quad   __slab_pt_end - _start
.L_DYNAMIC:
    .quad   _DYNAMIC    - _start
    // Offset of nn::kern::init::ARMv8A::g_InitArgumentsAddress, the array
    // indexed by core number in .LSlaveCoreEntry.
.Lg_InitArgumentsAddress:
    .quad   _ZN2nn4kern4init6ARMv8A22g_InitArgumentsAddressE - _start
