﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

//=============================================================================
//  DEMOCopy.c
//
//     This is fast copy code for the DEMO library.
//
//=============================================================================

#include <gfx/demo.h>

// _DCBF: data cache block flush.
// Thin wrapper that issues a single `dcbf 0, addr` instruction on the
// address passed in addr.  DEMOFastCopy uses it to push both the source
// lines it has read and the destination lines it has written out of the
// CPU data cache.
// NOTE(review): the `%reg addr` line looks like the toolchain's asm-function
// syntax for receiving the parameter in a register -- confirm against the
// compiler manual before changing it.
asm void _DCBF(void *addr)
{
%reg addr
        dcbf 0, addr
}

// _DCBI: data cache block invalidate.
// Thin wrapper that issues a single `dcbi 0, addr` instruction on the
// address passed in addr.  It is not called anywhere in the visible part of
// this file; DEMOFastCopy deliberately uses flush instead of invalidate so
// that data sharing a cache line with an unaligned buffer is not corrupted.
// NOTE(review): the `%reg addr` line looks like the toolchain's asm-function
// syntax for receiving the parameter in a register -- confirm against the
// compiler manual before changing it.
asm void _DCBI(void *addr)
{
%reg addr
        dcbi 0, addr
}

// _DCBT: data cache block touch (load).
// Thin wrapper that issues a single `dcbt 0, addr` instruction on the
// address passed in addr.  It is not called anywhere in the visible part of
// this file.
// NOTE(review): the `%reg addr` line looks like the toolchain's asm-function
// syntax for receiving the parameter in a register -- confirm against the
// compiler manual before changing it.
asm void _DCBT(void *addr)
{
%reg addr
        dcbt 0, addr
}

// _DCBZ: data cache block zero.
// Thin wrapper that issues a single `dcbz 0, addr` instruction on the
// address passed in addr.  DEMOFastCopy calls it on a 32-byte-aligned
// destination line immediately before overwriting all 32 bytes of that
// line, so that the old destination contents need not be read in first.
// NOTE(review): because the instruction zeroes a whole block, presumably it
// must only be used where the entire block is about to be rewritten --
// confirm before using it on partially-owned lines.
// NOTE(review): the `%reg addr` line looks like the toolchain's asm-function
// syntax for receiving the parameter in a register -- confirm against the
// compiler manual before changing it.
asm void _DCBZ(void *addr)
{
%reg addr
        dcbz 0, addr
}

// Copies size bytes from src to dst and flushes both buffers out of the
// CPU data cache.
//
//   dst  - destination buffer; no alignment is assumed.
//   src  - source buffer; no alignment is assumed.  It is only read, but
//          the cache lines it occupies are flushed as they are consumed.
//   size - number of bytes to copy.  A size of 0 returns immediately
//          without touching either buffer.
//
// The buffers must not overlap. This function does not use locked cache DMA.
//
// The copy runs in three steps: a byte-wise head up to the first 32-byte
// boundary of dst, then whole 32-byte destination lines (dcbz followed by
// four 64-bit stores), then a byte-wise tail.
//
void DEMOFastCopy(void *dst, void *src, u32 size)
{
    register u8 *dEnd;
    register u8 *spp;
    register u8 *dpp;
    register u64 c0, c1, c2, c3;
    u8 *epp;

    // Nothing to copy means nothing to flush.  Without this guard the
    // trailing flushes at the bottom of the function would still be issued
    // on src and on src - 1, i.e. on a byte that lies outside the buffer.
    if (size == 0)
    {
        return;
    }

    spp = reinterpret_cast< u8* >( src );
    dpp = reinterpret_cast< u8* >( dst );

    // Since both src & dst are likely not aligned to 32-bytes, we
    // can't make a loop where both src & dst pointers are aligned to
    // 32 bytes.  We therefore make only the dst pointer be aligned.

    // We use flush on src rather than invalidate to avoid possibility
    // of corrupting data immediately before or after src buffer
    // (that lies within the same cache lines, if src is not aligned).

    // First, copy up to the first 32-byte boundary (or up to the end of
    // the buffer, whichever comes first):
    dEnd = (u8 *) (((u32)dst + 31) & ~31);
    if ((u8 *)dst + size < dEnd)
    {
        dEnd = (u8 *)dst + size;
    }
    if (dpp < dEnd)
    {
        while(dpp < dEnd)
        {
            *dpp++ = *spp++;
        }
        // Flush what we just wrote.  Every byte written by this step lies
        // in a single destination line, so the last byte written (dpp - 1)
        // always identifies that line.  Using dpp - 32 here is only correct
        // when the loop stopped exactly on the 32-byte boundary; for a short
        // copy that ends before the boundary it names the previous line and
        // the freshly written data would never be flushed.
        _DCBF( dpp - 1 );
    }

    // Then, copy all the aligned 32-byte lines:
    dEnd = (u8 *) (((u32)dst + size) & ~31);
    while(dpp < dEnd)
    {
        c0 = ((u64 *) spp)[0]; // read src line
        c1 = ((u64 *) spp)[1];
        c2 = ((u64 *) spp)[2];
        c3 = ((u64 *) spp)[3];
        _DCBF( spp );          // push out source line
        _DCBZ( dpp );          // use dcbz to avoid reading in dst
        ((u64 *) dpp)[0] = c0; // write dst line
        ((u64 *) dpp)[1] = c1;
        ((u64 *) dpp)[2] = c2;
        ((u64 *) dpp)[3] = c3;
        _DCBF( dpp );          // flush what we just wrote
        spp += 32;
        dpp += 32;
    }

    // Finally, copy any remaining sub-32-byte chunk.  epp remembers where
    // the source pointer stood before the tail so that the line holding the
    // byte just before it can be flushed at the end.
    epp = spp;
    dEnd = ((u8 *)dst + size);
    if (dpp < dEnd)
    {
        while(dpp < dEnd)
        {
            *dpp++ = *spp++;
        }
        _DCBF( spp - 1 );  // flush the line holding the last source byte read
        // Flush what we just wrote.  The tail starts on a 32-byte boundary
        // and is shorter than 32 bytes, so dpp - 1 and dpp are in the same
        // line; dpp - 1 is used so the flush names a byte actually written.
        _DCBF( dpp - 1 );
    }

    // If the number of bytes copied in step 1 was >= the number of bytes
    // left in the first source row, then this wasn't flushed:
    _DCBF( src );

    // If the number of bytes copied in step 3 was >= the number of bytes
    // left in the row before the last source row, or if step 3 wasn't
    // necessary but the last source row wasn't full, then this wasn't flushed:
    _DCBF( epp - 1 );
}

