#include <windows.h>
#include <stdio.h>

// Returns true when value is a positive power of two, i.e. exactly one bit
// is set in a value greater than zero. Zero and negative values return false.
bool	IsPowerOfTwo( int value )
{
	if( value <= 0 )
	{
		return false;
	}

	// Subtracting one flips the lowest set bit and everything below it, so the
	// AND is zero only when that lowest set bit was the only bit set.
	return ( ( value & ( value - 1 ) ) == 0 );
}

// Returns the number of right shifts needed to reduce value to 1, which is
// floor(log2(value)) for positive input. Returns -1 when value is zero or
// negative.
int	GetLog2( int value )
{
	if( value <= 0 )
	{
		return -1;
	}

	int shifts;
	for( shifts = 0; value != 1; shifts++ )
	{
		value >>= 1;
	}

	return shifts;
}

// Rounds value up to the next power of two (a value that already is a power
// of two is returned unchanged).
//
// Values of 1 or less return 1; this avoids shifting by a negative count,
// which is undefined.
// Values above 2^30 have no power-of-two result that fits in an int; the
// doubling is done unsigned so it is well defined, and the final conversion
// of 2^31 back to int is implementation-defined.
int	PadSize( int value )
{
	if( value <= 1 )
	{
		return 1;
	}

	unsigned int padded = 1;
	while( padded < (unsigned int)value )
	{
		padded <<= 1;
	}

	return (int)padded;
}

bool FileExists( char* path )
{
	HANDLE file_handle;
	WIN32_FIND_DATA find_data;
	
	file_handle = FindFirstFile( path, &find_data );  
	
	if( file_handle != INVALID_HANDLE_VALUE )
	{
		FindClose( file_handle );
	}
	
	return ( file_handle != INVALID_HANDLE_VALUE );
}

// Is file 2 newer than file one
bool FileIsNewer( char* file1, char* file2 )
{
	HANDLE file_handle_1, file_handle_2;
	WIN32_FIND_DATA find_data_1, find_data_2;
	
	file_handle_1 = FindFirstFile( file1, &find_data_1 );  
	if( file_handle_1 == INVALID_HANDLE_VALUE )
	{
		return true;
	}

	file_handle_2 = FindFirstFile( file2, &find_data_2 );  
	if( file_handle_2 == INVALID_HANDLE_VALUE )
	{
		FindClose( file_handle_1 );
		return false;
	}
	
	FindClose( file_handle_1 );
	FindClose( file_handle_2 );
	return( CompareFileTime( &find_data_2.ftLastWriteTime, &find_data_1.ftLastWriteTime ) > 0 );
}

bool MakeFileRW( char* path )
{
	if (!FileExists(path))
	{
		//fprintf(stderr, "MakeFileRW: Can't find %s\n", path);
		return false;
	}

	DWORD attrib = GetFileAttributes(path);
	if (attrib == 0xFFFFFFFF)
	{
		fprintf(stderr, "MakeFileRW: Can't read attributes for %s\n", path);
		return false;
	}

	attrib &= ~(FILE_ATTRIBUTE_READONLY);

	if (!SetFileAttributes(path, attrib))
	{
		fprintf(stderr, "MakeFileRW: Can't set attributes for %s\n", path);
		return false;
	}

	return true;
}


// Xbox texture swizzling stuff.
// Returns the index of the lowest set bit of Value using the x86 BSF
// instruction. For a power-of-two Value this index equals log2(Value).
//
// The function has no return statement: the result is left in eax by the
// inline assembly. Warning 4035 is disabled around the definition for that
// reason and restored to its default afterwards.
//
// NOTE(review): BSF does not define its destination when the source is zero,
// so callers presumably must pass a non-zero value - confirm at call sites.
#pragma warning ( disable: 4035 )
static inline DWORD __fastcall Log2( DWORD Value )
{ 
	__asm { bsf eax, [Value] }; 
}
#pragma warning ( default: 4035 )

static inline void GetMasks2( int Width, int Height, DWORD *pMaskU, DWORD *pMaskV )
{
	DWORD LogWidth, LogHeight, Log;

	LogWidth = Log2(Width); 
	LogHeight = Log2(Height);

	Log = min(LogWidth, LogHeight);

    DWORD LowerMask = (1 << (Log << 1)) - 1;
    DWORD UpperMask = ~LowerMask;

    DWORD MaskU = (LogWidth > LogHeight) ? (0x55555555 | UpperMask)
                                        : (0x55555555 & LowerMask);

    DWORD MaskV = (LogWidth < LogHeight) ? (0xaaaaaaaa | UpperMask)
                                        : (0xaaaaaaaa & LowerMask);

	MaskU &= ((1 << (LogWidth + LogHeight)) - 1); //we're letting u & v just loop, so
	MaskV &= ((1 << (LogWidth + LogHeight)) - 1); //we need to limit the bits to the ones we need.

	*pMaskU = MaskU;
	*pMaskV = MaskV;
}

// Swizzles an 8-bit-per-pixel image from pSrc into pDest using MMX.
//
// Each pass of the inner loop reads an 8x8 block (eight rows of eight bytes,
// rows Width bytes apart) and writes it as 64 consecutive bytes at
// pDest + (swiz(u) | swiz(v)). swiz(u) and swiz(v) are counters that only
// occupy the address bits selected by MaskU / MaskV from GetMasks2.
//
// A masked counter is advanced with "(x - AddVal) & Mask"; AddValU and
// AddValV are the negated step (64 and 128) restricted to the mask.
//
// pSrc   : source pixels, row pitch taken to be Width bytes.
// pDest  : destination buffer for the swizzled pixels.
// Width  : image width in pixels.
// Height : image height in pixels (only used to build the masks).
//
// NOTE(review): GetMasks2 relies on Log2 (BSF), so Width and Height are
// presumably powers of two - confirm with callers.
void swiz2d_8bit( void* pSrc, void* pDest, int Width, int Height ) 
{
	DWORD MaskU, MaskV, AddValU, AddValV;

	GetMasks2(Width, Height, &MaskU, &MaskV);

	AddValU = (-64) & MaskU;
	AddValV = (-128) & MaskV;

	//eax = scratch (holds swiz(u) | swiz(v))
	//ebx = swiz(u)
	//ecx = swiz(v)
	//edx = pitch (Width)
	//esi = pSrc (walks through the source)
	//edi = pDest (base address, not modified)

	_asm {
		mov esi, pSrc;					//source + (0, 0)
		mov edi, pDest;					//dest

		mov edx, [Width];				//edx = width
		xor ebx, ebx;					//swiz(u)
		xor ecx, ecx;					//swiz(v)

		align 16						//seems to help speed a little

Start:

			  movq mm0, [esi];				//00 01 02 03 04 05 06 07
			  movq mm1, [esi + edx];		//10 11 12 13 14 15 16 17
			  add esi, edx;					//pSrc + (u, v + 1)
			  mov eax, ebx;					//eax = swiz(u)
			  movq mm4, [esi + edx];		//20 21 22 23 24 25 26 27
			  movq mm5, [esi + edx * 2];	//30 31 32 33 34 35 36 37
			  or eax, ecx;					//eax = swiz(u) | swiz(v)
			  movq mm6, mm4;				//20 21 22 23 24 25 26 27
			  movq mm2, mm0;				//00 01 02 03 04 05 06 07
			  
			  punpckhwd mm6, mm5;			//24 25 34 35 26 27 36 37

			  lea esi, [esi + edx * 4];		//pSrc + (u, v + 5)
			  punpckhwd mm2, mm1			//04 05 14 15 06 07 16 17
			  punpcklwd mm4, mm5;			//20 21 30 31 22 23 32 33

			  movq mm3, [esi];				//50 51 52 53 54 55 56 57
			  movq mm5, [esi+edx];			//60 61 62 63 64 65 66 67
			  movq mm7, [esi+edx*2];		//70 71 72 73 74 75 76 77
			  sub esi, edx					//pSrc + (u, v + 4)
			  punpcklwd mm0, mm1;			//00 01 10 11 02 03 12 13
			  movq mm1, [esi];				//40 41 42 43 44 45 46 47

			  movq [edi + eax      ], mm0;	//00 01 10 11 02 03 12 13
			  movq [edi + eax +   8], mm4;	//20 21 30 31 22 23 32 33
			  movq [edi + eax +  16], mm2;  //04 05 14 15 06 07 16 17
			  movq [edi + eax +  24], mm6;	//24 25 34 35 26 27 36 37
		  
			  movq mm0, mm1					//40 41 42 43 44 45 46 47
			  movq mm4, mm5					//60 61 62 63 64 65 66 67

			  punpcklwd mm0, mm3			//40 41 50 51 42 43 52 53
			  punpcklwd mm4, mm7			//60 61 70 71 62 63 72 73
			  punpckhwd mm1, mm3			//44 45 54 55 46 47 56 57
			  punpckhwd mm5, mm7			//64 65 74 75 66 67 76 77

			  movq [edi + eax +  32], mm0;	//40 41 50 51 42 43 52 53
			  movq [edi + eax +  40], mm4;	//60 61 70 71 62 63 72 73
			  movq [edi + eax +  48], mm1;  //44 45 54 55 46 47 56 57
			  movq [edi + eax +  56], mm5;	//64 65 74 75 66 67 76 77

			  sub esi, edx
			  sub esi, edx
			  sub esi, edx
			  sub esi, edx					//pSrc + (u, v)

			  sub ebx, [AddValU];			//part 1 of: swiz(u) += 8

			  //this number of nops seems to be optimal.
			  _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop}
			  _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop}
			  _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop} _asm{nop}
			  _asm{nop} _asm{nop} _asm{nop} _asm{nop} //_asm{nop} //_asm{nop} _asm{nop} _asm{nop}

			  add esi, 8;					//u += 8
			  and ebx, [MaskU];				//(continued): swiz(u) += 8;
			jnz Start;						//if swiz(u) hasn't looped back to 0, repeat

		  sub ecx, [AddValV];			//part 1 of: swiz(v) += 8		

		  lea esi, [esi + edx * 8]		//pSrc + (u, v + 9) //(v has been incremented by u looping)
		  sub esi, edx;					//pSrc + (u, v + 8)
		//(esi has already wrapped around to 1 row below where we started. This
		//moves it down a further 7 rows, since we are dealing with 8x8 blocks)

		  and ecx, [MaskV];				//(continued): swiz(v) += 8
		  jnz Start;					//if v is not done, keep going

		emms;		//done with mmx
	}
}


// Swizzles a 16-bit-per-pixel image from pSrc into pDest using SSE.
//
// Each pass of the inner loop reads 4 pixels from each of two adjacent rows
// (rows Width * 2 bytes apart), interleaves them in 2-pixel pairs, and stores
// the 16-byte result at pDest + 2 * (u | v), where u and v are masked
// counters built from GetMasks2 (see AddValU / AddValV below).
//
// The store uses movaps, so the destination must be 16-byte aligned. When
// pDest is not, the work is done in a temporary aligned buffer which is then
// copied back to the caller's pDest and freed.
//
// pSrc   : source pixels, row pitch taken to be Width * 2 bytes.
// pDest  : destination buffer for Width * Height swizzled pixels.
// Width  : image width in pixels.
// Height : image height in pixels.
//
// NOTE(review): GetMasks2 relies on Log2 (BSF), so Width and Height are
// presumably powers of two - confirm with callers.
void swiz2d_16bit (void* pSrc, void* pDest, int Width, int Height) 
{
	DWORD	MaskU, MaskV, AddValU, AddValV;
	void*	pStoredDest	= NULL;
	void*	pNewDest	= NULL;
	
	if(((DWORD)pDest ) & 15 )
	{
		// Allocate another buffer big enough to ensure we can align it correctly.
		// The 16 extra elements leave room to round the pointer up to a 16-byte boundary.
		pStoredDest = pDest;
		pNewDest	= new unsigned short[Width * Height + 16];
		pDest		= (void*)(((DWORD)pNewDest + 15 ) & ~15 );
	}

	GetMasks2(Width, Height, &MaskU, &MaskV);

	// Negated steps for the masked counters: "(x - AddVal) & Mask" advances x.
	AddValU = (-16) & MaskU;
	AddValV = (-8) & MaskV;

	//eax = scratch (holds u | v)
	//ebx = u (masked counter)
	//ecx = v (masked counter)
	//edx = pitch in bytes (Width * 2)
	//esi = pSrc (advanced as the source is read)
	//edi = pDest (base address, not modified)

	_asm {
		mov esi, pSrc;					
		mov edi, pDest;

		mov edx, [Width];
		xor ebx, ebx;
		xor ecx, ecx;
		add edx, edx;					//edx = width * 2

Start:
		mov eax, ebx;					//eax = u
		movlps xmm0, [esi];				//xmm0 = {a, b, e, f, ?, ?, ?, ?}
		or eax, ecx;					//eax = u | v
		movlps xmm1, [esi + edx];		//xmm1 = {c, d, g, h, ?, ?, ?, ?}
		unpcklps xmm0, xmm1;			//xmm0 = {a, b, c, d, e, f, g, h}
		sub ebx, [AddValU];				//part 1 of adding 4 to u

		movaps [edi + eax*2], xmm0;		//dest[u | v] = {a, b, c, d, e, f, g, h}

		add esi, 8;						//move source pointer to next block
		and ebx, [MaskU];				//ebx = the next u coordinate
		jnz Start;						//if u hasn't looped back to 0, repeat

		sub ecx, [AddValV];				//part 1 of adding 2 to v		
		add esi, edx;					//move src pointer to next row
		//(this moves it to the second row down, since it has already looped
		//to the first row down)

		and ecx, [MaskV];				//ecx = the next v coordinate
		jnz Start;						//if v is not done, keep going
	}

	// If we had to align the destination buffer, copy it back to the original buffer.
	if( pStoredDest )
	{
		memcpy( pStoredDest, pDest, Width * Height * 2 );
		delete [] pNewDest;		
	}
}



// Swizzles a 32-bit-per-pixel image from pSrc into pDest using SSE.
//
// Each pass of the inner loop reads 2 pixels from each of two adjacent rows
// (rows Width * 4 bytes apart) and stores the resulting 2x2 block as 16
// consecutive bytes at pDest + 4 * (u | v), where u and v are masked counters
// built from GetMasks2 (see AddValU / AddValV below).
//
// The store uses movaps, so the destination must be 16-byte aligned. When
// pDest is not, the work is done in a temporary aligned buffer which is then
// copied back to the caller's pDest and freed.
//
// pSrc   : source pixels, row pitch taken to be Width * 4 bytes.
// pDest  : destination buffer for Width * Height swizzled pixels.
// Width  : image width in pixels.
// Height : image height in pixels.
//
// NOTE(review): GetMasks2 relies on Log2 (BSF), so Width and Height are
// presumably powers of two - confirm with callers.
void swiz2d_32bit( void *pSrc, void *pDest, int Width, int Height )
{
	DWORD MaskU, MaskV, AddValU, AddValV;
	void*	pStoredDest	= NULL;
	void*	pNewDest	= NULL;
	
	if(((DWORD)pDest ) & 15 )
	{
		// Allocate another buffer big enough to ensure we can align it correctly.
		// The 16 extra elements leave room to round the pointer up to a 16-byte boundary.
		pStoredDest = pDest;
		pNewDest = new unsigned int[Width * Height + 16];
		pDest = (void*)(((DWORD)pNewDest + 15 ) & ~15 );
	}

	GetMasks2(Width, Height, &MaskU, &MaskV);

	// Negated steps for the masked counters: "(x - AddVal) & Mask" advances x.
	AddValU = (-4) & MaskU;
	AddValV = (-8) & MaskV;

	//eax = scratch (holds u | v)
	//ebx = u (masked counter)
	//ecx = v (masked counter)
	//edx = pitch in bytes (Width * 4)
	//esi = pSrc (advanced as the source is read)
	//edi = pDest (base address, not modified)

	_asm {
		mov esi, pSrc;
		mov edi, pDest;

		mov edx, [Width];
		xor ebx, ebx;
		xor ecx, ecx;
		shl edx, 2;						//edx = width * 4

Start:
			mov eax, ebx;				//eax = u

			movlps xmm0, [esi];			//xmm0 = {a, b}

			or eax, ecx;				//eax = u | v

			movhps xmm0, [esi+edx];		//xmm0 = {a, b, c, d}

			sub ebx, [AddValU];			//part 1 of adding 2 to u

			movaps [edi + eax*4], xmm0;	//dest[u | v] = {a,b,c,d}

			add esi, 8;					//move source pointer to next block
		
			and ebx, [MaskU];			//part 2 of "u += 2"
			jnz Start;					//if u hasn't looped to 0, keep going

		  sub ecx, [AddValV];			//part 1 of "v += 2"

		  add esi, edx;					//move source pointer to 2 rows down

		  and ecx, [MaskV];				//part 2 of "v += 2"
		  jnz Start;					//if v hasn't looped to 0, repeat
	}

	// If we had to align the destination buffer, copy it back to the original buffer.
	if( pStoredDest )
	{
		memcpy( pStoredDest, pDest, Width * Height * 4 );
		delete [] pNewDest;		
	}
}




// Lookup table used by the slow swizzle path. It is filled lazily by
// generateSwizzleTable(); entry i holds the bits of i spread out onto the
// even bit positions (masked with 0x55555555).
static unsigned int	swizzle_table[4096];
// Set once swizzle_table has been filled, so it is only generated once.
static bool			swizzle_table_generated = false;

// Swizzled pixel index for coordinate (_u, _v): the spread bits of _u occupy
// the even bit positions and the spread bits of _v, shifted left by one,
// occupy the odd positions. Both arguments index swizzle_table directly, so
// they must be below 4096 and the table must already be generated.
#define TWIDDLE(_u, _v) ((swizzle_table[(_v)] << 1) | (swizzle_table[(_u)]))

/******************************************************************/
/* Fills swizzle_table the first time it is called; every later   */
/* call returns immediately. Entry i receives the bits of i       */
/* spread onto the even bit positions.                            */
/******************************************************************/
static void generateSwizzleTable( void )
{
	if( swizzle_table_generated )
	{
		return;
	}

	// Adding 0x2AAAAAAB sets the odd "gap" bits so the +1 carry can ripple
	// across them; the mask then clears the gaps again, leaving the next
	// spread-out counter value.
	unsigned int spread = 0;
	for( unsigned int index = 0; index < 4096; index++ )
	{
		swizzle_table[index] = spread;
		spread = ( spread + 0x2AAAAAAB ) & 0x55555555;
	}

	swizzle_table_generated = true;
}


/******************************************************************/
/* Slow-path swizzle shared by every supported pixel size.        */
/*                                                                */
/* The image is cut into tiles whose side is the smaller of       */
/* width and height. Tiles are written one after another, and     */
/* the pixels inside each tile are placed at TWIDDLE(x, y).       */
/* swizzle_table must already be generated.                       */
/*                                                                */
/* dst           : destination, tightly packed                    */
/* src           : source, rows are stride bytes apart            */
/* bytesPerPixel : size of one pixel in bytes                     */
/******************************************************************/
static void swizzleTiled( unsigned char *dst, const unsigned char *src,
						  int width, int height, int bytesPerPixel, int stride )
{
	int tilesX, tilesY;
	int tileW, tileH;

	if( width > height )
	{
		tilesX = width / height;
		tilesY = 1;

		tileW = width / tilesX;
		tileH = height;
	}
	else
	{
		tilesX = 1;
		tilesY = height / width;

		tileW = width;
		tileH = height / tilesY;
	}

	const int tileSize = tileW * tileH;

	for( int j = 0; j < tilesY; j++ )
	{
		for( int i = 0; i < tilesX; i++ )
		{
			// Start of this tile in the destination.
			unsigned char *base = dst + bytesPerPixel * ( ( tileSize * tilesX ) * j + tileSize * i );

			for( int y = 0; y < tileH; y++ )
			{
				// First pixel of row y of this tile in the source.
				const unsigned char *srcPixel = src
											  + ( stride * ( tileH * j ) )
											  + ( bytesPerPixel * tileW * i )
											  + ( stride * y );

				for( int x = 0; x < tileW; x++ )
				{
					memcpy( base + TWIDDLE( x, y ) * bytesPerPixel, srcPixel, bytesPerPixel );
					srcPixel += bytesPerPixel;
				}
			}
		}
	}
}

/******************************************************************/
/* Converts a linear texture into swizzled order.                 */
/*                                                                */
/* dstBuffer : receives the swizzled pixels                       */
/* srcBuffer : linear source pixels                               */
/* width     : texture width in pixels                            */
/* height    : texture height in pixels                           */
/* depth     : bits per pixel; 8, 16 and 32 are supported         */
/* stride    : bytes between source rows. Only the slow path      */
/*             honours it; the tiny-texture copy and the fast     */
/*             swiz2d_* routines read the source as tightly       */
/*             packed rows.                                       */
/*                                                                */
/* Textures with width <= 2 or height <= 1 are copied unchanged.  */
/* A non-positive width or height is a no-op. Any other texture   */
/* with an unsupported depth prints a message to stderr and       */
/* exits the process with status 1.                               */
/******************************************************************/
void SwizzleTexture( void *dstBuffer, void *srcBuffer, int width, int height, int depth, int stride )
{
	int bytesPerPixel;

	// Nothing to copy; this also keeps a negative byte count away from memcpy.
	if( width <= 0 || height <= 0 )
	{
		return;
	}

	// Tiny textures.
	if( width <= 2 || height <= 1 ) 
	{		
		memcpy( dstBuffer, srcBuffer, ( width * height * depth ) / 8 );
		return;
	} 

	// We use the fast versions where possible.
	switch( depth )
	{
		case 8:
			if(( width >= 16 ) && ( height >= 8 ))
			{
				swiz2d_8bit( srcBuffer, dstBuffer, width, height );
				return;
			}
			bytesPerPixel = 1;
			break;

		case 16:
			if(( width >= 16 ) && ( height >= 8 ))
			{
				swiz2d_16bit( srcBuffer, dstBuffer, width, height );
				return;
			}
			bytesPerPixel = 2;
			break;

		case 32:
			if(( width >= 8 ) && ( height >= 8 ))
			{
				swiz2d_32bit( srcBuffer, dstBuffer, width, height );
				return;
			}
			bytesPerPixel = 4;
			break;

		default:
			// Report the failure instead of exiting silently with a success code.
			fprintf( stderr, "SwizzleTexture: unsupported depth %d\n", depth );
			exit( 1 );
			return;
	}

	// Okay, use the slow version.	
	generateSwizzleTable();

	swizzleTiled( (unsigned char *)dstBuffer, (const unsigned char *)srcBuffer,
				  width, height, bytesPerPixel, stride );
}
