﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#include "stdafx.h"
#include "types.h"
#include <assert.h>
#include <stdio.h>
#include <vector>
#include <string>

#pragma warning ( push )
#pragma warning ( disable: 4302 4311 4312 )
#include "cafe/gx2/gx2Enum.h"
#include "cafe/gx2/gx2Constant.h"
#include "cafe/gx2/gx2Misc.h"
#include "cafe/gx2/gx2Shaders.h"
//#include "gx2ShadersInt.h"
#include "cafe/gx2/gx2Surface.h"
#include "cafe/gx2/gx2Texture.h"
#pragma warning ( pop )

#include <sdk_ver.h>
#include "cafe/gfd.h"
#include "gfdMem.h"

#define GSH_DEFAULT_FILENAME "out.gsh"

// definitions for private structures

// GFD specific Shader structures to repack structure between 32 bit and 64 bit

// Vertex shader register words, wrapped in a struct so they can be embedded as
// the _regs member of GFDVertexShader.  GFDRepackVertexShaderFor32Bit() asserts
// that this has the same size as GX2VertexShader::_regs before copying into it.
typedef struct _GFDVertexShaderRegs
{
    u32 reg[GX2_NUM_VERTEX_SHADER_REGISTERS];
} GFDVertexShaderRegs;

// Geometry shader register words, wrapped in a struct so they can be embedded as
// the _regs member of GFDGeometryShader.  GFDRepackGeometryShaderFor32Bit() asserts
// that this has the same size as GX2GeometryShader::_regs before copying into it.
typedef struct _GFDGeometryShaderRegs
{
    u32 reg[GX2_NUM_GEOMETRY_SHADER_REGISTERS];
} GFDGeometryShaderRegs;

// Pixel shader register words, wrapped in a struct so they can be embedded as
// the _regs member of GFDPixelShader.  GFDRepackPixelShaderFor32Bit() asserts
// that this has the same size as GX2PixelShader::_regs before copying into it.
typedef struct _GFDPixelShaderRegs
{
    u32 reg[GX2_NUM_PIXEL_SHADER_REGISTERS];
} GFDPixelShaderRegs;

// Compute shader register words, only compiled when CAFE_OS_SDK_VERSION >= 21104.
// Embedded as the _regs member of GFDComputeShader; GFDRepackComputeShaderFor32Bit()
// asserts that this has the same size as GX2ComputeShader::_regs before copying into it.
#if CAFE_OS_SDK_VERSION >= 21104
typedef struct _GFDComputeShaderRegs
{
    u32 reg[GX2_NUM_COMPUTE_SHADER_REGISTERS];
} GFDComputeShaderRegs;
#endif
// Repacked form of GX2VertexShader, filled in by GFDRepackVertexShaderFor32Bit().
// The members that function casts to u32 (shaderPtr, uniformBlocks, uniformVars,
// initialValues, _loopVars, samplerVars, attribVars) are held as 32-bit values
// here; all other members are copied unchanged.
// GFDCreateBlockRelocateHeaderVSH() stores this struct byte-for-byte in a data
// table, so member order and sizes are part of the stored layout.
typedef struct _GFDVertexShader
{
    GFDVertexShaderRegs _regs;
    u32                 shaderSize;
    u32                 shaderPtr;          // narrowed from GX2VertexShader::shaderPtr
    GX2ShaderMode       shaderMode;
    u32                 numUniformBlocks;
    u32                 uniformBlocks;      // narrowed from GX2VertexShader::uniformBlocks
    u32                 numUniforms;
    u32                 uniformVars;        // narrowed from GX2VertexShader::uniformVars
    u32                 numInitialValues;
    u32                 initialValues;      // narrowed from GX2VertexShader::initialValues
    u32                 _numLoops;
    u32                 _loopVars;          // narrowed from GX2VertexShader::_loopVars
    u32                 numSamplers;
    u32                 samplerVars;        // narrowed from GX2VertexShader::samplerVars
    u32                 numAttribs;
    u32                 attribVars;         // narrowed from GX2VertexShader::attribVars
    u32                 ringItemsize;
    u32                 hasStreamOut;
    u32                 streamOutVertexStride[GX2_MAX_STREAMOUT_BUFFERS];
    GX2RBuffer          shaderProgram;
} GFDVertexShader;

// Repacked form of GX2GeometryShader, filled in by GFDRepackGeometryShaderFor32Bit().
// The members that function casts to u32 (shaderPtr, copyShaderPtr, uniformBlocks,
// uniformVars, initialValues, _loopVars, samplerVars) are held as 32-bit values
// here; all other members are copied unchanged.
// NOTE(review): presumably written to the output byte-for-byte like GFDVertexShader,
// in which case member order must not change - confirm against the writer code.

typedef struct _GFDGeometryShader
{
    GFDGeometryShaderRegs _regs;
    u32                 shaderSize;
    u32                 shaderPtr;          // narrowed from GX2GeometryShader::shaderPtr
    u32                 copyShaderSize;
    u32                 copyShaderPtr;      // narrowed from GX2GeometryShader::copyShaderPtr
    GX2ShaderMode       shaderMode;
    u32                 numUniformBlocks;
    u32                 uniformBlocks;      // narrowed from GX2GeometryShader::uniformBlocks
    u32                 numUniforms;
    u32                 uniformVars;        // narrowed from GX2GeometryShader::uniformVars
    u32                 numInitialValues;
    u32                 initialValues;      // narrowed from GX2GeometryShader::initialValues
    u32                 _numLoops;
    u32                 _loopVars;          // narrowed from GX2GeometryShader::_loopVars
    u32                 numSamplers;
    u32                 samplerVars;        // narrowed from GX2GeometryShader::samplerVars
    u32                 ringItemsize;
    u32                 hasStreamOut;
    u32                 streamOutVertexStride[GX2_MAX_STREAMOUT_BUFFERS];
    GX2RBuffer          shaderProgram;
    GX2RBuffer          copyShaderProgram;
} GFDGeometryShader;

// Repacked form of GX2PixelShader, filled in by GFDRepackPixelShaderFor32Bit().
// The members that function casts to u32 (shaderPtr, uniformBlocks, uniformVars,
// initialValues, _loopVars, samplerVars) are held as 32-bit values here; all
// other members are copied unchanged.
// NOTE(review): presumably written to the output byte-for-byte like GFDVertexShader,
// in which case member order must not change - confirm against the writer code.

typedef struct _GFDPixelShader
{
    GFDPixelShaderRegs  _regs;
    u32                 shaderSize;
    u32                 shaderPtr;          // narrowed from GX2PixelShader::shaderPtr
    GX2ShaderMode       shaderMode;
    u32                 numUniformBlocks;
    u32                 uniformBlocks;      // narrowed from GX2PixelShader::uniformBlocks
    u32                 numUniforms;
    u32                 uniformVars;        // narrowed from GX2PixelShader::uniformVars
    u32                 numInitialValues;
    u32                 initialValues;      // narrowed from GX2PixelShader::initialValues
    u32                 _numLoops;
    u32                 _loopVars;          // narrowed from GX2PixelShader::_loopVars
    u32                 numSamplers;
    u32                 samplerVars;        // narrowed from GX2PixelShader::samplerVars
    GX2RBuffer          shaderProgram;
} GFDPixelShader;

// Repacked form of GX2ComputeShader, filled in by GFDRepackComputeShaderFor32Bit().
// Only compiled when CAFE_OS_SDK_VERSION >= 21104.
// The members that function casts to u32 (shaderPtr, uniformBlocks, uniformVars,
// initialValues, _loopVars, samplerVars) are held as 32-bit values here; all
// other members are copied unchanged.  Unlike the other shader structs there is
// no shaderMode member.
#if CAFE_OS_SDK_VERSION >= 21104
typedef struct _GFDComputeShader
{
    GFDComputeShaderRegs  _regs;
    u32                 shaderSize;
    u32                 shaderPtr;          // narrowed from GX2ComputeShader::shaderPtr
    u32                 numUniformBlocks;
    u32                 uniformBlocks;      // narrowed from GX2ComputeShader::uniformBlocks
    u32                 numUniforms;
    u32                 uniformVars;        // narrowed from GX2ComputeShader::uniformVars
    u32                 numInitialValues;
    u32                 initialValues;      // narrowed from GX2ComputeShader::initialValues
    u32                 _numLoops;
    u32                 _loopVars;          // narrowed from GX2ComputeShader::_loopVars
    u32                 numSamplers;
    u32                 samplerVars;        // narrowed from GX2ComputeShader::samplerVars
    u32                 layout_size_x;
    u32                 layout_size_y;
    u32                 layout_size_z;
    u32                 Over64Mode;
    u32                 numWavesPerSIMD;
    GX2RBuffer          shaderProgram;
} GFDComputeShader;
#endif

// Repacked form of one GX2UniformBlock element, filled in by
// GFDRepackUniformBlockArrayFor32Bit().  Only `name` is narrowed (cast to u32);
// `location` and `size` are copied unchanged.

typedef struct _GFDUniformBlock
{
    u32          name;          // narrowed from GX2UniformBlock::name
    u32          location;
    u32          size;
} GFDUniformBlock;

// Repacked form of one GX2UniformVar element, filled in by
// GFDRepackUniformVarArrayFor32Bit().  Only `name` is narrowed (cast to u32);
// the remaining members are copied unchanged.

typedef struct _GFDUniformVar
{
    u32          name;          // narrowed from GX2UniformVar::name
    GX2VarType   type;
    u32          arrayCount;
    u32          offset;
    u32          blockIndex;
} GFDUniformVar;

// Repacked form of one GX2AttribVar element, filled in by
// GFDRepackAttribVarArrayFor32Bit().  Only `name` is narrowed (cast to u32);
// the remaining members are copied unchanged.

typedef struct _GFDAttribVar
{
    u32          name;          // narrowed from GX2AttribVar::name
    GX2VarType   type;
    u32          arrayCount;
    u32          location;
} GFDAttribVar;

// Repacked form of one GX2SamplerVar element, filled in by
// GFDRepackSamplerVarArrayFor32Bit().  Only `name` is narrowed (cast to u32);
// `type` and `location` are copied unchanged.

typedef struct _GFDSamplerVar
{
    u32            name;        // narrowed from GX2SamplerVar::name
    GX2SamplerType type;
    u32            location;
} GFDSamplerVar;
// Disabled (#if 0): loop variable register block.  Nothing in this part of the
// file refers to GFDLoopVar; the shader structs above keep _loopVars as a plain u32.
#if 0
typedef struct _GFDLoopVar
{
    u32 reg[GX2_NUM_LOOP_VAR_U32_WORDS];
} GFDLoopVar;
#endif
// name conversions

// Note: These arrays must be kept in sync with the enum lists in gx2Enum.h
// Those lists must be kept in sync with the compiler output.
// The latter check is done elsewhere.  The former is below.
// Printable names of the GX2_VAR_TYPE_* values, 39 entries (0..38).  The trailing
// number on each line is the entry's array index.
// NOTE(review): presumably indexed directly by the numeric GX2VarType value, so
// the order must match the enum in gx2Enum.h - confirm before adding or reordering.
static const char *varTypeName[] = {
    "GX2_VAR_TYPE_VOID",    // 0
    "GX2_VAR_TYPE_BOOL",    // 1
    "GX2_VAR_TYPE_INT",     // 2
    "GX2_VAR_TYPE_UINT",    // 3
    "GX2_VAR_TYPE_FLOAT",   // 4
    "GX2_VAR_TYPE_DOUBLE",  // 5
    "GX2_VAR_TYPE_DVEC2",   // 6
    "GX2_VAR_TYPE_DVEC3",   // 7
    "GX2_VAR_TYPE_DVEC4",   // 8
    "GX2_VAR_TYPE_VEC2",    // 9
    "GX2_VAR_TYPE_VEC3",    // 10
    "GX2_VAR_TYPE_VEC4",    // 11
    "GX2_VAR_TYPE_BVEC2",   // 12
    "GX2_VAR_TYPE_BVEC3",   // 13
    "GX2_VAR_TYPE_BVEC4",   // 14
    "GX2_VAR_TYPE_IVEC2",   // 15
    "GX2_VAR_TYPE_IVEC3",   // 16
    "GX2_VAR_TYPE_IVEC4",   // 17
    "GX2_VAR_TYPE_UVEC2",   // 18
    "GX2_VAR_TYPE_UVEC3",   // 19
    "GX2_VAR_TYPE_UVEC4",   // 20
    "GX2_VAR_TYPE_MAT2",    // 21
    "GX2_VAR_TYPE_MAT2X3",  // 22
    "GX2_VAR_TYPE_MAT2X4",  // 23
    "GX2_VAR_TYPE_MAT3X2",  // 24
    "GX2_VAR_TYPE_MAT3",    // 25
    "GX2_VAR_TYPE_MAT3X4",  // 26
    "GX2_VAR_TYPE_MAT4X2",  // 27
    "GX2_VAR_TYPE_MAT4X3",  // 28
    "GX2_VAR_TYPE_MAT4",    // 29
    "GX2_VAR_TYPE_DMAT2",   // 30
    "GX2_VAR_TYPE_DMAT2X3", // 31
    "GX2_VAR_TYPE_DMAT2X4", // 32
    "GX2_VAR_TYPE_DMAT3X2", // 33
    "GX2_VAR_TYPE_DMAT3",   // 34
    "GX2_VAR_TYPE_DMAT3X4", // 35
    "GX2_VAR_TYPE_DMAT4X2", // 36
    "GX2_VAR_TYPE_DMAT4X3", // 37
    "GX2_VAR_TYPE_DMAT4"    // 38
};

// Printable names of the GX2_SAMPLER_TYPE_* values, 43 entries (0..42).  The
// trailing number on each line is the entry's array index.
// NOTE(review): presumably indexed directly by the numeric GX2SamplerType value,
// so the order must match the enum in gx2Enum.h - confirm before adding or reordering.
static const char *samplerTypeName[] = {
    "GX2_SAMPLER_TYPE_1D",                        // 0
    "GX2_SAMPLER_TYPE_2D",                        // 1
    "GX2_SAMPLER_TYPE_2D_RECT",                   // 2
    "GX2_SAMPLER_TYPE_3D",                        // 3
    "GX2_SAMPLER_TYPE_CUBE",                      // 4
    "GX2_SAMPLER_TYPE_1D_SHADOW",                 // 5
    "GX2_SAMPLER_TYPE_2D_SHADOW",                 // 6
    "GX2_SAMPLER_TYPE_2D_RECT_SHADOW",            // 7
    "GX2_SAMPLER_TYPE_CUBE_SHADOW",               // 8
    "GX2_SAMPLER_TYPE_1D_ARRAY",                  // 9
    "GX2_SAMPLER_TYPE_2D_ARRAY",                  // 10
    "GX2_SAMPLER_TYPE_1D_ARRAY_SHADOW",           // 11
    "GX2_SAMPLER_TYPE_2D_ARRAY_SHADOW",           // 12
    "GX2_SAMPLER_TYPE_CUBE_ARRAY",                // 13
    "GX2_SAMPLER_TYPE_CUBE_ARRAY_SHADOW",         // 14
    "GX2_SAMPLER_TYPE_BUFFER",                    // 15
    "GX2_SAMPLER_TYPE_RENDERBUFFER",              // 16
    "GX2_SAMPLER_TYPE_2D_MS",                     // 17
    "GX2_SAMPLER_TYPE_2D_MS_ARRAY",               // 18
    "GX2_SAMPLER_TYPE_INT_1D",                    // 19
    "GX2_SAMPLER_TYPE_INT_2D",                    // 20
    "GX2_SAMPLER_TYPE_INT_2D_RECT",               // 21
    "GX2_SAMPLER_TYPE_INT_3D",                    // 22
    "GX2_SAMPLER_TYPE_INT_CUBE",                  // 23
    "GX2_SAMPLER_TYPE_INT_1D_ARRAY",              // 24
    "GX2_SAMPLER_TYPE_INT_2D_ARRAY",              // 25
    "GX2_SAMPLER_TYPE_INT_CUBE_ARRAY",            // 26
    "GX2_SAMPLER_TYPE_INT_BUFFER",                // 27
    "GX2_SAMPLER_TYPE_INT_RENDERBUFFER",          // 28
    "GX2_SAMPLER_TYPE_INT_2D_MS",                 // 29
    "GX2_SAMPLER_TYPE_INT_2D_MS_ARRAY",           // 30
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_1D",           // 31
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_2D",           // 32
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_2D_RECT",      // 33
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_3D",           // 34
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_CUBE",         // 35
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_1D_ARRAY",     // 36
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_2D_ARRAY",     // 37
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_CUBE_ARRAY",   // 38
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_BUFFER",       // 39
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_RENDERBUFFER", // 40
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_2D_MS",        // 41
    "GX2_SAMPLER_TYPE_UNSIGNED_INT_2D_MS_ARRAY"   // 42
};

// Printable names of the GX2_SHADER_MODE_* values, 3 entries.
// NOTE(review): presumably indexed directly by the numeric GX2ShaderMode value,
// so the order must match the enum in gx2Enum.h - confirm before adding or reordering.
static const char *shaderModeName[] = {
    "GX2_SHADER_MODE_UNIFORM_REGISTER",
    "GX2_SHADER_MODE_UNIFORM_BLOCK",
    "GX2_SHADER_MODE_GEOMETRY_SHADER"
};

/// Returns Offset with every bit covered by GFD_TAG_MASK cleared, i.e. the
/// offset without the tag that GFDAddStringTable / GFDAddDataTable OR into it.
u32 GFDCleanTag(u32 Offset)
{
    u32 untagged = Offset;
    untagged &= ~GFD_TAG_MASK;
    return untagged;
}

/// Returns non-zero when the tag bits of Offset (those under GFD_TAG_MASK)
/// are exactly GFD_TAG_DAT, the tag applied by GFDAddDataTable.
BOOL GFDCheckTagDAT(u32 Offset)
{
    const u32 tag = Offset & GFD_TAG_MASK;
    return tag == GFD_TAG_DAT;
}

/// Returns non-zero when the tag bits of Offset (those under GFD_TAG_MASK)
/// are exactly GFD_TAG_STR, the tag applied by GFDAddStringTable.
BOOL GFDCheckTagSTR(u32 Offset)
{
    const u32 tag = Offset & GFD_TAG_MASK;
    return tag == GFD_TAG_STR;
}

// Disabled (#if 0) reference copies of the table structures.  The functions
// below still use GFDStringTable and GFDDataTable, so the live definitions must
// come from an included header (presumably gfdMem.h or cafe/gfd.h - confirm).
#if 0
/// A block of null-terminated strings, all stored in the same char array.
/// Every string is padded with zeros at the end so its length is a multiple of one word (4 bytes).
typedef struct _GFDStringTable
{
    u32    m_n;      ///< Current number of strings
    u32    m_nDB;    ///< Current size of the data block, in chars
    u32    m_maxDB;  ///< Max size of the data block, again in chars
    char * m_pDB;    ///< Block of data containing all the strings
} GFDStringTable;

/// A table of data segments, stored contiguously in the same char array.
/// Data tables are very similar to string tables.  They just hold arrays of raw data
/// rather than arrays of null-terminated strings.
typedef struct _GFDDataTable
{
    u32    m_n;      ///< Current number of data blocks in hunk (good for error checking)
    u32    m_nDB;    ///< Current size (i.e. next writeable offset) of m_pDB, in chars
    u32    m_maxDB;  ///< Max size of the data block, again in chars
    char * m_pDB;    ///< Block of data containing all the data
} GFDDataTable;
#endif

#pragma warning ( push )
#pragma warning ( disable: 4302 4311 4312 )

/// Creates a new, empty string table whose character buffer holds `max` chars.
/// The buffer is zero-filled, which GFDAddStringTable's word padding relies on.
///
/// Returns the new table, or NULL (after printing an error) if either the table
/// header or its character buffer could not be allocated.  Release the table
/// with GFDDestroyStringTable().
GFDStringTable *GFDCreateStringTable(u32 max)
{
    GFDStringTable *pTable = (GFDStringTable *) malloc(sizeof(GFDStringTable));

    if ( !pTable )
    {
        printf("Error! Failed to create string table!\n");
        return NULL;
    }

    pTable->m_n     = 0;
    pTable->m_nDB   = 0;
    pTable->m_maxDB = max;

    // Always request at least one byte: malloc(0) is allowed to return NULL,
    // which would otherwise be indistinguishable from a real failure.
    const u32 allocChars = max ? max : 1;
    pTable->m_pDB = (char *) malloc(allocChars * sizeof(char));

    if ( !pTable->m_pDB )
    {
        // Previously this fell through to memset(NULL, ...); fail cleanly and
        // do not leak the header.
        printf("Error! Failed to create string table!\n");
        free(pTable);
        return NULL;
    }

    memset(pTable->m_pDB, 0, allocChars * sizeof(char));
    return pTable;
}

/// Releases a string table: first its character buffer, then the table itself.
/// Passing NULL is a no-op.  The caller's pointer is not modified and must not
/// be used afterwards.
void GFDDestroyStringTable(GFDStringTable *pTable)
{
    if (!pTable)
    {
        return;
    }

    free(pTable->m_pDB);
    free(pTable);
}

/// Grows the string table's character buffer to newMaxDB chars.  Never shrinks:
/// a request that is not larger than the current capacity is ignored.
/// Existing contents are preserved and the newly added tail is zero-filled.
///
/// If the allocation fails an error is printed and the table is left exactly as
/// it was (same buffer, same m_maxDB); callers must re-check m_maxDB before
/// writing past the old capacity.
void GFDGrowStringTableDB(GFDStringTable *pTable, u32 newMaxDB)
{
    if (newMaxDB <= pTable->m_maxDB)
    {
        return;     // already big enough
    }

    char *pDB = (char *) malloc(newMaxDB * sizeof(char));
    if (!pDB)
    {
        // Previously this fell through to memset(NULL, ...).
        printf("Error! Failed to grow string table!\n");
        return;
    }

    // Carry the old contents over, then zero only the part that is new.
    if (pTable->m_maxDB != 0)
    {
        memcpy(pDB, pTable->m_pDB, pTable->m_maxDB);
    }
    memset(pDB + pTable->m_maxDB, 0, newMaxDB - pTable->m_maxDB);

    free(pTable->m_pDB);
    pTable->m_pDB   = pDB;
    pTable->m_maxDB = newMaxDB;
}

/// Appends a null-terminated string to the string table.
///
/// The string is stored with its terminator and then zero-padded up to the next
/// multiple of 4 bytes, so every entry starts word aligned.  The table is grown
/// on demand.  No duplicate detection is done: adding the same string twice
/// stores it twice.
///
/// Returns the byte offset of the stored string ORed with GFD_TAG_STR.
/// If the table could not be grown enough to hold the string, an error is
/// printed, the table is left unchanged and 0 is returned.
/// NOTE(review): 0 is only distinguishable from a valid result if GFD_TAG_STR is
/// non-zero (GFDCheckTagSTR would then reject it) - confirm against the header.
u32 GFDAddStringTable(GFDStringTable *pTable,  const char *str)
{
    const size_t len    = strlen(str);
    // Stored size: string + terminating 0, rounded up to a word boundary.
    const u32    lenPad = (u32) ((len + 1 + 3) & ~(size_t) 0x3);

    // Same growth trigger and growth policy as before, so capacities evolve identically.
    if (pTable->m_nDB + lenPad + 1 >= pTable->m_maxDB)
    {
        u32 newSize = pTable->m_nDB;                    // default: double the used size
        if (lenPad >= newSize)
        {
            newSize = lenPad * 2;                       // be safe for really long strings
        }
        newSize = (newSize + 0x1f) & ~(u32) 0x1f;       // round to a multiple of 32
        GFDGrowStringTableDB(pTable, pTable->m_nDB + newSize);
    }

    // The grow can fail (allocation failure, or u32 wrap-around on huge sizes);
    // never write past the buffer in that case.
    if (pTable->m_nDB + lenPad > pTable->m_maxDB)
    {
        printf("Error! Failed to add string to string table!\n");
        return 0;
    }

    const u32 off = pTable->m_nDB;
    char     *dst = pTable->m_pDB + off;

    // Copy the string including its terminator, then zero the padding explicitly
    // instead of relying on the buffer having been pre-zeroed.
    memcpy(dst, str, len + 1);
    memset(dst + len + 1, 0, lenPad - (len + 1));

    // update our structs pointing to it
    pTable->m_nDB += lenPad;
    pTable->m_n   += 1;

    return off | GFD_TAG_STR;
}

/// Creates a new, empty data table.  The byte buffer is sized for `max` u32
/// words (max * sizeof(u32) bytes) and filled with 0xbb so that untouched
/// regions are easy to spot.
///
/// Returns the new table, or NULL (after printing an error) if either the table
/// header or its buffer could not be allocated.  Release the table with
/// GFDDestroyDataTable().
GFDDataTable *GFDCreateDataTable(u32 max)
{
    GFDDataTable *pTable = (GFDDataTable *) malloc(sizeof(GFDDataTable));

    if ( !pTable )
    {
        printf("Error! Failed to create data table!\n");
        return NULL;
    }

    pTable->m_n     = 0;
    pTable->m_nDB   = 0;
    pTable->m_maxDB = max * sizeof(u32);    // capacity is kept in bytes

    // Always request at least one byte: malloc(0) is allowed to return NULL,
    // which would otherwise be indistinguishable from a real failure.
    const u32 allocBytes = pTable->m_maxDB ? pTable->m_maxDB : 1;
    pTable->m_pDB = (char *) malloc(allocBytes * sizeof(char));

    if ( !pTable->m_pDB )
    {
        // Previously this fell through to memset(NULL, ...); fail cleanly and
        // do not leak the header.
        printf("Error! Failed to create data table!\n");
        free(pTable);
        return NULL;
    }

    memset(pTable->m_pDB, 0xbb, allocBytes * sizeof(char));
    return pTable;
}

/// Releases a data table: first its byte buffer, then the table itself.
/// Passing NULL is a no-op.  The caller's pointer is not modified and must not
/// be used afterwards.
void GFDDestroyDataTable(GFDDataTable *pTable)
{
    if (!pTable)
    {
        return;
    }

    free(pTable->m_pDB);
    free(pTable);
}

/// Grows the data table's byte buffer to newMaxDB bytes.  Never shrinks: a
/// request that is not larger than the current capacity is ignored.
/// Existing contents are preserved and the newly added tail is zero-filled.
///
/// If the allocation fails an error is printed and the table is left exactly as
/// it was (same buffer, same m_maxDB); callers must re-check m_maxDB before
/// writing past the old capacity.
void GFDGrowDataTableDB(GFDDataTable *pTable, u32 newMaxDB)
{
    if (newMaxDB <= pTable->m_maxDB)
    {
        return;     // already big enough
    }

    char *pDB = (char *) malloc(newMaxDB * sizeof(char));
    if (!pDB)
    {
        // Previously this fell through to memset(NULL, ...).
        printf("Error! Failed to grow data table!\n");
        return;
    }

    // Carry the old contents over, then zero only the part that is new.
    if (pTable->m_maxDB != 0)
    {
        memcpy(pDB, pTable->m_pDB, pTable->m_maxDB);
    }
    memset(pDB + pTable->m_maxDB, 0, newMaxDB - pTable->m_maxDB);

    free(pTable->m_pDB);
    pTable->m_pDB   = pDB;
    pTable->m_maxDB = newMaxDB;
}

/// Appends a hunk of raw data to the data table.
///
/// nBytes must be a multiple of 4 (asserted).  The bytes are copied verbatim to
/// the current end of the table; no length prefix is stored.  The table is
/// grown on demand.
///
/// Returns the byte offset of the stored data ORed with GFD_TAG_DAT.
/// If the table could not be grown enough to hold the data, an error is
/// printed, the table is left unchanged and 0 is returned.
/// NOTE(review): 0 is only distinguishable from a valid result if GFD_TAG_DAT is
/// non-zero (GFDCheckTagDAT would then reject it) - confirm against the header.
u32 GFDAddDataTable(GFDDataTable *pTable, void *data, u32 nBytes)
{
    assert( (nBytes & 0x3) == 0 && "nBytes must be multiple of 4");

    // Same growth trigger and growth policy as before, so capacities evolve identically.
    if (pTable->m_nDB + nBytes + sizeof(u32) >= pTable->m_maxDB)
    {
        size_t newSize = pTable->m_nDB;                 // default: double the used size
        if (nBytes >= newSize)
        {
            newSize = (size_t) nBytes * 2;              // if doubling isn't enough, make bigger
        }
        // Round up to a multiple of 32 bytes; keep the arithmetic in size_t
        // instead of squeezing it through a signed int.
        const size_t finalSize = (pTable->m_nDB + newSize + 0x1f) & ~(size_t) 0x1f;
        GFDGrowDataTableDB(pTable, (u32) finalSize);
    }

    // The grow can fail (allocation failure, or the requested size not fitting
    // in a u32); never write past the buffer in that case.
    if ((size_t) pTable->m_nDB + nBytes > pTable->m_maxDB)
    {
        printf("Error! Failed to add data to data table!\n");
        return 0;
    }

    const u32 off = pTable->m_nDB;      // the data starts at the current end of the table

    memcpy(pTable->m_pDB + off, data, nBytes);

    // update our structs pointing to it
    pTable->m_nDB += nBytes;
    pTable->m_n   += 1;

    return off | GFD_TAG_DAT;           // tag the offset so misuse can be caught later
}

//--------------------------------------------------------------------------

/// Converts a host-side GX2VertexShader into the fixed-layout GFDVertexShader.
/// The register block and the stream-out stride array are copied byte-for-byte
/// (their sizes are asserted to match), the seven pointer members are narrowed
/// to u32, and every other member is copied unchanged.
/// Returns sizeof(GFDVertexShader), the size of the repacked structure.
u32 GFDRepackVertexShaderFor32Bit(GX2VertexShader *pVSin64, GFDVertexShader *pVSout32)
{
    GX2VertexShader *src = pVSin64;
    GFDVertexShader *dst = pVSout32;

    // Register words.
    assert(sizeof(dst->_regs) == sizeof(src->_regs));
    memcpy(&dst->_regs, src->_regs, sizeof(dst->_regs));

    // Pointer members, narrowed to 32 bits.
    dst->shaderPtr     = (u32) src->shaderPtr;
    dst->uniformBlocks = (u32) src->uniformBlocks;
    dst->uniformVars   = (u32) src->uniformVars;
    dst->initialValues = (u32) src->initialValues;
    dst->_loopVars     = (u32) src->_loopVars;
    dst->samplerVars   = (u32) src->samplerVars;
    dst->attribVars    = (u32) src->attribVars;

    // Sizes, counts, mode and flags.
    dst->shaderSize       = src->shaderSize;
    dst->shaderMode       = src->shaderMode;
    dst->numUniformBlocks = src->numUniformBlocks;
    dst->numUniforms      = src->numUniforms;
    dst->numInitialValues = src->numInitialValues;
    dst->_numLoops        = src->_numLoops;
    dst->numSamplers      = src->numSamplers;
    dst->numAttribs       = src->numAttribs;
    dst->ringItemsize     = src->ringItemsize;
    dst->hasStreamOut     = (u32) src->hasStreamOut;

    // Stream-out strides.
    assert(sizeof(dst->streamOutVertexStride) == sizeof(src->streamOutVertexStride));
    memcpy(&dst->streamOutVertexStride, src->streamOutVertexStride, sizeof(dst->streamOutVertexStride));

    dst->shaderProgram = src->shaderProgram;

    return sizeof(GFDVertexShader);
}

/// Converts a host-side GX2PixelShader into the fixed-layout GFDPixelShader.
/// The register block is copied byte-for-byte (its size is asserted to match),
/// the six pointer members are narrowed to u32, and every other member is
/// copied unchanged.
/// Returns sizeof(GFDPixelShader), the size of the repacked structure.
u32 GFDRepackPixelShaderFor32Bit(GX2PixelShader *pPSin64, GFDPixelShader *pPSout32)
{
    GX2PixelShader *src = pPSin64;
    GFDPixelShader *dst = pPSout32;

    // Register words.
    assert(sizeof(dst->_regs) == sizeof(src->_regs));
    memcpy(&dst->_regs, src->_regs, sizeof(dst->_regs));

    // Pointer members, narrowed to 32 bits.
    dst->shaderPtr     = (u32) src->shaderPtr;
    dst->uniformBlocks = (u32) src->uniformBlocks;
    dst->uniformVars   = (u32) src->uniformVars;
    dst->initialValues = (u32) src->initialValues;
    dst->_loopVars     = (u32) src->_loopVars;
    dst->samplerVars   = (u32) src->samplerVars;

    // Sizes, counts and mode.
    dst->shaderSize       = src->shaderSize;
    dst->shaderMode       = src->shaderMode;
    dst->numUniformBlocks = src->numUniformBlocks;
    dst->numUniforms      = src->numUniforms;
    dst->numInitialValues = src->numInitialValues;
    dst->_numLoops        = src->_numLoops;
    dst->numSamplers      = src->numSamplers;

    dst->shaderProgram = src->shaderProgram;

    return sizeof(GFDPixelShader);
}

/// Converts a host-side GX2GeometryShader into the fixed-layout GFDGeometryShader.
/// The register block and the stream-out stride array are copied byte-for-byte
/// (their sizes are asserted to match), the seven pointer members are narrowed
/// to u32, and every other member is copied unchanged.
/// Returns sizeof(GFDGeometryShader), the size of the repacked structure.
u32 GFDRepackGeometryShaderFor32Bit(GX2GeometryShader *pGSin64, GFDGeometryShader *pGSout32)
{
    GX2GeometryShader *src = pGSin64;
    GFDGeometryShader *dst = pGSout32;

    // Register words.
    assert(sizeof(dst->_regs) == sizeof(src->_regs));
    memcpy(&dst->_regs, src->_regs, sizeof(dst->_regs));

    // Pointer members, narrowed to 32 bits.
    dst->shaderPtr     = (u32) src->shaderPtr;
    dst->copyShaderPtr = (u32) src->copyShaderPtr;
    dst->uniformBlocks = (u32) src->uniformBlocks;
    dst->uniformVars   = (u32) src->uniformVars;
    dst->initialValues = (u32) src->initialValues;
    dst->_loopVars     = (u32) src->_loopVars;
    dst->samplerVars   = (u32) src->samplerVars;

    // Sizes, counts, mode and flags.
    dst->shaderSize       = src->shaderSize;
    dst->copyShaderSize   = src->copyShaderSize;
    dst->shaderMode       = src->shaderMode;
    dst->numUniformBlocks = src->numUniformBlocks;
    dst->numUniforms      = src->numUniforms;
    dst->numInitialValues = src->numInitialValues;
    dst->_numLoops        = src->_numLoops;
    dst->numSamplers      = src->numSamplers;
    dst->ringItemsize     = src->ringItemsize;
    dst->hasStreamOut     = (u32) src->hasStreamOut;

    // Stream-out strides.
    assert(sizeof(dst->streamOutVertexStride) == sizeof(src->streamOutVertexStride));
    memcpy(&dst->streamOutVertexStride, src->streamOutVertexStride, sizeof(dst->streamOutVertexStride));

    dst->shaderProgram     = src->shaderProgram;
    dst->copyShaderProgram = src->copyShaderProgram;

    return sizeof(GFDGeometryShader);
}

#if CAFE_OS_SDK_VERSION >= 21104
/// Converts a host-side GX2ComputeShader into the fixed-layout GFDComputeShader.
/// The register block is copied byte-for-byte (its size is asserted to match),
/// the six pointer members are narrowed to u32, and every other member is
/// copied unchanged.
/// Returns sizeof(GFDComputeShader), the size of the repacked structure.
u32 GFDRepackComputeShaderFor32Bit(GX2ComputeShader *pCSin64, GFDComputeShader *pCSout32)
{
    GX2ComputeShader *src = pCSin64;
    GFDComputeShader *dst = pCSout32;

    // Register words.
    assert(sizeof(dst->_regs) == sizeof(src->_regs));
    memcpy(&dst->_regs, src->_regs, sizeof(dst->_regs));

    // Pointer members, narrowed to 32 bits.
    dst->shaderPtr     = (u32) src->shaderPtr;
    dst->uniformBlocks = (u32) src->uniformBlocks;
    dst->uniformVars   = (u32) src->uniformVars;
    dst->initialValues = (u32) src->initialValues;
    dst->_loopVars     = (u32) src->_loopVars;
    dst->samplerVars   = (u32) src->samplerVars;

    // Sizes and counts.
    dst->shaderSize       = src->shaderSize;
    dst->numUniformBlocks = src->numUniformBlocks;
    dst->numUniforms      = src->numUniforms;
    dst->numInitialValues = src->numInitialValues;
    dst->_numLoops        = src->_numLoops;
    dst->numSamplers      = src->numSamplers;

    // Work-group layout and scheduling parameters.
    dst->layout_size_x   = src->layout_size_x;
    dst->layout_size_y   = src->layout_size_y;
    dst->layout_size_z   = src->layout_size_z;
    dst->Over64Mode      = src->Over64Mode;
    dst->numWavesPerSIMD = src->numWavesPerSIMD;

    dst->shaderProgram = src->shaderProgram;

    return sizeof(GFDComputeShader);
}
#endif

/// Converts an array of n GX2UniformBlock elements into GFDUniformBlock elements.
/// For each element `name` is narrowed to u32; `location` and `size` are copied
/// unchanged.  pUBout32 must have room for n elements.
/// Returns the size in bytes of the repacked array, sizeof(GFDUniformBlock) * n.
u32 GFDRepackUniformBlockArrayFor32Bit(GX2UniformBlock *pUBin64, GFDUniformBlock *pUBout32, u32 n)
{
    GX2UniformBlock *src = pUBin64;
    GFDUniformBlock *dst = pUBout32;

    for (u32 left = n; left > 0; --left, ++src, ++dst)
    {
        dst->name     = (u32) src->name;
        dst->location = src->location;
        dst->size     = src->size;
    }

    return sizeof(GFDUniformBlock)*n;
}

/// Converts an array of n GX2UniformVar elements into GFDUniformVar elements.
/// For each element `name` is narrowed to u32; `type`, `arrayCount`, `offset`
/// and `blockIndex` are copied unchanged.  pUVout32 must have room for n elements.
/// Returns the size in bytes of the repacked array, sizeof(GFDUniformVar) * n.
u32 GFDRepackUniformVarArrayFor32Bit(GX2UniformVar *pUVin64, GFDUniformVar *pUVout32, u32 n)
{
    GX2UniformVar *src = pUVin64;
    GFDUniformVar *dst = pUVout32;

    for (u32 left = n; left > 0; --left, ++src, ++dst)
    {
        dst->name       = (u32) src->name;
        dst->type       = src->type;
        dst->arrayCount = src->arrayCount;
        dst->offset     = src->offset;
        dst->blockIndex = src->blockIndex;
    }

    return sizeof(GFDUniformVar)*n;
}

/// Converts an array of n GX2AttribVar elements into GFDAttribVar elements.
/// For each element `name` is narrowed to u32; `type`, `arrayCount` and
/// `location` are copied unchanged.  pAVout32 must have room for n elements.
/// Returns the size in bytes of the repacked array, sizeof(GFDAttribVar) * n.
u32 GFDRepackAttribVarArrayFor32Bit(GX2AttribVar *pAVin64, GFDAttribVar *pAVout32, u32 n)
{
    GX2AttribVar *src = pAVin64;
    GFDAttribVar *dst = pAVout32;

    for (u32 left = n; left > 0; --left, ++src, ++dst)
    {
        dst->name       = (u32) src->name;
        dst->type       = src->type;
        dst->arrayCount = src->arrayCount;
        dst->location   = src->location;
    }

    return sizeof(GFDAttribVar)*n;
}

/// Converts an array of n GX2SamplerVar elements into GFDSamplerVar elements.
/// For each element `name` is narrowed to u32; `type` and `location` are copied
/// unchanged.  pSVout32 must have room for n elements.
/// Returns the size in bytes of the repacked array, sizeof(GFDSamplerVar) * n.
u32 GFDRepackSamplerVarArrayFor32Bit(GX2SamplerVar *pSVin64, GFDSamplerVar *pSVout32, u32 n)
{
    GX2SamplerVar *src = pSVin64;
    GFDSamplerVar *dst = pSVout32;

    for (u32 left = n; left > 0; --left, ++src, ++dst)
    {
        dst->name     = (u32) src->name;
        dst->type     = src->type;
        dst->location = src->location;
    }

    return sizeof(GFDSamplerVar)*n;
}
// ------------------------------------------------------------

/// Create the flat datablock representation of a GX2VertexShader structure.
/// The shader is repacked into its 32-bit GFD layout and stored in a new data table, followed by
/// its uniform block, uniform, initial value, loop, sampler and attribute arrays, a string table
/// holding all of their names, the offset patch list and a GFDBlockRelocationHeader trailer.
/// Returns NULL (after printing an error) if an allocation fails.
/// Call GFDDestroyDataTable() on the returned object once done with it.
GFDDataTable* GFDCreateBlockRelocateHeaderVSH(GX2VertexShader *pVS)
{
    // Create second data structure to hold flattened, offseted version of our original shader
    GX2VertexShader vsCopy;
    memcpy(&vsCopy,  pVS,  sizeof(GX2VertexShader));

    // Walk thru copy, converting all pointers to data blocks in table, and changing
    // addresses to offsets into the data block

    // Create data table to hold the structure elements
    GFDDataTable *pDT = GFDCreateDataTable(sizeof(GX2VertexShader) + vsCopy.numUniforms * 8 + 512);      // todo - pick better number
    if ( !pDT )
    {
        printf("Error! Can't create GFD Data Table!\n");
        return NULL;
    }

    // How many pointers do we need to patch? (The 11 here is empirically determined - asserts at end if wrong)
    // 11 = 1 (shader itself) + 7 in GX2VertexShader + 1 (string table) + 1 (patch list) + 1 (trailer)
    int nElements = 11 + vsCopy.numUniformBlocks + vsCopy.numUniforms + vsCopy.numSamplers + vsCopy.numAttribs;
    int nE = 0;
    u32 size;
    u32 *pAddr   = (u32*) malloc( nElements * sizeof(u32));    // src offsets into data block that need patching
    u32 *pOffset = (u32*) malloc( nElements * sizeof(u32));    // dst offsets into data block, containing offsets to write

    if ( !pAddr || !pOffset )
    {
        printf("Error! Memory allocation failure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    memset(pOffset, 0, nElements * sizeof(u32));
    memset(pAddr, 0,  nElements * sizeof(u32));

    // 0: Store main structure itself  (we'll rewrite offsets at the end).
    // For allocation and alignment purposes, this needs to be the first hunk in the data table
    GFDVertexShader vsCopy32;
    size = GFDRepackVertexShaderFor32Bit(&vsCopy, &vsCopy32);
    int oMain = nE;
    pOffset[nE] = GFDAddDataTable(pDT, &vsCopy32, size);
    pAddr[nE]   = 0;       // don't patch this location
    nE++;

    // 1: Store uniform block/buffer array
    // (a zero-length malloc may legitimately return NULL, so only a non-empty request counts as a failure)
    GFDUniformBlock *pUB = (GFDUniformBlock *) malloc(sizeof(GFDUniformBlock)*vsCopy.numUniformBlocks);
    if ( !pUB && vsCopy.numUniformBlocks != 0 )
    {
        printf("Error! Failed to allocate Uniform Block structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformBlockArrayFor32Bit(vsCopy.uniformBlocks, pUB, vsCopy.numUniformBlocks);
    int oUniformBuffers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUB, size);
    pAddr[nE]  = (vsCopy.numUniformBlocks == 0) ? 0 : pOffset[oMain] + (u32)&vsCopy32.uniformBlocks - (u32)&vsCopy32;
    nE++;
    free(pUB);

    // 2: Store uniform array
    GFDUniformVar *pUV = (GFDUniformVar *) malloc(sizeof(GFDUniformVar)*vsCopy.numUniforms);
    if ( !pUV && vsCopy.numUniforms != 0 )
    {
        printf("Error! Failed to allocate Uniform Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformVarArrayFor32Bit(vsCopy.uniformVars, pUV, vsCopy.numUniforms);
    int oUniforms = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUV, size);
    pAddr[nE]  = (vsCopy.numUniforms == 0) ? 0 : pOffset[oMain] + (u32)&vsCopy32.uniformVars - (u32)&vsCopy32;
    nE++;
    free(pUV);

    // 3: Store uniform initial values
    // Note initial values points to uniform block, a contiguous section of GX2UniformInitialValues
    pOffset[nE] = GFDAddDataTable(pDT, vsCopy.initialValues, vsCopy.numInitialValues * sizeof(GX2UniformInitialValue));
    pAddr[nE]   = (vsCopy.numInitialValues == 0) ? 0 : pOffset[oMain] + (u32)&vsCopy32.initialValues - (u32)&vsCopy32;
    nE++;

    // 4: Store loop
    pOffset[nE] = GFDAddDataTable(pDT, vsCopy._loopVars, vsCopy._numLoops * sizeof(GFDLoopVar));
    pAddr[nE]  = (vsCopy._numLoops == 0) ? 0 : pOffset[oMain] + (u32)&vsCopy32._loopVars - (u32)&vsCopy32;
    nE++;

    // 5: Store sampler descriptors
    GFDSamplerVar *pSV = (GFDSamplerVar *) malloc(sizeof(GFDSamplerVar)*vsCopy.numSamplers);
    if ( !pSV && vsCopy.numSamplers != 0 )
    {
        printf("Error! Failed to allocate Sampler Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackSamplerVarArrayFor32Bit(vsCopy.samplerVars, pSV, vsCopy.numSamplers);
    int oSamplers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pSV, size);
    pAddr[nE]   = (vsCopy.numSamplers == 0) ? 0 : pOffset[oMain] + (u32)&vsCopy32.samplerVars - (u32)&vsCopy32;
    nE++;
    free(pSV);

    // 6: Store Attributes
    GFDAttribVar *pAV = (GFDAttribVar *) malloc(sizeof(GFDAttribVar)*vsCopy.numAttribs);
    if ( !pAV && vsCopy.numAttribs != 0 )
    {
        printf("Error! Failed to allocate Attribute Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackAttribVarArrayFor32Bit(vsCopy.attribVars, pAV, vsCopy.numAttribs);
    int oAttrib_names = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pAV, size);
    pAddr[nE]  = (vsCopy.numAttribs == 0) ? 0 : pOffset[oMain] + (u32)&vsCopy32.attribVars - (u32)&vsCopy32;
    nE++;
    free(pAV);

    // 7a: Create a string table to store all the strings in
    const int kAvgCharsPerString = 12;   // will auto-grow if actually bigger
    GFDStringTable *pStrTable = GFDCreateStringTable( ( vsCopy.numUniformBlocks + vsCopy.numUniforms + vsCopy.numSamplers + vsCopy.numAttribs) * kAvgCharsPerString);
    if ( !pStrTable )
    {
        printf("Error! Failed to allocate String Table!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    u32 offStringTable = pDT->m_nDB;     // current offset; step 8 is expected to place the string table here (asserted at the end)

    // s1: Store each uniform block name (in common string table), as well as pointer to the initial value in uniform block
    for(u32 i = 0; i < vsCopy.numUniformBlocks; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, vsCopy.uniformBlocks[i].name );
        pAddr[nE]   = pOffset[oUniformBuffers] + i * sizeof(GFDUniformBlock) + (u32)&vsCopy.uniformBlocks[i].name - (u32)&vsCopy.uniformBlocks[i];
        nE++;
    }

    // s2: Store each uniform name (in common string table)
    for(u32 i = 0; i < vsCopy.numUniforms; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, vsCopy.uniformVars[i].name );
        pAddr[nE]   = pOffset[oUniforms] + i * sizeof(GFDUniformVar) + (u32)&vsCopy.uniformVars[i].name - (u32)&vsCopy.uniformVars[i];
        nE++;
    }

    // s3: Store each sampler name (in common string table)
    for(u32 i = 0; i < vsCopy.numSamplers; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, vsCopy.samplerVars[i].name );
        pAddr[nE]   = pOffset[oSamplers] + i * sizeof(GFDSamplerVar) + (u32)&vsCopy.samplerVars[i].name - (u32)&vsCopy.samplerVars[i];
        nE++;
    }

    // s4: Store each attrib name (in common string table)
    for(u32 i = 0; i < vsCopy.numAttribs; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, vsCopy.attribVars[i].name );
        pAddr[nE]   = pOffset[oAttrib_names] + i * sizeof(GFDAttribVar) + (u32)&vsCopy.attribVars[i].name - (u32)&vsCopy.attribVars[i];
        nE++;
    }

    // 8: Store the string table (watch out for 1-3 bytes of padding)
    int oStringTable = nE;

    // Note, although arrays of chars don't seem to be modified to go into network order, we cache our
    // stringtable as a block in a word array which does get byte-flipped.  So let's pre-flip it here
    // so it comes out right.
    int nWordsStrTbl = (pStrTable->m_nDB + 0x3) / 4;
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);
    pOffset[nE] = GFDAddDataTable(pDT, pStrTable->m_pDB, nWordsStrTbl*4);  // simply write out string table data
    pAddr[nE]   = 0;        // don't patch this location
    nE++;

    // let's convert it back so if we read it later, we won't have problems
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);

    // end0: Turn all our patch addresses to offsets
    for(int i = 0; i < nE; i++)
    {
        if(pAddr[i] != 0)
        {
            *((u32*) (pDT->m_pDB + GFDCleanTag(pAddr[i]))) = pOffset[i];
        }
    }

    // 9: Store the offset patch list
    // (not *really* needed, could reconstruct if know all data types, but makes it a *lot* easier)
    // After reading data block in at addrX, increment each location i of *(addrX + pAddr[i]) += addrX;
    // By putting this after main, we can allocate space for it at same time and deallocate it when main goes away,
    // without affecting the main data.
    int oPatchTable = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pAddr, nElements * sizeof(u32));  // all nElements slots are written; unused ones are zero
    pAddr[nE]   = 0;        // don't patch this location either
    nE++;

    // 10: Finally, a small structure describing this data block.
    // It is only needed until it has been copied into the data table, so it lives on the stack.
    GFDBlockRelocationHeader vshTrailer;
    memset(&vshTrailer, 0, sizeof(GFDBlockRelocationHeader));
    vshTrailer.magic = GFD_SWAP_8_IN_32(GFD_BLOCK_RELOCATION_HEADER_MAGIC);
    vshTrailer.type  = 0;
    vshTrailer.size  = sizeof(GFDBlockRelocationHeader);

    // Fill in our trailer and write it out
    vshTrailer.dataSize               = GFDCleanTag(pOffset[oPatchTable]) - GFDCleanTag(pOffset[oMain]);  // size of the main data section (allocate this size of contiguous memory)
    vshTrailer.dataOffset             = pOffset[oMain];             // offset of the main data section in this block
    vshTrailer.stringTableCharNumber  = pStrTable->m_nDB;           // number of characters in the string table
    vshTrailer.stringTableOffset      = pOffset[oStringTable];      // offset of string table in this block
    vshTrailer.patchTableOffsetNumber = nE;                         // number of offsets in the patch table
    vshTrailer.patchTableOffset       = pOffset[oPatchTable] ;      // offset of the patch table in this block

    pOffset[nE] = GFDAddDataTable(pDT, &vshTrailer, sizeof(GFDBlockRelocationHeader));
    pAddr[nE] = 0;          // don't patch this location
    nE++;

    assert(offStringTable == GFDCleanTag(pOffset[oStringTable]) && "Guess for offset table in 7a was wrong");
    assert(nE <= nElements && "Too few offsets allocated");

    free(pAddr);
    free(pOffset);

    GFDDestroyStringTable( pStrTable );
    return pDT;
}

/// Create the flat datablock representation of a GX2PixelShader structure.
/// The shader is repacked into its 32-bit GFD layout and stored in a new data table, followed by
/// its uniform block, uniform, initial value, loop and sampler arrays, a string table holding
/// all of their names, the offset patch list and a GFDBlockRelocationHeader trailer.
/// Returns NULL (after printing an error) if an allocation fails.
/// Call GFDDestroyDataTable() on the returned object once done with it.
GFDDataTable* GFDCreateBlockRelocateHeaderPSH(GX2PixelShader *pPS)
{
    // Create second data structure to hold flattened, offseted version of our original shader
    GX2PixelShader psCopy;
    memcpy(&psCopy,  pPS,  sizeof(GX2PixelShader));

    // Walk thru copy, converting all pointers to data blocks in table, and changing
    // addresses to offsets into the data block

    // Create data table to hold the structure elements
    GFDDataTable *pDT = GFDCreateDataTable(sizeof(GX2PixelShader) + psCopy.numUniforms * 8 + 1024);      // todo - pick better number
    if ( !pDT )
    {
        printf("Error! Can't create GFD Data Table!\n");
        return NULL;
    }

    // How many pointers do we need to patch? (The 10 here is empirically determined - asserts at end if wrong)
    // 10 = 1 (shader itself) + 6 in GX2PixelShader + 1 (string table) + 1 (patch list) + 1 (trailer)
    int nElements = 10 + psCopy.numUniformBlocks + psCopy.numUniforms + psCopy.numSamplers;
    int nE = 0;
    u32 size;
    u32 *pAddr   = (u32*) malloc( nElements * sizeof(u32));    // src offsets into data block that need patching
    u32 *pOffset = (u32*) malloc( nElements * sizeof(u32));    // dst offsets into data block, containing offsets to write

    if ( !pAddr || !pOffset )
    {
        printf("Error! Memory allocation failure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    memset(pOffset, 0, nElements * sizeof(u32));
    memset(pAddr, 0,  nElements * sizeof(u32));

    // 0: Store main structure itself  (we'll rewrite offsets at the end).
    // For allocation and alignment purposes, this needs to be the first hunk in the data table
    GFDPixelShader psCopy32;
    size = GFDRepackPixelShaderFor32Bit(&psCopy, &psCopy32);
    int oMain = nE;
    pOffset[nE] = GFDAddDataTable(pDT, &psCopy32, size);
    pAddr[nE]   = 0;       // don't patch this location
    nE++;

    // 1: Store uniform block array
    // (a zero-length malloc may legitimately return NULL, so only a non-empty request counts as a failure)
    GFDUniformBlock *pUB = (GFDUniformBlock *) malloc(sizeof(GFDUniformBlock)*psCopy.numUniformBlocks);
    if ( !pUB && psCopy.numUniformBlocks != 0 )
    {
        printf("Error! Failed to allocate Uniform Block structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformBlockArrayFor32Bit(psCopy.uniformBlocks, pUB, psCopy.numUniformBlocks);
    int oUniformBuffers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUB, size);
    pAddr[nE]  = (psCopy.numUniformBlocks == 0) ? 0 : pOffset[oMain] + (u32)&psCopy32.uniformBlocks - (u32)&psCopy32;
    nE++;
    free(pUB);

    // 2: Store uniform array
    GFDUniformVar *pUV = (GFDUniformVar *) malloc(sizeof(GFDUniformVar)*psCopy.numUniforms);
    if ( !pUV && psCopy.numUniforms != 0 )
    {
        printf("Error! Failed to allocate Uniform Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformVarArrayFor32Bit(psCopy.uniformVars, pUV, psCopy.numUniforms);
    int oUniforms = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUV, size);
    pAddr[nE]  = (psCopy.numUniforms == 0) ? 0 : pOffset[oMain] + (u32)&psCopy32.uniformVars - (u32)&psCopy32;
    nE++;
    free(pUV);

    // 3: Store uniform initial values
    // Note initial values points to uniform block, a contiguous section of GX2UniformInitialValues
    pOffset[nE] = GFDAddDataTable(pDT, psCopy.initialValues, psCopy.numInitialValues * sizeof(GX2UniformInitialValue));
    pAddr[nE]   = (psCopy.numInitialValues == 0) ? 0 : pOffset[oMain] + (u32)&psCopy32.initialValues - (u32)&psCopy32;
    nE++;

    // 4: Store loop array
    pOffset[nE] = GFDAddDataTable(pDT, psCopy._loopVars, psCopy._numLoops * sizeof(GFDLoopVar));
    pAddr[nE]  = (psCopy._numLoops == 0) ? 0 : pOffset[oMain] + (u32)&psCopy32._loopVars - (u32)&psCopy32;
    nE++;

    // 5: Store sampler descriptors
    GFDSamplerVar *pSV = (GFDSamplerVar *) malloc(sizeof(GFDSamplerVar)*psCopy.numSamplers);
    if ( !pSV && psCopy.numSamplers != 0 )
    {
        printf("Error! Failed to allocate Sampler Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackSamplerVarArrayFor32Bit(psCopy.samplerVars, pSV, psCopy.numSamplers);
    int oSamplers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pSV, size);
    pAddr[nE]   = (psCopy.numSamplers == 0) ? 0 : pOffset[oMain] + (u32)&psCopy32.samplerVars - (u32)&psCopy32;
    nE++;
    free(pSV);

    // 6a: Create a string table to store all the strings in
    const int kAvgCharsPerString = 12;   // will auto-grow if actually bigger
    GFDStringTable *pStrTable = GFDCreateStringTable( ( psCopy.numUniformBlocks +  psCopy.numUniforms + psCopy.numSamplers) * kAvgCharsPerString);
    if ( !pStrTable )
    {
        printf("Error! Failed to allocate String Table!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    u32 offStringTable = pDT->m_nDB;     // current offset; step 7 is expected to place the string table here (asserted at the end)

    // s1: Store each uniform block name (in common string table), as well as pointer to the initial value in uniform block
    for(u32 i = 0; i < psCopy.numUniformBlocks; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, psCopy.uniformBlocks[i].name );
        pAddr[nE]   = pOffset[oUniformBuffers] + i * sizeof(GFDUniformBlock) + (u32)&psCopy.uniformBlocks[i].name - (u32)&psCopy.uniformBlocks[i];
        nE++;
    }

    // s2: Store each uniform name (in common string table)
    for(u32 i = 0; i < psCopy.numUniforms; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, psCopy.uniformVars[i].name );
        pAddr[nE]   = pOffset[oUniforms] + i * sizeof(GFDUniformVar) + (u32)&psCopy.uniformVars[i].name - (u32)&psCopy.uniformVars[i];
        nE++;
    }

    // s3: Store each sampler name (in common string table)
    for(u32 i = 0; i < psCopy.numSamplers; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, psCopy.samplerVars[i].name );
        pAddr[nE]   = pOffset[oSamplers] + i * sizeof(GFDSamplerVar) + (u32)&psCopy.samplerVars[i].name - (u32)&psCopy.samplerVars[i];
        nE++;
    }

    // 7: Store the string table (watch out for 1-3 bytes of padding)
    int oStringTable = nE;

    // Note, although arrays of chars don't seem to be modified to go into network order, we cache our
    // stringtable as a block in a word array which does get byte-flipped.  So let's pre-flip it here
    // so it comes out right.
    int nWordsStrTbl = (pStrTable->m_nDB + 0x3) / 4;
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);
    pOffset[nE] = GFDAddDataTable(pDT, pStrTable->m_pDB, nWordsStrTbl*4);  // simply write out string table data
    pAddr[nE]   = 0;        // don't patch this location
    nE++;

    // let's convert it back so if we read it later, we won't have problems
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);

    // end0: Turn all our patch addresses to offsets
    for(int i = 0; i < nE; i++)
    {
        if(pAddr[i] != 0)
        {
            *((u32*) (pDT->m_pDB + GFDCleanTag(pAddr[i]))) = pOffset[i];
        }
    }

    // 8: Store the offset patch list
    // (not *really* needed, could reconstruct if know all data types, but makes it a *lot* easier)
    // After reading data block in at addrX, increment each location i of *(addrX + pAddr[i]) += addrX;
    // By putting this after main, we can allocate space for it at same time and deallocate it when main goes away,
    // without affecting the main data.
    int oPatchTable = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pAddr, nElements * sizeof(u32));        // all nElements slots are written; unused ones are zero
    pAddr[nE]   = 0;        // don't patch this location either
    nE++;

    // 9: Finally, a small structure describing this data block.
    // It is only needed until it has been copied into the data table, so it lives on the stack.
    GFDBlockRelocationHeader pshTrailer;
    memset(&pshTrailer, 0, sizeof(GFDBlockRelocationHeader));
    pshTrailer.magic   = GFD_SWAP_8_IN_32(GFD_BLOCK_RELOCATION_HEADER_MAGIC);
    pshTrailer.type    = 0;
    pshTrailer.size    = sizeof(GFDBlockRelocationHeader);

    // Fill in our trailer and write it out
    pshTrailer.dataSize               = GFDCleanTag(pOffset[oPatchTable]) - GFDCleanTag(pOffset[oMain]);  // size of the main data section (allocate this size of contiguous memory)
    pshTrailer.dataOffset             = pOffset[oMain];             // offset of the main data section in this block
    pshTrailer.stringTableCharNumber  = pStrTable->m_nDB;           // number of characters in the string table
    pshTrailer.stringTableOffset      = pOffset[oStringTable];      // offset of string table in this block
    pshTrailer.patchTableOffsetNumber = nE;                         // number of offsets in the patch table
    pshTrailer.patchTableOffset       = pOffset[oPatchTable] ;      // offset of the patch table in this block

    pOffset[nE] = GFDAddDataTable(pDT, &pshTrailer, sizeof(GFDBlockRelocationHeader));
    pAddr[nE] = 0;          // don't patch this location
    nE++;

    assert(offStringTable == GFDCleanTag(pOffset[oStringTable]) && "Guess for offset table in 6a was wrong");
    assert(nE <= nElements && "Too few offsets allocated");

    free(pAddr);
    free(pOffset);

    GFDDestroyStringTable( pStrTable );
    return pDT;
}

/// Create the flat datablock representation of a GX2GeometryShader structure.
/// The shader is repacked into its 32-bit GFD layout and stored in a new data table, followed by
/// its uniform block, uniform, initial value, loop and sampler arrays, a string table holding
/// all of their names, the offset patch list and a GFDBlockRelocationHeader trailer.
/// Returns NULL (after printing an error) if an allocation fails.
/// Call GFDDestroyDataTable() on the returned object once done with it.
GFDDataTable* GFDCreateBlockRelocateHeaderGSH(GX2GeometryShader *pGS)
{
    // Create second data structure to hold flattened, offseted version of our original shader
    GX2GeometryShader gsCopy;
    memcpy(&gsCopy,  pGS,  sizeof(GX2GeometryShader));

    // Walk thru copy, converting all pointers to data blocks in table, and changing
    // addresses to offsets into the data block

    // Create data table to hold the structure elements
    GFDDataTable *pDT = GFDCreateDataTable(sizeof(GX2GeometryShader) + gsCopy.numUniforms * 8 + 512);      // todo - pick better number
    if ( !pDT )
    {
        printf("Error! Can't create GFD Data Table!\n");
        return NULL;
    }

    // How many pointers do we need to patch? (The 11 here is empirically determined - asserts at end if wrong)
    // 11 = 1 (shader itself) + 7 in GX2GeometryShader + 1 (string table) + 1 (patch list) + 1 (trailer)
    int nElements = 11 + gsCopy.numUniformBlocks + gsCopy.numUniforms + gsCopy.numSamplers /*+ gsCopy.numAttribs */;
    int nE = 0;
    u32 size;
    u32 *pAddr   = (u32*) malloc( nElements * sizeof(u32));    // src offsets into data block that need patching
    u32 *pOffset = (u32*) malloc( nElements * sizeof(u32));    // dst offsets into data block, containing offsets to write

    if ( !pAddr || !pOffset )
    {
        printf("Error! Memory allocation failure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    memset(pOffset, 0, nElements * sizeof(u32));
    memset(pAddr, 0,  nElements * sizeof(u32));

    // 0: Store main structure itself  (we'll rewrite offsets at the end).
    // For allocation and alignment purposes, this needs to be the first hunk in the data table
    GFDGeometryShader gsCopy32;
    size = GFDRepackGeometryShaderFor32Bit(&gsCopy, &gsCopy32);
    int oMain = nE;
    pOffset[nE] = GFDAddDataTable(pDT, &gsCopy32, size);
    pAddr[nE]   = 0;       // don't patch this location
    nE++;

    // 1: Store uniform block/buffer array
    // (a zero-length malloc may legitimately return NULL, so only a non-empty request counts as a failure)
    GFDUniformBlock *pUB = (GFDUniformBlock *) malloc(sizeof(GFDUniformBlock)*gsCopy.numUniformBlocks);
    if ( !pUB && gsCopy.numUniformBlocks != 0 )
    {
        printf("Error! Failed to allocate Uniform Block structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformBlockArrayFor32Bit(gsCopy.uniformBlocks, pUB, gsCopy.numUniformBlocks);
    int oUniformBuffers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUB, size);
    pAddr[nE]  = (gsCopy.numUniformBlocks == 0) ? 0 : pOffset[oMain] + (u32)&gsCopy32.uniformBlocks - (u32)&gsCopy32;
    nE++;
    free(pUB);

    // 2: Store uniform array
    GFDUniformVar *pUV = (GFDUniformVar *) malloc(sizeof(GFDUniformVar)*gsCopy.numUniforms);
    if ( !pUV && gsCopy.numUniforms != 0 )
    {
        printf("Error! Failed to allocate Uniform Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformVarArrayFor32Bit(gsCopy.uniformVars, pUV, gsCopy.numUniforms);
    int oUniforms = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUV, size);
    pAddr[nE]  = (gsCopy.numUniforms == 0) ? 0 : pOffset[oMain] + (u32)&gsCopy32.uniformVars - (u32)&gsCopy32;
    nE++;
    free(pUV);

    // 3: Store uniform initial values
    // Note initial values points to uniform block, a contiguous section of GX2UniformInitialValues
    pOffset[nE] = GFDAddDataTable(pDT, gsCopy.initialValues, gsCopy.numInitialValues * sizeof(GX2UniformInitialValue));
    pAddr[nE]   = (gsCopy.numInitialValues == 0) ? 0 : pOffset[oMain] + (u32)&gsCopy32.initialValues - (u32)&gsCopy32;
    nE++;

    // 4: Store loop
    pOffset[nE] = GFDAddDataTable(pDT, gsCopy._loopVars, gsCopy._numLoops * sizeof(GFDLoopVar));
    pAddr[nE]  = (gsCopy._numLoops == 0) ? 0 : pOffset[oMain] + (u32)&gsCopy32._loopVars - (u32)&gsCopy32;
    nE++;

    // 5: Store sampler descriptors
    GFDSamplerVar *pSV = (GFDSamplerVar *) malloc(sizeof(GFDSamplerVar)*gsCopy.numSamplers);
    if ( !pSV && gsCopy.numSamplers != 0 )
    {
        printf("Error! Failed to allocate Sampler Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackSamplerVarArrayFor32Bit(gsCopy.samplerVars, pSV, gsCopy.numSamplers);
    int oSamplers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pSV, size);
    pAddr[nE]   = (gsCopy.numSamplers == 0) ? 0 : pOffset[oMain] + (u32)&gsCopy32.samplerVars - (u32)&gsCopy32;
    nE++;
    free(pSV);

    // 6a: Create a string table to store all the strings in
    const int kAvgCharsPerString = 12;   // will auto-grow if actually bigger
    GFDStringTable *pStrTable = GFDCreateStringTable( ( gsCopy.numUniformBlocks +  gsCopy.numUniforms + gsCopy.numSamplers) * kAvgCharsPerString);
    if ( !pStrTable )
    {
        printf("Error! Failed to allocate String Table!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    u32 offStringTable = pDT->m_nDB;     // current offset; step 7 is expected to place the string table here (asserted at the end)

    // s1: Store each uniform block name (in common string table), as well as pointer to the initial value in uniform block
    for(u32 i = 0; i < gsCopy.numUniformBlocks; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, gsCopy.uniformBlocks[i].name );
        pAddr[nE]   = pOffset[oUniformBuffers] + i * sizeof(GFDUniformBlock) + (u32)&gsCopy.uniformBlocks[i].name - (u32)&gsCopy.uniformBlocks[i];
        nE++;
    }

    // s2: Store each uniform name (in common string table)
    for(u32 i = 0; i < gsCopy.numUniforms; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, gsCopy.uniformVars[i].name );
        pAddr[nE]   = pOffset[oUniforms] + i * sizeof(GFDUniformVar) + (u32)&gsCopy.uniformVars[i].name - (u32)&gsCopy.uniformVars[i];
        nE++;
    }

    // s3: Store each sampler name (in common string table)
    for(u32 i = 0; i < gsCopy.numSamplers; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, gsCopy.samplerVars[i].name );
        pAddr[nE]   = pOffset[oSamplers] + i * sizeof(GFDSamplerVar) + (u32)&gsCopy.samplerVars[i].name - (u32)&gsCopy.samplerVars[i];
        nE++;
    }

    // 7: Store the string table (watch out for 1-3 bytes of padding)
    int oStringTable = nE;

    // Note, although arrays of chars don't seem to be modified to go into network order, we cache our
    // stringtable as a block in a word array which does get byte-flipped.  So let's pre-flip it here
    // so it comes out right.
    int nWordsStrTbl = (pStrTable->m_nDB + 0x3) / 4;
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);
    pOffset[nE] = GFDAddDataTable(pDT, pStrTable->m_pDB, nWordsStrTbl*4);  // simply write out string table data
    pAddr[nE]   = 0;        // don't patch this location
    nE++;

    // let's convert it back so if we read it later, we won't have problems
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);

    // end0: Turn all our patch addresses to offsets
    for(int i = 0; i < nE; i++)
    {
        if(pAddr[i] != 0)
        {
            *((u32*) (pDT->m_pDB + GFDCleanTag(pAddr[i]))) = pOffset[i];
        }
    }

    // 8: Store the offset patch list
    // (not *really* needed, could reconstruct if know all data types, but makes it a *lot* easier)
    // After reading data block in at addrX, increment each location i of *(addrX + pAddr[i]) += addrX;
    // By putting this after main, we can allocate space for it at same time and deallocate it when main goes away,
    // without affecting the main data.
    int oPatchTable = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pAddr, nElements * sizeof(u32));  // all nElements slots are written; unused ones are zero
    pAddr[nE]   = 0;        // don't patch this location either
    nE++;

    // 9: Finally, a small structure describing this data block.
    // It is only needed until it has been copied into the data table, so it lives on the stack.
    GFDBlockRelocationHeader gshTrailer;
    memset(&gshTrailer, 0, sizeof(GFDBlockRelocationHeader));
    gshTrailer.magic = GFD_SWAP_8_IN_32(GFD_BLOCK_RELOCATION_HEADER_MAGIC);
    gshTrailer.type  = 0;
    gshTrailer.size  = sizeof(GFDBlockRelocationHeader);

    // Fill in our trailer and write it out
    gshTrailer.dataSize               = GFDCleanTag(pOffset[oPatchTable]) - GFDCleanTag(pOffset[oMain]);  // size of the main data section (allocate this size of contiguous memory)
    gshTrailer.dataOffset             = pOffset[oMain];             // offset of the main data section in this block
    gshTrailer.stringTableCharNumber  = pStrTable->m_nDB;           // number of characters in the string table
    gshTrailer.stringTableOffset      = pOffset[oStringTable];      // offset of string table in this block
    gshTrailer.patchTableOffsetNumber = nE;                         // number of offsets in the patch table
    gshTrailer.patchTableOffset       = pOffset[oPatchTable] ;      // offset of the patch table in this block

    pOffset[nE] = GFDAddDataTable(pDT, &gshTrailer, sizeof(GFDBlockRelocationHeader));
    pAddr[nE] = 0;          // don't patch this location
    nE++;

    assert(offStringTable == GFDCleanTag(pOffset[oStringTable]) && "Guess for offset table in 6a was wrong");
    assert(nE <= nElements && "Too few offsets allocated");

    free(pAddr);
    free(pOffset);

    GFDDestroyStringTable( pStrTable );

    return pDT;
}
#if CAFE_OS_SDK_VERSION >= 21104
/// Create the flat datablock representation of a GX2ComputeShader structure.
///
/// The shader structure and every array it points to are copied into one
/// data table; each embedded pointer is replaced by an offset into that table,
/// followed by the string table, the list of patched locations and a
/// GFDBlockRelocationHeader trailer describing the layout.
///
/// \param pCS  Shader to flatten.  It is only read; the work is done on a copy.
/// \return     The new data table, or NULL if any allocation failed.
///             Call GFDDestroyDataTable() on the returned object once done with it.
GFDDataTable* GFDCreateBlockRelocateHeaderCSH(GX2ComputeShader *pCS)
{
    // Create second data structure to hold flattened, offseted version of our original shader
    GX2ComputeShader csCopy;
    memcpy(&csCopy,  pCS,  sizeof(GX2ComputeShader));

    // Walk thru copy, converting all pointers to data blocks in table, and changing
    // addresses to offsets into the data block

    // Create data table to hold the structure elements
    GFDDataTable *pDT = GFDCreateDataTable(sizeof(GX2ComputeShader) + csCopy.numUniformBlocks * sizeof(GX2UniformBlock) + csCopy.numUniforms * sizeof(GX2UniformVar) + csCopy.numSamplers * sizeof(GX2SamplerVar) + 512);      // todo - pick a better number for string table
    if ( !pDT )
    {
        printf("Error! Can't create GFD Data Table!\n");
        return NULL;
    }

    // How many pointers do we need to patch? (The value here is empirically determined - asserts at end if wrong)
    // 10 = 1 (shader itself) + 6 in GX2ComputeShader + 1 (string table) + 1 (patch list) + 1 (trailer)
    int nElements = 10 + csCopy.numUniformBlocks + csCopy.numUniforms + csCopy.numSamplers;
    int nE = 0;
    u32 size;
    u32 *pAddr   = (u32*) malloc( nElements * sizeof(u32));    // src offsets into data block that need patching
    u32 *pOffset = (u32*) malloc( nElements * sizeof(u32));    // dst offsets into data block, containing offsets to write

    if ( !pAddr || !pOffset )
    {
        printf("Error! Memory allocation failure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    memset(pOffset, 0, nElements * sizeof(u32));
    memset(pAddr, 0,  nElements * sizeof(u32));

    // 0: Store main structure itself  (we'll rewrite offsets at the end).
    // For allocation and alignment purposes, this needs to be the first hunk in the data table
    GFDComputeShader csCopy32;
    size = GFDRepackComputeShaderFor32Bit(&csCopy, &csCopy32);
    int oMain = nE;
    pOffset[nE] = GFDAddDataTable(pDT, &csCopy32, size);
    pAddr[nE]   = 0;       // don't patch this location
    nE++;

    // 1: Store uniform block/buffer array
    GFDUniformBlock *pUB = (GFDUniformBlock *) malloc(sizeof(GFDUniformBlock)*csCopy.numUniformBlocks);
    if ( !pUB )
    {
        printf("Error! Failed to allocate Uniform Block structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformBlockArrayFor32Bit(csCopy.uniformBlocks, pUB, csCopy.numUniformBlocks);
    int oUniformBuffers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUB, size);
    pAddr[nE]  = (csCopy.numUniformBlocks == 0) ? 0 : pOffset[oMain] + (u32)&csCopy32.uniformBlocks - (u32)&csCopy32;
    nE++;
    free(pUB);

    // 2: Store uniform array
    GFDUniformVar *pUV = (GFDUniformVar *) malloc(sizeof(GFDUniformVar)*csCopy.numUniforms);
    if ( !pUV )
    {
        printf("Error! Failed to allocate Uniform Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackUniformVarArrayFor32Bit(csCopy.uniformVars, pUV, csCopy.numUniforms);
    int oUniforms = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pUV, size);
    pAddr[nE]  = (csCopy.numUniforms == 0) ? 0 : pOffset[oMain] + (u32)&csCopy32.uniformVars - (u32)&csCopy32;
    nE++;
    free(pUV);

    // 3: Store uniform initial values
    // Note initial values points to uniform block, a contiguous section of GX2UniformInitialValues
    pOffset[nE] = GFDAddDataTable(pDT, csCopy.initialValues, csCopy.numInitialValues * sizeof(GX2UniformInitialValue));
    pAddr[nE]   = (csCopy.numInitialValues == 0) ? 0 : pOffset[oMain] + (u32)&csCopy32.initialValues - (u32)&csCopy32;
    nE++;

    // 4: Store loop variables
    pOffset[nE] = GFDAddDataTable(pDT, csCopy._loopVars, csCopy._numLoops * sizeof(GFDLoopVar));
    pAddr[nE]  = (csCopy._numLoops == 0) ? 0 : pOffset[oMain] + (u32)&csCopy32._loopVars - (u32)&csCopy32;
    nE++;

    // 5: Store sampler descriptors
    GFDSamplerVar *pSV = (GFDSamplerVar *) malloc(sizeof(GFDSamplerVar)*csCopy.numSamplers);
    if ( !pSV )
    {
        printf("Error! Failed to allocate Sampler Variable structure!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    size = GFDRepackSamplerVarArrayFor32Bit(csCopy.samplerVars, pSV, csCopy.numSamplers);
    int oSamplers = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pSV, size);
    pAddr[nE]   = (csCopy.numSamplers == 0) ? 0 : pOffset[oMain] + (u32)&csCopy32.samplerVars - (u32)&csCopy32;
    nE++;
    free(pSV);


    // 6a: Create a string table to store all the strings in
    const int kAvgCharsPerString = 12;   // will auto-grow if actually bigger
    GFDStringTable *pStrTable = GFDCreateStringTable( ( csCopy.numUniformBlocks + csCopy.numUniforms + csCopy.numSamplers) * kAvgCharsPerString);
    if ( !pStrTable )
    {
        // The table is dereferenced below, so bail out instead of crashing.
        printf("Error! Failed to allocate string table!\n");
        GFDDestroyDataTable(pDT);
        free(pAddr);
        free(pOffset);
        return NULL;
    }
    u32 offStringTable = pDT->m_nDB;     // current offset...

    // s1: Store each uniform block name (in common string table), as well as pointer to the initial value in uniform block
    for(u32 i = 0; i < csCopy.numUniformBlocks; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, csCopy.uniformBlocks[i].name );
        pAddr[nE]   = pOffset[oUniformBuffers] + i * sizeof(GFDUniformBlock) + (u32)&csCopy.uniformBlocks[i].name - (u32)&csCopy.uniformBlocks[i];
        nE++;
    }

    // s2: Store each uniform name (in common string table)
    for(u32 i = 0; i < csCopy.numUniforms; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, csCopy.uniformVars[i].name );
        pAddr[nE]   = pOffset[oUniforms] + i * sizeof(GFDUniformVar) + (u32)&csCopy.uniformVars[i].name - (u32)&csCopy.uniformVars[i];
        nE++;
    }

    // s3: Store each sampler name (in common string table)
    for(u32 i = 0; i < csCopy.numSamplers; i++)
    {
        pOffset[nE] = offStringTable + GFDAddStringTable(pStrTable, csCopy.samplerVars[i].name );
        pAddr[nE]   = pOffset[oSamplers] + i * sizeof(GFDSamplerVar) + (u32)&csCopy.samplerVars[i].name - (u32)&csCopy.samplerVars[i];
        nE++;
    }

    // 7: Store the string table (watch out for 1-3 bytes of padding)
    int oStringTable = nE;

    // Note, although arrays of chars don't seem to be modified to go into network order, we cache our
    // stringtable as a block in a word array which does get byte-flipped.  So let's pre-flip it here
    // so it comes out right.
    int nWordsStrTbl = (pStrTable->m_nDB + 0x3) / 4;
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);
    pOffset[nE] = GFDAddDataTable(pDT, pStrTable->m_pDB, nWordsStrTbl*4);  // simply write out string table data
    pAddr[nE]   = 0;        // don't patch this location
    nE++;

    // let's convert it back so if we read it later, we won't have problems
    GFDEndianSwap8in32((u32*) pStrTable->m_pDB, nWordsStrTbl);

    // end0: Turn all our patch addresses to offsets
    for(int i = 0; i < nE; i++)
    {
        if(pAddr[i] != 0)
        {
            *((u32*) (pDT->m_pDB + GFDCleanTag(pAddr[i]))) = pOffset[i];
        }
    }

    // 8: Store the offset patch list
    // (not *really* needed, could reconstruct if know all data types, but makes it a *lot* easier)
    // After reading data block in at addrX, increment each location i of *(addrX + pAddr[i]) += addrX;
    // By putting this after main, we can allocate space for it at same time and deallocate it when main goes away,
    // without affecting the main data.
    int oPatchTable = nE;
    pOffset[nE] = GFDAddDataTable(pDT, pAddr, nElements * sizeof(u32));
    pAddr[nE]   = 0;        // don't patch this location either
    nE++;

    // 9: Finally, a small structure describing this data block.
    // It lives on the stack: it is copied into the data table right away, and a
    // heap allocation here would add a failure path that has to free pStrTable.
    GFDBlockRelocationHeader csTrailer;
    memset(&csTrailer, 0, sizeof(GFDBlockRelocationHeader));
    csTrailer.magic = GFD_SWAP_8_IN_32(GFD_BLOCK_RELOCATION_HEADER_MAGIC);
    csTrailer.type  = 0;
    csTrailer.size  = sizeof(GFDBlockRelocationHeader);

    // Fill in our trailer and write it out
    csTrailer.dataSize               = GFDCleanTag(pOffset[oPatchTable]) - GFDCleanTag(pOffset[oMain]);  // size of the main data section (allocate this size of contiguous memory)
    csTrailer.dataOffset             = pOffset[oMain];             // offset of the main data section in this block
    csTrailer.stringTableCharNumber  = pStrTable->m_nDB;           // number of characters in the string table
    csTrailer.stringTableOffset      = pOffset[oStringTable];      // offset of string table in this block
    csTrailer.patchTableOffsetNumber = nE;                         // number of offsets in the patch table
    csTrailer.patchTableOffset       = pOffset[oPatchTable] ;      // offset of the patch table in this block

    pOffset[nE] = GFDAddDataTable(pDT, &csTrailer, sizeof(GFDBlockRelocationHeader));
    pAddr[nE] = 0;          // don't patch this location
    nE++;

    assert(offStringTable == GFDCleanTag(pOffset[oStringTable]) && "Guess for offset table in 7a was wrong");
    assert(nE <= nElements && "Too few offsets allocated");

    free(pAddr);
    free(pOffset);

    GFDDestroyStringTable( pStrTable );
    return pDT;
}
#endif

/// Write one vertex shader into the in-memory GFD image at fp.
///
/// Emits a GFD_BLOCK_TYPE_GX2_VSH_HEADER block holding the flattened shader
/// structure, then (when alignMode is non-zero) a pad block, then a
/// GFD_BLOCK_TYPE_GX2_VSH_PROGRAM block holding the shader program.
///
/// \param fp         Current write position inside the output buffer.
/// \param topShader  Start of the output buffer; padding is computed relative to it.
/// \param swapMode   Endian swap mode passed on to GFDWriteMemGPUData().
/// \param alignMode  Non-zero to insert a pad block before the program block.
/// \param pVS        Shader to write.  Its shaderPtr is cleared while the header
///                   is flattened and restored before returning.
/// \return           The write position returned by the last write call, or NULL
///                   if pVS is NULL or the shader could not be flattened.
char *GFDWriteMemVertexShaderBlock(char *fp, char *topShader, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, GX2VertexShader *pVS)
{
    if(pVS == NULL)
        return NULL;

    // Get info about the actual shader program
    u32   nBytesProg = pVS->shaderSize;
    void* pDataProg  = pVS->shaderPtr;  // Current location, we will relocate it

    // The program is written as its own block, so the flattened header must not carry its address
    pVS->shaderPtr = NULL;

    // Now convert structure into a flat, relocatable format
    GFDDataTable *pDT_VS = GFDCreateBlockRelocateHeaderVSH(pVS);

    if(NULL == pDT_VS)
    {
        // Do not leave the caller's shader without its program pointer
        pVS->shaderPtr = pDataProg;
        return NULL;
    }

    int nBytesVSStruct = pDT_VS->m_nDB;  // How big is our vertex struct (changes size due to uniforms, samplers, and other varying things)

    // Write header for VS struct
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_VSH_HEADER, nBytesVSStruct);

    // Write VS struct
    fp = GFDWriteMemPPCData(fp, (nBytesVSStruct + 3) / 4, GFD_ELEMENT_SIZE_32, (u32 *) pDT_VS->m_pDB);

    // Add pad block
    if(alignMode)
    {
        // Pad so that the program data, which follows the pad block header and the
        // program block header, starts on a GX2_SHADER_ALIGNMENT boundary.
        // The round-up means the result is always in [0, GX2_SHADER_ALIGNMENT).
        u32 padSize = ((u32)((fp-topShader) + 2 * GFD_BLOCK_HEADER_SIZE +(GX2_SHADER_ALIGNMENT-1)) & ~(GX2_SHADER_ALIGNMENT-1)) - (u32)(fp-topShader) - 2 * GFD_BLOCK_HEADER_SIZE;

        fp = GFDWriteMemPadBlock(fp, padSize);
    }

    //  Write out Header for program block
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_VSH_PROGRAM, nBytesProg);

    // Write program data block
    fp = GFDWriteMemGPUData(fp, (nBytesProg + 3) / 4, GFD_ELEMENT_SIZE_32, swapMode, (u32 *) pDataProg);

    GFDDestroyDataTable(pDT_VS);

    // Restore
    pVS->shaderPtr =  pDataProg;

    return fp;
}

/// Write one pixel shader into the in-memory GFD image at fp.
///
/// Emits a GFD_BLOCK_TYPE_GX2_PSH_HEADER block holding the flattened shader
/// structure, then (when alignMode is non-zero) a pad block, then a
/// GFD_BLOCK_TYPE_GX2_PSH_PROGRAM block holding the shader program.
///
/// \param fp         Current write position inside the output buffer.
/// \param topShader  Start of the output buffer; padding is computed relative to it.
/// \param swapMode   Endian swap mode passed on to GFDWriteMemGPUData().
/// \param alignMode  Non-zero to insert a pad block before the program block.
/// \param pPS        Shader to write.  Its shaderPtr is cleared while the header
///                   is flattened and restored before returning.
/// \return           The write position returned by the last write call, or NULL
///                   if pPS is NULL or the shader could not be flattened.
char *GFDWriteMemPixelShaderBlock(char *fp, char *topShader, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, GX2PixelShader *pPS)
{
    if(pPS == NULL)
        return NULL;

    // Get info about the actual shader program
    u32   nBytesProg = pPS->shaderSize;
    void* pDataProg  = pPS->shaderPtr;  // Current location, we will relocate it

    // The program is written as its own block, so the flattened header must not carry its address
    pPS->shaderPtr = NULL;

    // Convert structure into a flat, relocatable format
    GFDDataTable *pDT_PS = GFDCreateBlockRelocateHeaderPSH(pPS);

    if(NULL == pDT_PS)
    {
        // Do not leave the caller's shader without its program pointer
        pPS->shaderPtr = pDataProg;
        return NULL;
    }

    int nBytesPSStruct = pDT_PS->m_nDB;  // How big is our pixel shader struct (changes size due to uniforms, samplers, and other varying things)

    // Write header for PS struct
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_PSH_HEADER, nBytesPSStruct);

    // Write PS struct
    fp = GFDWriteMemPPCData(fp, (nBytesPSStruct + 3) / 4, GFD_ELEMENT_SIZE_32, (u32 *) pDT_PS->m_pDB);

    // Add pad block
    if(alignMode)
    {
        // Pad so that the program data, which follows the pad block header and the
        // program block header, starts on a GX2_SHADER_ALIGNMENT boundary.
        // The round-up means the result is always in [0, GX2_SHADER_ALIGNMENT).
        u32 padSize = ((u32)((fp-topShader) + 2 * GFD_BLOCK_HEADER_SIZE +(GX2_SHADER_ALIGNMENT-1)) & ~(GX2_SHADER_ALIGNMENT-1)) - (u32)(fp-topShader) - 2 * GFD_BLOCK_HEADER_SIZE;

        fp = GFDWriteMemPadBlock(fp, padSize);
    }

    // Write Header for program block
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_PSH_PROGRAM, nBytesProg);

    // Write program data block
    fp = GFDWriteMemGPUData(fp, (nBytesProg + 3) / 4, GFD_ELEMENT_SIZE_32, swapMode, (u32 *) pDataProg);

    GFDDestroyDataTable(pDT_PS);

    // Restore
    pPS->shaderPtr = pDataProg;

    return fp;
}

char *GFDWriteMemGeometryShaderBlock(char *fp, char *topShader, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, GX2GeometryShader *pGS)
{
    if(pGS == NULL)
        return false;

    // Get info about the actual shader program
    u32   nBytesProg = pGS->shaderSize;
    void* pDataProg  = pGS->shaderPtr;  // Current location, we will relocate it

    u32   nBytesCopyProg = pGS->copyShaderSize;
    void* pDataCopyProg  = pGS->copyShaderPtr;  // Current location, we will relocate it

    // Set NULL
    pGS->shaderPtr     = NULL;
    pGS->copyShaderPtr = NULL;

    // Convert structure into a flat, relocatable format
    GFDDataTable *pDT_GS = GFDCreateBlockRelocateHeaderGSH(pGS);

    if(NULL == pDT_GS)
        return false;

    int nBytesGSStruct = pDT_GS->m_nDB;  // How big is our shader struct (changes size due to uniforms, samplers, and other varying things)

    // Write header for GS struct
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_GSH_HEADER, nBytesGSStruct);

    // Write GS struct
    fp = GFDWriteMemPPCData(fp, (nBytesGSStruct + 3) / 4, GFD_ELEMENT_SIZE_32, (u32 *) pDT_GS->m_pDB);

    // Add pad block
    if(alignMode)
    {
        // Calc padding size for shader align
        u32 padSize = ((u32)((fp-topShader) + 2 * GFD_BLOCK_HEADER_SIZE +(GX2_SHADER_ALIGNMENT-1)) & ~(GX2_SHADER_ALIGNMENT-1)) - (u32)(fp-topShader) - 2 * GFD_BLOCK_HEADER_SIZE;
        while(padSize < 0)
            padSize += GX2_SHADER_ALIGNMENT;

        fp = GFDWriteMemPadBlock(fp, padSize);
    }

    // Write Header for program block
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_GSH_PROGRAM, nBytesProg);

    // Write program data block
    fp = GFDWriteMemGPUData(fp, (nBytesProg + 3) / 4, GFD_ELEMENT_SIZE_32, swapMode, (u32 *) pDataProg);

    // Add pad block
    if(alignMode)
    {
        // Calc padding size for shader align
        u32 padSize = ((u32)((fp-topShader) + 2 * GFD_BLOCK_HEADER_SIZE +(GX2_SHADER_ALIGNMENT-1)) & ~(GX2_SHADER_ALIGNMENT-1)) - (u32)(fp-topShader) - 2 * GFD_BLOCK_HEADER_SIZE;
        while(padSize < 0)
            padSize += GX2_SHADER_ALIGNMENT;

        if(!GFDWriteMemPadBlock(fp, padSize))
            return false;
    }

    // Write Header for copy program block
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_GSH_COPY_PROGRAM, nBytesCopyProg);

    // Write copy program data block
    fp =  GFDWriteMemGPUData(fp, (nBytesCopyProg + 3) / 4, GFD_ELEMENT_SIZE_32, swapMode, (u32 *) pDataCopyProg);

    GFDDestroyDataTable(pDT_GS);

    // Restore
    pGS->shaderPtr = pDataProg;
    pGS->copyShaderPtr = pDataCopyProg;

    return fp;
}

#if CAFE_OS_SDK_VERSION >= 21104
/// Write one compute shader into the in-memory GFD image at fp.
///
/// Emits a GFD_BLOCK_TYPE_GX2_CSH_HEADER block holding the flattened shader
/// structure, then (when alignMode is non-zero) a pad block, then a
/// GFD_BLOCK_TYPE_GX2_CSH_PROGRAM block holding the shader program.
///
/// \param fp         Current write position inside the output buffer.
/// \param topShader  Start of the output buffer; padding is computed relative to it.
/// \param swapMode   Endian swap mode passed on to GFDWriteMemGPUData().
/// \param alignMode  Non-zero to insert a pad block before the program block.
/// \param pCS        Shader to write.  Its shaderPtr is cleared while the header
///                   is flattened and restored before returning.
/// \return           The write position returned by the last write call, or NULL
///                   if pCS is NULL, the shader could not be flattened, or any
///                   write call returned NULL.
char* GFDWriteMemComputeShaderBlock(char *fp, char *topShader, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, GX2ComputeShader *pCS)
{
    if(pCS == NULL)
        return NULL;

    // Get info about the actual shader program
    u32   nBytesProg = pCS->shaderSize;
    void* pDataProg  = pCS->shaderPtr;  // Current location, we will relocate it

    // The program is written as its own block, so the flattened header must not carry its address
    pCS->shaderPtr = NULL;

    // Now convert structure into a flat, relocatable format
    GFDDataTable *pDT_CS = GFDCreateBlockRelocateHeaderCSH(pCS);

    if(NULL == pDT_CS)
    {
        // Do not leave the caller's shader without its program pointer
        pCS->shaderPtr = pDataProg;
        return NULL;
    }

    int nBytesCSStruct = pDT_CS->m_nDB;  // How big is our compute struct (changes size due to uniforms, samplers, and other varying things)

    // Every write returns the new write position.  It has to be stored back into
    // fp, otherwise each block would be written over the previous one.

    // Write header for CS struct
    fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_CSH_HEADER, nBytesCSStruct);

    // Write CS struct
    if (fp)
        fp = GFDWriteMemPPCData(fp, (nBytesCSStruct + 3) / 4, GFD_ELEMENT_SIZE_32, (u32 *)pDT_CS->m_pDB);

    // Add pad block
    if(fp && alignMode)
    {
        // Pad so that the program data, which follows the pad block header and the
        // program block header, starts on a GX2_SHADER_ALIGNMENT boundary.
        // The round-up means the result is always in [0, GX2_SHADER_ALIGNMENT).
        u32 padSize = ((u32)((fp-topShader)  + 2 * GFD_BLOCK_HEADER_SIZE +(GX2_SHADER_ALIGNMENT-1)) & ~(GX2_SHADER_ALIGNMENT-1)) - (u32)(fp-topShader) - 2 * GFD_BLOCK_HEADER_SIZE;

        fp = GFDWriteMemPadBlock(fp, padSize);
    }

    //  Write out Header for program block
    if (fp)
        fp = GFDWriteMemBlockHeader(fp, GFD_BLOCK_TYPE_GX2_CSH_PROGRAM, nBytesProg);

    // Write program data block
    if (fp)
        fp = GFDWriteMemGPUData(fp, (nBytesProg + 3) / 4, GFD_ELEMENT_SIZE_32, swapMode, (u32 *) pDataProg);

    // Common exit: runs on success and on every write failure (fp == NULL)
    GFDDestroyDataTable(pDT_CS);

    // Restore
    pCS->shaderPtr =  pDataProg;

    return fp;
}
#endif

#if CAFE_OS_SDK_VERSION >= 21104
bool GFD_API GFDWriteMemShader(char** pShader, GFDGPUVersion gpuVer, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, u32 numShader, const GFDShaders *pShadersOrig, u32 *shaderSize )
{
    GFDShaders2 shaders = {0};

    if (!pShadersOrig)
    {
        return false;
    }

    shaders.abiVersion = GFD_DLL_ABI_VERSION;
    shaders.pVertexShader = pShadersOrig->pVertexShader;
    shaders.pGeometryShader = pShadersOrig->pGeometryShader;
    shaders.pPixelShader = pShadersOrig->pPixelShader;

    return GFDWriteMemShader2( pShader, gpuVer, swapMode, alignMode, numShader, &shaders, shaderSize);
}

bool GFD_API GFDWriteMemShader2(char** pShader, GFDGPUVersion gpuVer, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, u32 numShader, const GFDShaders2 *pShaders, u32 *shaderSize )
{
 //   FILE               *fpout = NULL;
 //   u32                 count = 0;

    if (!pShaders)
    {
        return false;
    }

    if ( GFD_DLL_ABI_TYPE(pShaders->abiVersion) != GFD_DLL_ABI_TYPE(GFD_DLL_ABI_VERSION) ||
         GFD_DLL_ABI_VERSION_NUM(pShaders->abiVersion) > 0)
    {
        return false;
    }
#else
bool GFD_API GFDWriteMemShader(char** pShader, GFDGPUVersion gpuVer, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, u32 numShader, GFDShaders *pShaders, u32 *shaderSize )
{
#endif
    // Check gpu version
    switch(gpuVer) {
    case GFD_GPU_VERSION_0:
        break;
    case GFD_GPU_VERSION_1:
        break;
    case GFD_GPU_VERSION_GPU7:
        break;
    default:
        printf("Warning: Unsupported GPU %d, using default\n", gpuVer);
        gpuVer = GFD_GPU_VERSION_GPU7;
        break;
    }
    // estimate total buffer size which should be allocated for shader binary.
    u32 totalSize = 0;
    for ( u32 i = 0; i < numShader; i ++)
    {
        if ( pShaders[i].pVertexShader != NULL )
        {
            // ユニフォームのシンボルテーブルを削除
            pShaders[i].pVertexShader->numUniforms = 0;
            pShaders[i].pVertexShader->uniformVars = NULL;

            GFDDataTable *pDT_VS = GFDCreateBlockRelocateHeaderVSH( pShaders[i].pVertexShader );
            if ( pDT_VS )
            {
                totalSize += pDT_VS->m_maxDB;
                GFDDestroyDataTable( pDT_VS );
            }
        }

        if ( pShaders[i].pPixelShader != NULL )
        {
            // ユニフォームのシンボルテーブルを削除
            pShaders[i].pPixelShader->numUniforms = 0;
            pShaders[i].pPixelShader->uniformVars = NULL;

            GFDDataTable *pDT_VS = GFDCreateBlockRelocateHeaderPSH( pShaders[i].pPixelShader );
            if ( pDT_VS )
            {
                totalSize += pDT_VS->m_maxDB;
                GFDDestroyDataTable( pDT_VS );
            }
        }

        if ( pShaders[i].pGeometryShader != NULL )
        {
            GFDDataTable *pDT_VS = GFDCreateBlockRelocateHeaderGSH( pShaders[i].pGeometryShader );
            if ( pDT_VS )
            {
                totalSize += pDT_VS->m_maxDB;
                GFDDestroyDataTable( pDT_VS );
            }
        }
#if CAFE_OS_SDK_VERSION >= 21104
        if ( pShaders[i].pComputeShader != NULL )
        {
            GFDDataTable *pDT_VS = GFDCreateBlockRelocateHeaderCSH( pShaders[i].pComputeShader );
            if ( pDT_VS )
            {
                totalSize += pDT_VS->m_maxDB;
                GFDDestroyDataTable( pDT_VS );
            }
        }
#endif
    }
    totalSize *= 5;
    *pShader = (char *)malloc( totalSize );

    if ( *pShader == NULL ) return false;
    char *pSp  = *pShader;
    memset( pSp, 0, totalSize );
    u32  count = 0;

    // Write the file header
    pSp = GFDWriteMemHeader(pSp, gpuVer);

    // Writes multiple shader blocks
    for (count = 0; count < numShader; count++)
    {
        if(NULL != &pShaders[count])
        {
            if(NULL != pShaders[count].pVertexShader)
            {
                pSp = GFDWriteMemVertexShaderBlock(pSp, *pShader, swapMode, alignMode, pShaders[count].pVertexShader);
            }

            if(NULL != pShaders[count].pPixelShader)
            {
                pSp = GFDWriteMemPixelShaderBlock(pSp, *pShader, swapMode, alignMode, pShaders[count].pPixelShader);
            }

            if(NULL != pShaders[count].pGeometryShader)
            {

                pSp = GFDWriteMemGeometryShaderBlock(pSp, *pShader, swapMode, alignMode, pShaders[count].pGeometryShader);
            }

#if CAFE_OS_SDK_VERSION >= 21104
            if(NULL != pShaders[count].pComputeShader)
            {

                pSp = GFDWriteMemComputeShaderBlock(pSp, *pShader, swapMode, alignMode, pShaders[count].pComputeShader);
            }
#endif
        }
    }

    // Write an 'End' block to the file
    pSp = GFDWriteMemBlockHeader(pSp, GFD_BLOCK_TYPE_END, 0);

    *shaderSize = (u32)(pSp - *pShader);

    return true;
}

#if 0
// NOTE: this function lies inside the enclosing `#if 0` region and is therefore
// not compiled.  It looks like an unfinished in-memory port of a file-based
// "append shaders" routine: it reads and validates the GFD header, and the
// part that would append the shader blocks is disabled a second time by the
// nested `#if 0` below.  As written it uses `fpout`, which this function does
// not declare, and it never writes through pSp or shaderSize.
bool GFD_API GFDAppendWriteMemShader(cchar* pShader, GFDGPUVersion gpuVer, GFDEndianSwapMode swapMode, GFDAlignMode alignMode, u32 numShader, GFDShaders *pShaders, u32 *shaderSize)
{
    char               *pSp = pShader;
    u32                 count = 0;
    GFDHeader           fileHeader;

    if (pShader == NULL) return false;

    // Read File Header
    if(!GFDReadMemPPCData(&fileHeader, (GFD_HEADER_SIZE + 3) / 4, GFD_ELEMENT_SIZE_32, fpout))
    {
        printf("Error: Can't read file header.\n");
        return false;
    }

    // check gpu version
    if(fileHeader.gpuVersion != gpuVer)
    {
        printf("Error: GPU version is different.\n");
        return false;
    }

    // check header version
    if(!GFDCheckHeaderMagicVersions(&fileHeader))
    {
        printf("Error: Format version is different.\n");
        return false;
    }
#if 0
    // Everything down to the matching #endif still uses the FILE-based API
    // (fseek, GFDWriteFile*, GFDCloseFile) and has not been ported.

    // seeks to beginning of 'End' block
    fseek(fpout, -(s32)GFD_BLOCK_HEADER_SIZE, SEEK_END);

    // append writes multiple shader blocks
    for (count = 0; count < numShader; count++)
    {
        if(NULL != &pShaders[count])
        {
            if(NULL != pShaders[count].pVertexShader)
            {
                if(!GFDWriteFileVertexShaderBlock(fpout, swapMode, alignMode, pShaders[count].pVertexShader))
                {
                    printf("Error: Can't write vsh block.\n");
                    GFDCloseFile(fpout);
                    return false;
                }
            }

            if(NULL != pShaders[count].pPixelShader)
            {
                if(!GFDWriteFilePixelShaderBlock(fpout, swapMode, alignMode, pShaders[count].pPixelShader))
                {
                    printf("Error: Can't write psh block.\n");
                    GFDCloseFile(fpout);
                    return false;
                }
            }

            if(NULL != pShaders[count].pGeometryShader)
            {
                // NOTE(review): the message below says "psh" although this is the geometry shader branch
                if(!GFDWriteFileGeometryShaderBlock(fpout, swapMode, alignMode, pShaders[count].pGeometryShader))
                {
                    printf("Error: Can't write psh block.\n");
                    GFDCloseFile(fpout);
                    return false;
                }
            }
        }
    }

    // Write an 'End' block to the file
    if(!GFDWriteFileBlockHeader(fpout, GFD_BLOCK_TYPE_END, 0))
    {
        printf("Error: Can't write end block header\n");
        GFDCloseFile(fpout);
        return false;
    }

    GFDCloseFile(fpout);
#endif
    return true;
}

// Write As Code-----

void _GFDWriteGX2AttributesAsCode(FILE *fp, const char* sName, u32 nAttribs,  GX2AttribVar *pAV)
{
    if (nAttribs == 0) {
        return;
    } else {
        fprintf(fp, "GX2AttribVar %s_attrib_vars[] = {\n", sName);

        for(u32 i = 0; i < nAttribs; i++)
        {
            GX2AttribVar *pAVI = &(pAV[i]);
            fprintf(fp,"    {\"%s\", %s, %u, %u}%c\n", pAVI->name,
                    varTypeName[pAVI->type], pAVI->arrayCount, pAVI->location,
                    (i==nAttribs-1)?' ':',');
        }
        fprintf(fp,"};\n");
    }
    fprintf(fp,"\n");
}

void _GFDWriteGX2SamplersAsCode(FILE *fp, const char* sName, u32 nSamplers, GX2SamplerVar *pSV)
{
    if (nSamplers == 0) {
        return;
    } else {
        fprintf(fp, "GX2SamplerVar %s_sampler_vars[] = {\n", sName);

        for(u32 i = 0; i < nSamplers; i++)
        {
            GX2SamplerVar *pSVI = &(pSV[i]);
            fprintf(fp,"    {\"%s\", %s, %u}%c\n", pSVI->name,
                    samplerTypeName[pSVI->type], pSVI->location,
                    (i==nSamplers-1)?' ':',');
        }
        fprintf(fp,"};\n");
    }
    fprintf(fp,"\n");
}

void _GFDWriteGX2UniformsAsCode(FILE *fp, const char* sName, u32 nUniforms, GX2UniformVar *pUV, GX2UniformInitialValue *ivBase)
{
    if (nUniforms == 0) {
        return;
    } else {
        fprintf(fp, "GX2UniformVar %s_uniforms[] = {\n", sName);

        for(u32 i = 0; i < nUniforms; i++)
        {
            GX2UniformVar *pUVI = &(pUV[i]);
            fprintf(fp,"    {\"%s\", %s, %u, %u, ", pUVI->name,
                    varTypeName[pUVI->type], pUVI->arrayCount, pUVI->offset);
            if (pUVI->blockIndex == GX2_UNIFORM_BLOCK_INDEX_INVALID) {
                fprintf(fp,"\n     GX2_UNIFORM_BLOCK_INDEX_INVALID, ");
            } else {
                fprintf(fp,"%u, ", pUVI->blockIndex);
            }
            fprintf(fp,"}%c\n", (i==nUniforms-1)?' ':',');
        }
        fprintf(fp,"};\n");
    }
    fprintf(fp,"\n");
}

void _GFDGX2UniformBlockAsCode(FILE *fp, const char* sName, u32 nUniBlocks, GX2UniformBlock *pUB)
{
    if (nUniBlocks == 0) {
        return;
    } else {
        fprintf(fp, "GX2UniformBlock %s_uniform_blocks[] = {\n", sName);

        for(u32 i = 0; i < nUniBlocks; i++)
        {
            GX2UniformBlock *pUBI = &(pUB[i]);
            fprintf(fp,"    {\"%s\", %u, %u}%c\n", pUBI->name,
                    pUBI->location, pUBI->size,
                    (i==nUniBlocks-1)?' ':',');
        }
        fprintf(fp,"};\n");
    }
    fprintf(fp,"\n");
}

void _GFDWriteWordsAsHex(FILE *fp, u32 *ptr, u32 byteLen)
{
    assert(fp != NULL);
    assert((byteLen & 0x03) == 0);

    fprintf(fp, "    ");
    for(u32 j = 0; j < byteLen/4; j++)
    {
        fprintf(fp,"0x%08x%c", ptr[j], (j==byteLen/4-1)?' ':',');
        if((j & 3) == 3)
        {
            if((j & 0x3f) == 0x3)
                fprintf(fp, "    // 0x%04x\n    ", j-3);
            else
                fprintf(fp, "\n    ");
        }
    }
}

void _GFDWriteWordsAsCode(FILE *fp, const char *name, u32 *ptr, u32 byteLen, const char *attrib)
{
    assert(fp != NULL);
    assert((byteLen & 0x03) == 0);

    if (ptr == NULL || byteLen == 0) {
        return;
    }

    fprintf(fp, "%s static const u32 %s[%u] =\n{\n", attrib, name, byteLen/4);

    _GFDWriteWordsAsHex(fp, ptr, byteLen);

    fprintf(fp, "\n};\n\n");
}

/// Build the concatenation of src1 and src2 in dst, a buffer of max bytes, and
/// return dst.  strncat_s is used for both appends (instead of strncat) to
/// avoid the compiler's warning that strncat is unsafe.
static char *makeName(char *dst, char *src1, char *src2, u32 max)
{
    // Start from an empty string so the first append acts as a bounded copy
    dst[0]=0;
    strncat_s(dst, max, src1, max-1);

    // Append the second part at the terminator, limited to the space that is left
    size_t used = strlen(dst);
    char  *tail = dst+used;
    strncat_s(tail, max-used, src2, max-used-1);

    return dst;
}

// Writes the shaders held by pShaders to pFilename as C source text. For each
// shader that is present (vertex, then pixel, then geometry) the supporting
// data arrays are emitted first, followed by a static GX2VertexShader /
// GX2PixelShader / GX2GeometryShader initializer named <basename>_VS,
// <basename>_PS or <basename>_GS that refers to those arrays.
//
// pFilename - path of the file to create; it is opened with mode "w". The text
//             after the last '/' or '\\' and before the last '.' of that
//             remainder becomes <basename>, the prefix of every emitted
//             identifier.
// swapMode  - when this is not GFD_ENDIAN_SWAP_MODE_8_IN_32, the shader program
//             words (and the geometry copy shader words) are passed through
//             GFDEndianSwap8in32 before they are written.
// pShaders  - shaders to write. Members that are NULL are skipped. The pointer
//             itself is dereferenced without a check.
//
// Returns false when the output file cannot be opened (an error is printed to
// stdout), true otherwise.
//
// NOTE(review): GFDEndianSwap8in32 is handed the caller's shaderPtr buffers
// directly, so it presumably swaps them in place and the caller's data stays
// swapped after this call - confirm against its definition.
GFD_DECLSPEC bool GFD_API GFDWriteFileShaderAsCode(char* pFilename, GFDEndianSwapMode swapMode, const GFDShaders *pShaders)
{
    FILE *fpout = NULL;

    // Get base name from filename (for structure names)
    char *slash1, *slash2, *dot;
#define BASEMAX 256
    char basename[BASEMAX], tempname[BASEMAX];

    // Find the last path separator of either kind and keep the later one.
    // NOTE(review): when only one kind of separator occurs, this relationally
    // compares a NULL pointer with a valid one - confirm that is acceptable
    // for the compilers this tool is built with.
    slash1 = strrchr(pFilename, '/');
    slash2 = strrchr(pFilename, '\\');
    if (slash2 > slash1) slash1 = slash2;
    basename[0]=0;
    if (slash1) {
        // strncat(basename, slash1+1, BASEMAX-1);
        strncat_s(basename, BASEMAX, slash1+1, BASEMAX-1);
    } else {
        // strncat(basename, pFilename, BASEMAX-1);
        strncat_s(basename, BASEMAX, pFilename, BASEMAX-1);
    }
    // Cut the name at its last '.', if it has one.
    dot = strrchr(basename, '.');
    if (dot) *dot=0;

    // open file
    if(GFDOpenFile(&fpout, pFilename, "w") != 0)
    {
        printf("Error: Can't open %s\n", pFilename);
        return false;
    }

    // Banner at the top of the generated file.
    fprintf(fpout,"//--------------------------------------------------\n\n");
    fprintf(fpout,"// This file is automatically generated by gfd.\n\n");
    fprintf(fpout,"//--------------------------------------------------\n\n");

    // For Vertex Shader
    if(NULL != pShaders->pVertexShader)
    {
        fprintf(fpout, "// ---------- %s Vertex Shader ----------\n\n", basename);

        // First, write out initial values
        // (because uniforms refer to them)

        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_VS_initial_values", BASEMAX),
                             (u32 *) pShaders->pVertexShader->initialValues,
                             pShaders->pVertexShader->numInitialValues * sizeof(GX2UniformInitialValue), "");

        // Swap endian for GPU7
        if(swapMode != GFD_ENDIAN_SWAP_MODE_8_IN_32)
        {
            // Word count, rounding the byte size up to a whole word.
            int nElem = (pShaders->pVertexShader->shaderSize + 0x3) / 4;
            GFDEndianSwap8in32((u32*)pShaders->pVertexShader->shaderPtr, nElem);
        }

        // Then, write out the shader pieces,
        // and finally, write the shader itself.
        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_VS_shaderPtr", BASEMAX),
                             (u32 *) pShaders->pVertexShader->shaderPtr,
                            pShaders->pVertexShader->shaderSize, "ALIGNVAR(256)");

        _GFDGX2UniformBlockAsCode(fpout, makeName(tempname, basename, "_VS", BASEMAX),
                                  pShaders->pVertexShader->numUniformBlocks,
                                  pShaders->pVertexShader->uniformBlocks);

        _GFDWriteGX2UniformsAsCode(fpout, makeName(tempname, basename, "_VS", BASEMAX),
                                   pShaders->pVertexShader->numUniforms,
                                   pShaders->pVertexShader->uniformVars, pShaders->pVertexShader->initialValues);

        // NOTE(review): this path uses _GX2LoopVar (and _GX2VertexShaderRegs
        // below) while the pixel and geometry paths use GFDLoopVar and the
        // GFD*ShaderRegs types - confirm the difference is intended.
        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_VS_loop_vars", BASEMAX),
                             (u32 *) pShaders->pVertexShader->_loopVars,
                             pShaders->pVertexShader->_numLoops * sizeof(_GX2LoopVar), "");

        _GFDWriteGX2SamplersAsCode(fpout, makeName(tempname, basename, "_VS", BASEMAX),
                                   pShaders->pVertexShader->numSamplers,
                                   pShaders->pVertexShader->samplerVars);

        _GFDWriteGX2AttributesAsCode(fpout, makeName(tempname, basename, "_VS", BASEMAX),
                                     pShaders->pVertexShader->numAttribs,
                                     pShaders->pVertexShader->attribVars);

        // Structure initializer. Each count is followed by the name of the
        // matching array emitted above, or NULL when the count is zero.
        fprintf(fpout, "\n");
        fprintf(fpout, "\n");
        fprintf(fpout, "static GX2VertexShader %s_VS = {\n", basename);
        fprintf(fpout, "  { // _regs\n");
        _GFDWriteWordsAsHex(fpout, (u32 *) &pShaders->pVertexShader->_regs, sizeof(_GX2VertexShaderRegs));
        fprintf(fpout, "\n  },\n");

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->shaderSize);
        fprintf(fpout, "  (void *) %s_VS_shaderPtr,\n", basename);

        // The mode is emitted by name; shaderModeName is indexed without a range check.
        fprintf(fpout, "  %s,\n", shaderModeName[(u32)pShaders->pVertexShader->shaderMode]);

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->numUniformBlocks);
        if (pShaders->pVertexShader->numUniformBlocks) {
            fprintf(fpout, "  %s_VS_uniform_blocks,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->numUniforms);
        if (pShaders->pVertexShader->numUniforms) {
            fprintf(fpout, "  %s_VS_uniforms,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->numInitialValues);
        if (pShaders->pVertexShader->numInitialValues) {
            fprintf(fpout, "  (GX2UniformInitialValue *) %s_VS_initial_values,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->_numLoops);
        if (pShaders->pVertexShader->_numLoops) {
            fprintf(fpout, "  (_GX2LoopVar *) %s_VS_loop_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->numSamplers);
        if (pShaders->pVertexShader->numSamplers) {
            fprintf(fpout, "  %s_VS_sampler_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pVertexShader->numAttribs);
        if (pShaders->pVertexShader->numAttribs) {
            fprintf(fpout, "  %s_VS_attrib_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        // NOTE(review): the next two values are cast to u32 but printed with %d.
        fprintf(fpout, "  %d,\n", (u32)pShaders->pVertexShader->ringItemsize);

        fprintf(fpout, "  (GX2Boolean)%d,\n", (u32)pShaders->pVertexShader->hasStreamOut);

        // Stream-out strides, one word per stream-out buffer.
        fprintf(fpout, "  { \n");
        _GFDWriteWordsAsHex(fpout, (u32 *) &pShaders->pVertexShader->streamOutVertexStride, sizeof(u32)*GX2_MAX_STREAMOUT_BUFFERS);
        fprintf(fpout, "\n  },\n");

        // end of structure
        fprintf(fpout, "};\n\n\n");
    }

    // For Pixel Shader
    if(NULL != pShaders->pPixelShader )
    {
        fprintf(fpout, "// ---------- %s Pixel Shader ----------\n\n", basename);

        // First, write out initial values
        // (because uniforms refer to them)
        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_PS_initial_values", BASEMAX),
                             (u32 *) pShaders->pPixelShader->initialValues,
                             pShaders->pPixelShader->numInitialValues * sizeof(GX2UniformInitialValue), "");

        // Swap endian for GPU7
        if(swapMode != GFD_ENDIAN_SWAP_MODE_8_IN_32)
        {
            // Word count, rounding the byte size up to a whole word.
            int nElem = ( pShaders->pPixelShader->shaderSize + 0x3) / 4;
            GFDEndianSwap8in32((u32*) pShaders->pPixelShader->shaderPtr, nElem);
        }

        // Then, write out the shader pieces,
        // and finally, write the shader itself.
        // NOTE(review): the vertex and geometry paths pass "ALIGNVAR(256)" as
        // the attribute here - confirm "SET_STRUCT_ALIGN(256)" is intended.
        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_PS_shaderPtr", BASEMAX),
                             (u32 *) pShaders->pPixelShader->shaderPtr,
                             pShaders->pPixelShader->shaderSize, "SET_STRUCT_ALIGN(256)");

        _GFDGX2UniformBlockAsCode(fpout, makeName(tempname, basename, "_PS", BASEMAX),
                                  pShaders->pPixelShader->numUniformBlocks,
                                  pShaders->pPixelShader->uniformBlocks);

        _GFDWriteGX2UniformsAsCode(fpout, makeName(tempname, basename, "_PS", BASEMAX),
                                   pShaders->pPixelShader->numUniforms,
                                   pShaders->pPixelShader->uniformVars, pShaders->pPixelShader->initialValues);

        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_PS_loop_vars", BASEMAX),
                             (u32 *) pShaders->pPixelShader->_loopVars,
                             pShaders->pPixelShader->_numLoops * sizeof(GFDLoopVar), "");

        _GFDWriteGX2SamplersAsCode(fpout, makeName(tempname, basename, "_PS", BASEMAX),
                                   pShaders->pPixelShader->numSamplers,
                                   pShaders->pPixelShader->samplerVars);

        // Structure initializer. Each count is followed by the name of the
        // matching array emitted above, or NULL when the count is zero.
        fprintf(fpout, "\n");
        fprintf(fpout, "\n");
        fprintf(fpout, "static GX2PixelShader %s_PS = {\n", basename);
        fprintf(fpout, "  { // _regs\n");
        _GFDWriteWordsAsHex(fpout, (u32 *) &pShaders->pPixelShader->_regs, sizeof(GFDPixelShaderRegs));
        fprintf(fpout, "\n  },\n");

        fprintf(fpout, "  %u,\n", pShaders->pPixelShader->shaderSize);
        fprintf(fpout, "  (void *) %s_PS_shaderPtr,\n", basename);

        // The mode is emitted by name; shaderModeName is indexed without a range check.
        fprintf(fpout, "  %s,\n", shaderModeName[(u32)pShaders->pPixelShader->shaderMode]);

        fprintf(fpout, "  %u,\n", pShaders->pPixelShader->numUniformBlocks);
        if (pShaders->pPixelShader->numUniformBlocks) {
            fprintf(fpout, "  %s_PS_uniform_blocks,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pPixelShader->numUniforms);
        if (pShaders->pPixelShader->numUniforms) {
            fprintf(fpout, "  %s_PS_uniforms,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pPixelShader->numInitialValues);
        if (pShaders->pPixelShader->numInitialValues) {
            fprintf(fpout, "  (GX2UniformInitialValue *) %s_PS_initial_values,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pPixelShader->_numLoops);
        if (pShaders->pPixelShader->_numLoops) {
            fprintf(fpout, "  (GFDLoopVar *) %s_PS_loop_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pPixelShader->numSamplers);
        if (pShaders->pPixelShader->numSamplers) {
            fprintf(fpout, "  %s_PS_sampler_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        // end of structure
        fprintf(fpout, "};\n\n\n");
    }

    // For Geometry Shader
    if(NULL != pShaders->pGeometryShader )
    {
        fprintf(fpout, "// ---------- %s Geometry Shader ----------\n\n", basename);

        // First, write out initial values
        // (because uniforms refer to them)
        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_GS_initial_values", BASEMAX),
                             (u32 *) pShaders->pGeometryShader->initialValues,
                             pShaders->pGeometryShader->numInitialValues * sizeof(GX2UniformInitialValue), "");
        // Swap endian for GPU7
        if(swapMode != GFD_ENDIAN_SWAP_MODE_8_IN_32)
        {
            // Word count, rounding the byte size up to a whole word.
            int nElem = ( pShaders->pGeometryShader->shaderSize + 0x3) / 4;
            GFDEndianSwap8in32((u32*) pShaders->pGeometryShader->shaderPtr, nElem);
        }

        // Then, write out the shader pieces,
        // and finally, write the shader itself.
        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_GS_shaderPtr", BASEMAX),
                             (u32 *) pShaders->pGeometryShader->shaderPtr,
                             pShaders->pGeometryShader->shaderSize, "ALIGNVAR(256)");

        // The geometry shader also carries a copy shader, handled the same way.
        // Swap endian for GPU7
        if(swapMode != GFD_ENDIAN_SWAP_MODE_8_IN_32)
        {
            int nElem = ( pShaders->pGeometryShader->copyShaderSize + 0x3) / 4;
            GFDEndianSwap8in32((u32*) pShaders->pGeometryShader->copyShaderPtr, nElem);
        }

        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_GS_copyShaderPtr", BASEMAX),
                             (u32 *) pShaders->pGeometryShader->copyShaderPtr,
                             pShaders->pGeometryShader->copyShaderSize, "ALIGNVAR(256)");

        _GFDGX2UniformBlockAsCode(fpout, makeName(tempname, basename, "_GS", BASEMAX),
                                  pShaders->pGeometryShader->numUniformBlocks,
                                  pShaders->pGeometryShader->uniformBlocks);

        _GFDWriteGX2UniformsAsCode(fpout, makeName(tempname, basename, "_GS", BASEMAX),
                                   pShaders->pGeometryShader->numUniforms,
                                   pShaders->pGeometryShader->uniformVars, pShaders->pGeometryShader->initialValues);

        _GFDWriteWordsAsCode(fpout, makeName(tempname, basename, "_GS_loop_vars", BASEMAX),
                             (u32 *) pShaders->pGeometryShader->_loopVars,
                             pShaders->pGeometryShader->_numLoops * sizeof(GFDLoopVar), "");

        _GFDWriteGX2SamplersAsCode(fpout, makeName(tempname, basename, "_GS", BASEMAX),
                                   pShaders->pGeometryShader->numSamplers,
                                   pShaders->pGeometryShader->samplerVars);

        // Structure initializer. Each count is followed by the name of the
        // matching array emitted above, or NULL when the count is zero.
        fprintf(fpout, "\n");
        fprintf(fpout, "\n");
        fprintf(fpout, "static GX2GeometryShader %s_GS = {\n", basename);
        fprintf(fpout, "  { // _regs\n");
        _GFDWriteWordsAsHex(fpout, (u32 *) &pShaders->pGeometryShader->_regs, sizeof(GFDGeometryShaderRegs));
        fprintf(fpout, "\n  },\n");

        // NOTE(review): the copy shader pointer is emitted by name even when
        // copyShaderSize is zero, in which case _GFDWriteWordsAsCode above
        // emitted no such array - confirm a copy shader is always present.
        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->shaderSize);
        fprintf(fpout, "  (void *) %s_GS_shaderPtr,\n", basename);
        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->copyShaderSize);
        fprintf(fpout, "  (void *) %s_GS_copyShaderPtr,\n", basename);
        fprintf(fpout, "  %s,\n", shaderModeName[(u32)pShaders->pGeometryShader->shaderMode]);

        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->numUniformBlocks);
        if (pShaders->pGeometryShader->numUniformBlocks) {
            fprintf(fpout, "  %s_GS_uniform_blocks,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->numUniforms);
        if (pShaders->pGeometryShader->numUniforms) {
            fprintf(fpout, "  %s_GS_uniforms,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->numInitialValues);
        if (pShaders->pGeometryShader->numInitialValues) {
            fprintf(fpout, "  (GX2UniformInitialValue *) %s_GS_initial_values,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->_numLoops);
        if (pShaders->pGeometryShader->_numLoops) {
            fprintf(fpout, "  (GFDLoopVar *) %s_GS_loop_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        fprintf(fpout, "  %u,\n", pShaders->pGeometryShader->numSamplers);
        if (pShaders->pGeometryShader->numSamplers) {
            fprintf(fpout, "  %s_GS_sampler_vars,\n", basename);
        } else {
            fprintf(fpout, "  NULL,\n");
        }

        // NOTE(review): the next two values are cast to u32 but printed with %d.
        fprintf(fpout, "  %d,\n", (u32)pShaders->pGeometryShader->ringItemsize);

        fprintf(fpout, "  (GX2Boolean)%d,\n", (u32)pShaders->pGeometryShader->hasStreamOut);

        // Stream-out strides, one word per stream-out buffer.
        fprintf(fpout, "  { \n");
        _GFDWriteWordsAsHex(fpout, (u32 *) &pShaders->pGeometryShader->streamOutVertexStride, sizeof(u32)*GX2_MAX_STREAMOUT_BUFFERS);
        fprintf(fpout, "\n  },\n");

        // end of structure
        fprintf(fpout, "};\n\n\n");
    }
    // The result of GFDCloseFile is not checked; success is reported regardless.
    GFDCloseFile(fpout);

    return true;
}
#endif

#pragma warning ( pop )
