﻿/*--------------------------------------------------------------------------------*
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#extension GL_ARB_shader_storage_buffer_object : enable

#ifdef NN_G3D_COMPUTE_SHADER

// Flat index of this invocation inside its work group; main() uses it to address temp[].
#define LDS_INDEX ( gl_LocalInvocationIndex )
// Number of work groups dispatched in X and Y; main() maps one work group to one tile.
#define TILE_DIM  ( gl_NumWorkGroups.xy )

// Number of sample positions per tile along each axis (selected through the "@@" annotations).
#define TILE_RESOLUTION_X ( 64 ) // @@ id="tile_resolution_x" choice="8, 16, 32, 64, 128, 256" default="64" type="dynamic"
#define TILE_RESOLUTION_Y ( 64 ) // @@ id="tile_resolution_y" choice="8, 16, 32, 64, 128, 256" default="64" type="dynamic"

// Total number of sample positions in one tile; main() divides the tile sum by this value.
#define TILE_RESOLUTION    ( TILE_RESOLUTION_X * TILE_RESOLUTION_Y )

// Samples each invocation accumulates per axis: tile resolution divided by the
// 8 x 8 work-group size declared by the local_size layout below.
#define THREAD_WORK_X      ( TILE_RESOLUTION_X / 8 )         // 1, 2, 4, 8, 16, 32
#define THREAD_WORK_Y      ( TILE_RESOLUTION_Y / 8 )         // 1, 2, 4, 8, 16, 32
#define THREAD_WORK        ( THREAD_WORK_X * THREAD_WORK_Y ) // 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024

// Integer log2 built from constant expressions: SAR_COUNTn(x) counts how many of the
// right-shift amounts 1..n still leave x greater than zero, so LOG2(x) equals
// floor(log2(x)) for x >= 1 (and 0 for x == 0). main() uses it to turn a division by
// THREAD_WORK_X into a shift.
#define SAR_COUNT2(x)  (int(((x)>>1)>0)   + int(((x)>>2)>0))
#define SAR_COUNT4(x)  (SAR_COUNT2(x)     + SAR_COUNT2((x)>>2))
#define SAR_COUNT8(x)  (SAR_COUNT4(x)     + SAR_COUNT4((x)>>4))
#define SAR_COUNT16(x) (SAR_COUNT8(x)     + SAR_COUNT8((x)>>8))
#define SAR_COUNT32(x) (SAR_COUNT16(x)    + SAR_COUNT16((x)>>16))
#define LOG2(x)        (SAR_COUNT32(x))

// Each work group runs 8 x 8 x 1 = 64 invocations.
layout( local_size_x = 8, local_size_y = 8, local_size_z = 1 ) in;

// Texture that main() samples and accumulates per tile.
uniform sampler2D targetTexture; // @@ id="TargetTexture"

// Output storage buffer: main() writes one vec4 per work group at index
// gl_WorkGroupID.x + gl_WorkGroupID.y * gl_NumWorkGroups.x.
layout( std140 ) buffer ExpBuf	// @@ id="exp_buf" type="none"
{
    vec4 result[]; // [ gl_NumWorkGroups.x * gl_NumWorkGroups.y ]
};

// Work-group shared scratch array with one vec4 slot per invocation (8 * 8 = 64);
// main() stores per-invocation partial sums here and reduces them.
shared vec4 temp[64];

// Computes the average of targetTexture over one tile per work group and stores it
// in result[].
//
// Each of the 64 invocations first accumulates THREAD_WORK texture samples spaced one
// work-group size (8 x 8 sample positions) apart. The per-invocation partial sums are
// then combined through the shared array temp[] with a tree reduction, and invocation 0
// writes the tile sum divided by TILE_RESOLUTION to
// result[gl_WorkGroupID.x + gl_WorkGroupID.y * gl_NumWorkGroups.x].
void main()
{
    uvec2 tile_resolution = uvec2(TILE_RESOLUTION_X, TILE_RESOLUTION_Y);

    // UV step between two adjacent sample positions over the whole dispatch
    // (number of tiles times samples per tile along each axis).
    // NOTE(review): sample coordinates are integer multiples of uvDelta with no
    // half-step offset; confirm this is intended relative to texel centers and the
    // sampler's filter mode.
    vec2 uvDelta = vec2(1.0, 1.0) / (TILE_DIM * tile_resolution);
    // UV of the first sample owned by this invocation inside this work group's tile.
    vec2 uvBase = (tile_resolution * gl_WorkGroupID.xy + gl_LocalInvocationID.xy) * uvDelta;
    // UV step between consecutive samples of the same invocation (one work-group size).
    vec2 uvThreadDelta = gl_WorkGroupSize.xy * uvDelta;

    vec4 thread_sum = texture(targetTexture, uvBase);
    for(int i = 1; i < THREAD_WORK; ++i)
    {
        // Split the flat sample counter into (column, row): THREAD_WORK_X is a power
        // of two, so the mask is i % THREAD_WORK_X and the shift is i / THREAD_WORK_X.
        thread_sum += texture(targetTexture, uvBase + vec2(i & (THREAD_WORK_X - 1), i >> LOG2(THREAD_WORK_X)) * uvThreadDelta);
    }

    temp[LDS_INDEX] = thread_sum;
    barrier();

    // Tree reduction over the 64 partial sums. In every step only the lower `stride`
    // invocations accumulate, so each slot is either read or written during a step
    // (never both by different invocations) and no index goes past temp[63].
    // barrier() stays outside the conditional because every invocation of the work
    // group has to reach it.
    for(uint stride = 32u; stride > 0u; stride >>= 1u)
    {
        if(LDS_INDEX < stride)
        {
            temp[LDS_INDEX] = temp[LDS_INDEX] + temp[LDS_INDEX + stride];
        }
        barrier();
    }

    if(LDS_INDEX == 0u)
    {
        // temp[0] now holds the sum over the whole tile; scale it to the mean.
        result[gl_WorkGroupID.x + gl_WorkGroupID.y * gl_NumWorkGroups.x] = temp[LDS_INDEX] * (1.0 / TILE_RESOLUTION);
    }
}

#endif
