// test cuda random memory access performance
//
// cuda kernel version 0.03 created on 12 july 2011 by Skybuck Flying
//

extern "C" 
{ // extern c begin

// __constant__ int LoopCount = 80000;
// __constant__ int BlockCount = 2048;
// __constant__ int ElementCount = 8192;

// Pointer-chasing kernel used to measure random global memory access.
//
// Memory is indexed as consecutive blocks of ElementCount ints. Every thread
// whose flattened global index is below BlockCount works on the block with
// that index: it starts at element 0 and, LoopCount times, replaces its
// current element index with the value loaded from that element of its block.
// The final element index is written to BlockResult[ BlockIndex ] so the host
// can check it and so the loads cannot be optimized away.
//
// Parameters:
//   ElementCount - number of ints per block (stride between blocks in Memory)
//   BlockCount   - number of blocks to walk; threads at or beyond it do nothing
//   LoopCount    - number of dependent loads per thread; if <= 0 the result is 0
//   Memory       - device pointer to the chained element indices
//   BlockResult  - device pointer receiving one int per processed block
//
// Launch layout: any 1D/2D/3D grid and block shape; all six dimensions are
// folded into a single linear thread index. No shared memory is used.
//
// NOTE(review): the kernel performs no bounds check on the loaded values.
// It assumes Memory holds at least BlockCount * ElementCount ints, that every
// stored value is a valid element index within its block, and that BlockResult
// holds at least BlockCount ints - confirm against the host code.
//
// The register counts recorded for the earlier 32-bit index variants
// (4 registers on sm_10, 5 on sm_20) do not apply to this 64-bit version;
// re-measure if register pressure matters.
__global__ void Kernel( int ElementCount, int BlockCount, int LoopCount, volatile int *Memory, volatile int *BlockResult )
//__global__ void Kernel( int ElementCount, int BlockCount, int LoopCount, int *Memory, int *BlockResult )
{
	int BlockIndex;
	int ElementIndex;
	int LoopIndex;

	// Position of this thread inside its thread block. Bounded by the number
	// of threads per block, so 32-bit unsigned arithmetic cannot wrap here.
	const unsigned int ThreadInBlock =
		threadIdx.x + blockDim.x *
		(
			threadIdx.y + blockDim.y *
			(
				threadIdx.z
			)
		);

	// Threads per block and position of this thread block inside the grid.
	// Done in 64 bits: for large 3D grids the products exceed 32 bits, and a
	// wrapped value could alias a valid block index and slip past the guard.
	const long long ThreadsPerBlock =
		(long long) blockDim.x * blockDim.y * blockDim.z;

	const long long BlockInGrid =
		blockIdx.x + (long long) gridDim.x *
		(
			blockIdx.y + (long long) gridDim.y *
			(
				blockIdx.z
			)
		);

	// Equivalent to the fully expanded form:
	//   threadIdx.x
	//   + threadIdx.y * blockDim.x
	//   + threadIdx.z * blockDim.x * blockDim.y
	//   + blockIdx.x  * blockDim.x * blockDim.y * blockDim.z
	//   + blockIdx.y  * blockDim.x * blockDim.y * blockDim.z * gridDim.x
	//   + blockIdx.z  * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y
	const long long GlobalIndex = ThreadInBlock + ThreadsPerBlock * BlockInGrid;

	if (GlobalIndex < BlockCount)
	{
		// Safe to narrow: GlobalIndex is non-negative and below BlockCount, which is an int.
		BlockIndex = (int) GlobalIndex;

		// Start of this thread's block. The offset is loop-invariant, and the
		// 64-bit product keeps it correct when BlockCount * ElementCount
		// does not fit in an int.
		const volatile int *BlockMemory = Memory + (long long) BlockIndex * ElementCount;

		ElementIndex = 0;

		for (LoopIndex = 0; LoopIndex < LoopCount; LoopIndex++)
		{
			// get next element index; each load depends on the previous one
			ElementIndex = BlockMemory[ ElementIndex ];
		}

		// each block should output the last element index for a check up and to prevent the kernel from being reduced away ! ;)
		BlockResult[ BlockIndex ] = ElementIndex;
	}
}

} // extern c end