// test cuda random memory access performance
//
// cuda kernel version 0.02 created on 12 july 2011 by Skybuck Flying
//

extern "C"
{ // extern c begin

// Kernel: dependent-load ("pointer chasing") walk through global memory.
//
// Memory is treated as BlockCount consecutive blocks of ElementCount ints.
// Each thread whose flattened global index ("BlockIndex") is below BlockCount
// walks its own block: it starts at element 0 and, LoopCount times, replaces
// the current element index with the value stored at that element. The last
// index reached is written to BlockResult[ BlockIndex ].
//
// Parameters:
//   ElementCount - number of ints per block.
//   BlockCount   - number of blocks; threads with an index >= BlockCount do nothing.
//   LoopCount    - number of dependent loads per thread; if <= 0 the thread
//                  performs no loads and writes 0 to BlockResult.
//   Memory       - device pointer, read as BlockCount * ElementCount ints.
//                  The values are used as element indices without validation,
//                  so they are expected to lie in [0, ElementCount).
//   BlockResult  - device pointer, written at indices [0, BlockCount).
//
// Launch layout: any 1D/2D/3D grid of 1D/2D/3D thread blocks; the total thread
// count should be at least BlockCount so that every block is processed.
// No shared memory and no synchronisation is used.
__global__ void Kernel( int ElementCount, int BlockCount, int LoopCount, int *Memory, int *BlockResult )
{
	int BlockIndex;
	int ElementIndex;
	int LoopIndex;

	// uses 9 registers for sm_10
	// alternative ways to calculate BlockIndex
	/*
	BlockIndex =
		(threadIdx.x) +
		(threadIdx.y * blockDim.x) +
		(threadIdx.z * blockDim.x * blockDim.y) +
		(blockIdx.x * blockDim.x * blockDim.y * blockDim.z) +
		(blockIdx.y * blockDim.x * blockDim.y * blockDim.z * gridDim.x) +
		(blockIdx.z * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y);
	*/

	// uses 8 registers for sm_10
	// flatten (threadIdx, blockIdx) into one global thread index by
	// accumulating the running product of the dimensions seen so far
	int LinearDimension;

	BlockIndex = threadIdx.x;
	LinearDimension = blockDim.x;

	BlockIndex = BlockIndex + threadIdx.y * LinearDimension;
	LinearDimension = LinearDimension * blockDim.y;

	BlockIndex = BlockIndex + threadIdx.z * LinearDimension;
	LinearDimension = LinearDimension * blockDim.z;

	BlockIndex = BlockIndex + blockIdx.x * LinearDimension;
	LinearDimension = LinearDimension * gridDim.x;

	BlockIndex = BlockIndex + blockIdx.y * LinearDimension;
	LinearDimension = LinearDimension * gridDim.y;

	// blockIdx.z is the last term, so the running product is not needed afterwards
	BlockIndex = BlockIndex + blockIdx.z * LinearDimension;

	if (BlockIndex < BlockCount)
	{
		// Start of this thread's block. The offset is formed in size_t because
		// BlockIndex * ElementCount overflows a 32 bit int once Memory holds
		// 2^31 or more elements; it is also loop invariant, so compute it once.
		const int *BlockMemory = Memory + (size_t)BlockIndex * ElementCount;

		ElementIndex = 0;

		for (LoopIndex = 0; LoopIndex < LoopCount; LoopIndex++)
		{
			// get next element index
			// (each load depends on the previous one, so the loads are serialised)
			ElementIndex = BlockMemory[ ElementIndex ];
		}

		// each block should output the last element index for a check up and to prevent the kernel from being reduced away ! ;)
		BlockResult[ BlockIndex ] = ElementIndex;
	}
}

} // extern c end