// test cuda random memory access performance
//
// cuda kernel version 0.03 created on 12 july 2011 by Skybuck Flying
//

extern "C"
{ // extern c begin

// Earlier variant kept for reference: sizes as __constant__ symbols instead
// of kernel arguments.
// __constant__ int LoopCount = 80000;
// __constant__ int BlockCount = 2048;
// __constant__ int ElementCount = 8192;

/*
 * Kernel - follows a chain of dependent loads through global memory.
 *
 * Terminology: a "block" here is a DATA block (a slice of ElementCount ints
 * inside Memory), not a CUDA thread block. Every thread of the launch owns
 * one data block, selected by the thread's flattened global index.
 *
 * Parameters:
 *   ElementCount - number of ints in each data block (slice stride in Memory).
 *   BlockCount   - number of data blocks; threads whose flattened index is
 *                  >= BlockCount do nothing.
 *   LoopCount    - number of dependent loads each thread performs.
 *   Memory       - BlockCount * ElementCount ints. Each value is used,
 *                  unchecked, as the next element index inside the same
 *                  slice, so values outside [0, ElementCount) make the
 *                  thread read outside its own slice.
 *   BlockResult  - at least BlockCount ints; entry i receives the last
 *                  element index loaded by thread i (0 when LoopCount <= 0).
 *
 * Launch layout: any 1D/2D/3D grid and block shape; all six thread/block
 * coordinates are folded into one linear index. No shared memory is used
 * and there is no synchronisation between threads.
 *
 * Memory is volatile so that every load in the chain is really issued; the
 * non-volatile signature is kept below for comparison runs.
 */
__global__ void Kernel( int ElementCount, int BlockCount, int LoopCount, volatile int *Memory, volatile int *BlockResult )
//__global__ void Kernel( int ElementCount, int BlockCount, int LoopCount, int *Memory, int *BlockResult )
{
    // Alternative ways to calculate the flattened index, kept for reference.
    // The register counts are the original author's measurements of the
    // 32-bit forms.
    //
    // Fully expanded sum:
    //   uses 9 registers for sm_10
    //   uses 7 registers for sm_20
    /*
    BlockIndex =
        (threadIdx.x) +
        (threadIdx.y * blockDim.x) +
        (threadIdx.z * blockDim.x * blockDim.y) +
        (blockIdx.x * blockDim.x * blockDim.y * blockDim.z) +
        (blockIdx.y * blockDim.x * blockDim.y * blockDim.z * gridDim.x) +
        (blockIdx.z * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y);
    */
    //
    // Running product of dimensions:
    //   uses 8 registers for sm_10
    //   uses 5 registers for sm_20
    /*
    int LinearDimension;
    BlockIndex = threadIdx.x;
    LinearDimension = blockDim.x;
    BlockIndex = BlockIndex + threadIdx.y * LinearDimension;
    LinearDimension = LinearDimension * blockDim.y;
    BlockIndex = BlockIndex + threadIdx.z * LinearDimension;
    LinearDimension = LinearDimension * blockDim.z;
    BlockIndex = BlockIndex + blockIdx.x * LinearDimension;
    LinearDimension = LinearDimension * gridDim.x;
    BlockIndex = BlockIndex + blockIdx.y * LinearDimension;
    LinearDimension = LinearDimension * gridDim.y;
    BlockIndex = BlockIndex + blockIdx.z * LinearDimension;
    LinearDimension = LinearDimension * gridDim.z;
    */

    // Nested (Horner-style) flattening of thread and block coordinates.
    // The 32-bit form used 4 registers for sm_10 and 5 for sm_20.
    // Evaluated in 64 bits: the built-in coordinates are unsigned 32-bit, so
    // the 32-bit product wraps on very large launches and, once stored in an
    // int, can turn negative and slip past the BlockCount guard.
    const long long BlockIndex =
        threadIdx.x + (long long)blockDim.x * (
        threadIdx.y + (long long)blockDim.y * (
        threadIdx.z + (long long)blockDim.z * (
        blockIdx.x  + (long long)gridDim.x  * (
        blockIdx.y  + (long long)gridDim.y  * (
        blockIdx.z ) ) ) ) );

    // Surplus threads (grid larger than the number of data blocks) exit here.
    if (BlockIndex >= BlockCount)
    {
        return;
    }

    // Offset of this thread's slice inside Memory. 64-bit so that
    // BlockCount * ElementCount may exceed 2^31 elements without overflow.
    const long long BlockBase = BlockIndex * ElementCount;

    int ElementIndex = 0;

    for (int LoopIndex = 0; LoopIndex < LoopCount; LoopIndex++)
    {
        // get next element index; the address depends on the previous load,
        // so the loads of one thread cannot overlap each other
        ElementIndex = Memory[ BlockBase + ElementIndex ];
    }

    // each block should output the last element index for a check up and to
    // prevent the kernel from being reduced away ! ;)
    BlockResult[ BlockIndex ] = ElementIndex;
}

} // extern c end