//
// test cuda memory bandwidth performance
//
// cuda kernel version 0.04 created on 9 february 2015 by Skybuck Flying
//

extern "C"
{ // extern c begin

// KernelBandwidth
//
// Every thread loads exactly one float4 element from ParaMemoryBlock.
//
// Launch layout: any 1D, 2D or 3D grid of 1D, 2D or 3D blocks. All six
// launch dimensions are folded into a single linear thread index, which is
// then wrapped into [0, ParaMemoryBlockCount) so that launches with more
// threads than elements stay inside the buffer.
//
// Shared memory: none.
//
// Parameters:
//   ParaMemoryBlock      - buffer of float4 elements; element 0 may be
//                          written (see the conditional store below).
//   ParaMemoryBlockCount - number of float4 elements in ParaMemoryBlock.
//                          When it is <= 0 the kernel returns without
//                          touching memory.
__global__ void KernelBandwidth(float4 * ParaMemoryBlock, int ParaMemoryBlockCount)
{
    // With no elements there is nothing valid to read, and a modulo by zero
    // would yield an undefined index.
    if (ParaMemoryBlockCount <= 0)
    {
        return;
    }

    // calculate linear memory index
    //
    // Kept unsigned on purpose: threadIdx/blockDim/blockIdx/gridDim are
    // unsigned, so the arithmetic below is unsigned as well. Storing the
    // result in a signed int could turn it negative for very large launches,
    // and a negative value would survive the modulo and index in front of the
    // buffer. An unsigned value merely wraps, which is harmless because it is
    // reduced modulo the element count right afterwards.
    unsigned int MemoryIndex;

    // alternative ways to calculate MemoryIndex
    // (register and instruction counts are the original author's notes for
    // the signed-int version of this kernel)

    /*
    // Method 1:
    // uses 9 registers for sm_10
    // uses 7 registers for sm_20
    // 16 instructions
    MemoryIndex =
        (threadIdx.x) +
        (threadIdx.y * blockDim.x) +
        (threadIdx.z * blockDim.x * blockDim.y) +
        (blockIdx.x * blockDim.x * blockDim.y * blockDim.z) +
        (blockIdx.y * blockDim.x * blockDim.y * blockDim.z * gridDim.x) +
        (blockIdx.z * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y);
    */

    // Method 2:
    // uses 4 registers for sm_10
    // uses 5 registers for sm_20
    // 16 instructions
    // Nested (Horner) form of Method 1: multiplying out the parentheses
    // gives exactly the six terms listed there.
    MemoryIndex =
        threadIdx.x + blockDim.x *
        (
            threadIdx.y + blockDim.y *
            (
                threadIdx.z + blockDim.z *
                (
                    blockIdx.x + gridDim.x *
                    (
                        blockIdx.y + gridDim.y *
                        (
                            blockIdx.z
                        )
                    )
                )
            )
        );

    /*
    // Method 3:
    // uses 8 registers for sm_10
    // uses 5 registers for sm_20
    unsigned int LinearDimension;

    // 16 instructions
    MemoryIndex = threadIdx.x;
    LinearDimension = blockDim.x;
    MemoryIndex = MemoryIndex + threadIdx.y * LinearDimension;
    LinearDimension = LinearDimension * blockDim.y;
    MemoryIndex = MemoryIndex + threadIdx.z * LinearDimension;
    LinearDimension = LinearDimension * blockDim.z;
    MemoryIndex = MemoryIndex + blockIdx.x * LinearDimension;
    LinearDimension = LinearDimension * gridDim.x;
    MemoryIndex = MemoryIndex + blockIdx.y * LinearDimension;
    LinearDimension = LinearDimension * gridDim.y;
    MemoryIndex = MemoryIndex + blockIdx.z * LinearDimension;
    // LinearDimension = LinearDimension * gridDim.z;
    */

    // make sure memory index lies within memory block range.
    // The count is known to be positive here, so the cast is value-preserving
    // and the unsigned remainder is always in [0, ParaMemoryBlockCount).
    MemoryIndex = MemoryIndex % (unsigned int)ParaMemoryBlockCount;

    float4 Memory;
    Memory = ParaMemoryBlock[MemoryIndex];

    // Data-dependent store: presumably here so the compiler cannot discard
    // the load above as dead code -- TODO confirm with the host-side test.
    if (Memory.x == 12345.0f)
    {
        ParaMemoryBlock[0].x = 0.0f;
    }

    // Memory.x = Memory.x + 1;
    // ParaMemoryBlock[MemoryIndex] = Memory;
}

} // extern c end