// // test cuda memory bandwidth performance // // cuda kernel version 0.04 created on 9 february 2015 by Skybuck Flying // extern "C" { // extern c begin __global__ void KernelBandwidth(float4 * ParaMemoryBlock, long long ParaMemoryBlockCount) { // calculate linear memory index // int MemoryIndex; // alternative ways to calculate MemoryIndex /* // Method 1: // uses 9 registers for sm_10 // uses 7 registers for sm_20 // 16 instructions MemoryIndex = (threadIdx.x) + (threadIdx.y * blockDim.x) + (threadIdx.z * blockDim.x * blockDim.y) + (blockIdx.x * blockDim.x * blockDim.y * blockDim.z) + (blockIdx.y * blockDim.x * blockDim.y * blockDim.z * gridDim.x) + (blockIdx.z * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y); MemoryIndex = MemoryIndex % ParaMemoryBlockCount; */ // Method 2: // uses 4 registers for sm_10 // uses 5 registers for sm_20 // still needs to be verified, but looks correct to me ;) // 16 instructions /* MemoryIndex = threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * ( threadIdx.z + blockDim.z * ( blockIdx.x + gridDim.x * ( blockIdx.y + gridDim.y * ( blockIdx.z ) ) ) ) ); MemoryIndex = MemoryIndex % ParaMemoryBlockCount; */ /* // Method 3: // uses 8 registers for sm_10 // uses 5 registers for sm_20 int LinearDimension; // 16 instructions MemoryIndex = threadIdx.x; LinearDimension = blockDim.x; MemoryIndex = MemoryIndex + threadIdx.y * LinearDimension; LinearDimension = LinearDimension * blockDim.y; MemoryIndex = MemoryIndex + threadIdx.z * LinearDimension; LinearDimension = LinearDimension * blockDim.z; MemoryIndex = MemoryIndex + blockIdx.x * LinearDimension; LinearDimension = LinearDimension * gridDim.x; MemoryIndex = MemoryIndex + blockIdx.y * LinearDimension; LinearDimension = LinearDimension * gridDim.y; MemoryIndex = MemoryIndex + blockIdx.z * LinearDimension; // LinearDimension = LinearDimension * gridDim.z; MemoryIndex = MemoryIndex % ParaMemoryBlockCount; */ /* // Method 4, 64 bit safe : long long MemoryIndex; long long LinearDimension; long long Variable; // 31 instructions MemoryIndex = threadIdx.x; LinearDimension = blockDim.x; Variable = threadIdx.y; MemoryIndex = MemoryIndex + Variable * LinearDimension; Variable = blockDim.y; LinearDimension = LinearDimension * Variable; Variable = threadIdx.z; MemoryIndex = MemoryIndex + Variable * LinearDimension; Variable = blockDim.z; LinearDimension = LinearDimension * Variable; Variable = blockIdx.x; MemoryIndex = MemoryIndex + Variable * LinearDimension; Variable = gridDim.x; LinearDimension = LinearDimension * Variable; Variable = blockIdx.y; MemoryIndex = MemoryIndex + Variable * LinearDimension; Variable = gridDim.y; LinearDimension = LinearDimension * Variable; Variable = blockIdx.z; MemoryIndex = MemoryIndex + Variable * LinearDimension; // Variable = gridDim.z // LinearDimension = LinearDimension * Variable; Variable = ParaMemoryBlockCount; // make sure memory index lies within memory block range. MemoryIndex = MemoryIndex % Variable; */ // Method 5: // Same as above except using typecasts, seems to produce same ptx result. long long MemoryIndex; long long LinearDimension; // 31 instructions MemoryIndex = threadIdx.x; LinearDimension = blockDim.x; MemoryIndex = MemoryIndex + (long long)(threadIdx.y) * LinearDimension; LinearDimension = LinearDimension * (long long)(blockDim.y); MemoryIndex = MemoryIndex + (long long)(threadIdx.z) * LinearDimension; LinearDimension = LinearDimension * (long long)(blockDim.z); MemoryIndex = MemoryIndex + (long long)(blockIdx.x) * LinearDimension; LinearDimension = LinearDimension * (long long)(gridDim.x); MemoryIndex = MemoryIndex + (long long)(blockIdx.y) * LinearDimension; LinearDimension = LinearDimension * (long long)(gridDim.y); MemoryIndex = MemoryIndex + (long long)(blockIdx.z) * LinearDimension; // LinearDimension = LinearDimension * (long long)(gridDim.z); MemoryIndex = MemoryIndex % (long long)(ParaMemoryBlockCount); /* // Method 6 (bad): // Use mod to keep values in range of memory block count // Little bit less safe... should be ok for 128 MB range. // unsafe, can still overflow easily I think ! ;) // 32 instructions int MemoryIndex; int LinearDimension; // 16 instructions MemoryIndex = threadIdx.x; LinearDimension = blockDim.x; MemoryIndex = (MemoryIndex + threadIdx.y * LinearDimension) % ParaMemoryBlockCount; LinearDimension = (LinearDimension * blockDim.y) % ParaMemoryBlockCount; MemoryIndex = (MemoryIndex + threadIdx.z * LinearDimension) % ParaMemoryBlockCount; LinearDimension = (LinearDimension * blockDim.z) % ParaMemoryBlockCount; MemoryIndex = (MemoryIndex + blockIdx.x * LinearDimension) % ParaMemoryBlockCount; LinearDimension = (LinearDimension * gridDim.x) % ParaMemoryBlockCount; MemoryIndex = (MemoryIndex + blockIdx.y * LinearDimension) % ParaMemoryBlockCount; LinearDimension = (LinearDimension * gridDim.y) % ParaMemoryBlockCount; MemoryIndex = (MemoryIndex + blockIdx.z * LinearDimension) % ParaMemoryBlockCount; // LinearDimension = (LinearDimension * gridDim.z) % ParaMemoryBlockCount;; */ float4 Memory; Memory = ParaMemoryBlock[MemoryIndex]; if (Memory.x == 12345) ParaMemoryBlock[0].x = 0; // Memory.x = Memory.x + 1; // ParaMemoryBlock[MemoryIndex] = Memory; } } // extern c end