Issue with pitch and 3D arrays

Mar 5, 2015 at 2:26 PM
I'm attempting to port some of our CUDA code to managedCUDA.

One of the kernels we use is as follows:
    __global__ void Transpose(float* data, int pitch, int volDimXY, int volDimZ)
    {
        int vi = (blockIdx.x * blockDim.x) + threadIdx.x;        // vi is [0 to 512] of reconstruction volume
        int vj = (blockIdx.y * blockDim.y) + threadIdx.y;        // vj is [0 to 512] of reconstruction volume
        
        if (( vi >= volDimXY ) || ( vj >= volDimXY )) return;
        
        for (int slice = 0; slice < volDimZ; slice++)
        {
            float *zslice = data + slice* pitch * volDimXY;
            
            if (vi > vj)
            {
                float temp = zslice[vi + (vj * pitch)];
                zslice[vi+vj*pitch] = zslice[vj + (vi * pitch)]; // memory scattering
                zslice[vj+vi*pitch] = temp;
            }
        }
    }
#define BLK_DIM_X 32
#define BLK_DIM_Y 32

int _VolDimX = 1024;
int _VolDimY = 1024;
int _VolDimZ  = 192;
The routine is called as follows:
                    ChannelDesc = cudaCreateChannelDesc<float>();
            
                    cudaPitchedPtr _VolumeDataGPU.ptr = NULL;

                    // Extent variable for 3D pitched data
                    _VolumeExtPitched = make_cudaExtent( _VolDimX * sizeof( float ), _VolDimY,  _VolDimZ );

                    // Allocate 3D pitched data in GPU
                    cudaMalloc3D( &_VolumeDataGPU, _VolumeExtPitched );
                    if( cudaSuccess != Log::WriteCudaErrorLog("VolumeBuilder : CreateInstance : cudaMalloc3D _VolumeDataGPU : "))
                    {
                        throw new exception( "VolumeBuilder : CreateInstance : cudaMalloc3D() failed" );
                    }

                    // extent for 3D array
                    _VolumeExtArray = make_cudaExtent(_VolDimX,_VolDimY,_VolDimZ  );
                    // allocate 3D array in GPU
                    cudaMalloc3DArray( &_VolDataGPUArray, &_ChannelDesc, _VolumeExtArray );
                    if( cudaSuccess != Log::WriteCudaErrorLog("VolumeBuilder : CreateInstance : cudaMalloc3DArray _VolDataGPUArray : "))
                    {
                        throw new exception( "VolumeBuilder : CreateInstance: cudaMalloc3DArray() failed" );
                    }

                    // Create CPU volume variable                   
                    float* _VolumeData = new float[_VolDimX * _VolDimY * _VolDimZ];

                    // Initialize CPU volume variable
                    memset( _VolumeData, 0, _VolDimX * _VolDimY * _VolDimZ * sizeof( float ));

                    // Copy 3D reconstructed volume to the GPU
                    cudaMemcpy3DParms copyParam = { 0 };
                    copyParam.srcPtr = make_cudaPitchedPtr(( void* )_VolumeData, 
                        _VolumeExtPitched.width, 
                        _VolumeExtPitched.width /  sizeof( float ), 
                        _VolumeExtPitched.height );
                    copyParam.dstPtr.ptr = _VolumeDataGPU.ptr;
                    copyParam.dstPtr.pitch = _VolumeDataGPU.pitch;
                    copyParam.dstPtr.xsize = _VolDimX;
                    copyParam.dstPtr.ysize = _VolDimY;
                    copyParam.extent.width = _VolDimX * sizeof( float );
                    copyParam.extent.height = _VolDimY;
                    copyParam.extent.depth = _VolDimZ;
                    copyParam.kind = cudaMemcpyHostToDevice;
                    cudaMemcpy3D( &copyParam );

dim3 dimBlock( BLK_DIM_X,BLK_DIM_Y,1 );

dim3 dimGrid( iDivUp( _VolDimX, dimBlock.x ), iDivUp( _VolDimY, dimBlock.y ), 1 );

Transpose<<<dimGrid, dimBlock>>>(( float* )_VolumeDataGPU.ptr,_VolumeDataGPU.pitch / sizeof( float ), 
                                _VolDimX, _VolDimZ);
I'm not sure how to duplicate the pitch associated with the 3D alloc. How can I calculate the ideal pitch, create the 3D array on the GPU with the pitch, and pass this to the transpose kernel?
Coordinator
Mar 5, 2015 at 5:52 PM
I guess you are looking for CudaPitchedDeviceVariable.

in your case this would give something like:
int _VolDimX = 64, _VolDimY = 64,  _VolDimZ = 64;
//Alloc a cudaArray (cudaMalloc3DArray in Cuda runtime API)
CudaArray3D _VolDataGPUArray = new CudaArray3D(CUArrayFormat.Float, _VolDimX, _VolDimY, _VolDimZ, CudaArray3DNumChannels.One, CUDAArray3DFlags.None);
//Alloc a pitched device variable (note _VolDimY * _VolDimZ for the height parameter to get volumetric array)
CudaPitchedDeviceVariable<float> _VolumeDataGPU = new CudaPitchedDeviceVariable<float>(_VolDimX, _VolDimY * _VolDimZ);
//host arrays
float[] _VolumeData = new float[_VolDimX * _VolDimY * _VolDimZ];
float[] _VolumeData2 = new float[_VolDimX * _VolDimY * _VolDimZ];

//fill one array with some meaningfull data
for (int i = 0; i < _VolDimX * _VolDimY * _VolDimZ; i++)
{
    _VolumeData[i] = i;
}

//Copy from host to pitched linear device memory
_VolumeDataGPU.CopyToDevice(_VolumeData);
//Copy from linear device memory to cuda array
_VolDataGPUArray.CopyFromDeviceToThis(_VolumeDataGPU.DevicePointer, sizeof(float), _VolumeDataGPU.Pitch);
//Copy from cuda array to host
_VolDataGPUArray.CopyFromThisToHost<float>(_VolumeData2);

//Check if everything is OK...
for (int i = 0; i < _VolDimX * _VolDimY * _VolDimZ; i++)
{
    if (_VolumeData[i] != _VolumeData2[i])
        throw new Exception("Bad things happened!");
}

//run a kernel on pitched linear memory
kernel.Run(_VolumeDataGPU.DevicePointer,(int)_VolumeDataGPU.Pitch / sizeof( float ), _VolDimX, _VolDimZ);
Of course all the other copy directions are possible, too.
And don't forget that the Pitch member of CudaPitchedDeviceVariable is of type SizeT, i.e. 8 bytes of size on 64-bit systems, but your kernel only takes an int as argument. So either use size_t in your kernel or always cast to int when calling the kernel.

Hope it helps,
Michael
Mar 5, 2015 at 8:54 PM
Thanks Michael for the quick response. This is perfect.

Thanks,
Mike