--- a/src/CUDAMarchingCubes.cu	2018-03-30 18:52:25.467189457 +0300
+++ b/src/CUDAMarchingCubes.cu	2018-03-30 18:52:02.387136244 +0300
@@ -10,7 +10,7 @@
  *
  *      $RCSfile: CUDAMarchingCubes.cu,v $
  *      $Author: johns $      $Locker:  $      $State: Exp $
- *      $Revision: 1.30 $      $Date: 2016/11/28 03:04:58 $
+ *      $Revision: 1.32 $      $Date: 2018/02/15 05:15:02 $
  *
  ***************************************************************************
  * DESCRIPTION:
@@ -25,14 +25,17 @@
 //
 // Description: This class computes an isosurface for a given density grid
 //              using a CUDA Marching Cubes (MC) algorithm.
-//              The implementation is based on the MC demo from the
-//              Nvidia GPU Computing SDK, but has been improved
-//              and extended.  This implementation achieves higher
-//              performance by reducing the number of temporary memory
-//              buffers, reduces the number of scan calls by using vector
-//              integer types, and allows extraction of per-vertex normals
-//              optionally computes per-vertex colors if provided with a
-//              volumetric texture map.
+//
+//              The implementation is loosely based on the MC demo from
+//              the Nvidia GPU Computing SDK, but the design has been
+//              improved and extended in several ways.
+//
+//              This implementation achieves higher performance by
+//              reducing the number of temporary memory buffers,
+//              reducing the number of scan calls through the use of
+//              vector integer types, and allowing extraction of
+//              per-vertex normals; it optionally computes per-vertex
+//              colors when a volumetric texture map is provided.
 //
 // Author: Michael Krone
 //         John Stone
@@ -48,7 +51,7 @@
 #include
 
 //
-// Restrict macro to make it easy to do perf tuning tess
+// Restrict macro to make it easy to do perf tuning tests
 //
 #if 0
 #define RESTRICT __restrict__
@@ -171,6 +174,11 @@
 texture<float, 3, cudaReadModeElementType> volumeTex;
 
 // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS
+// XXX The sampleVolume() call underperforms vs. peak memory bandwidth
+// because we don't strictly enforce coalescing requirements in the
+// layout of the input volume presently.  If we forced X/Y dims to be
+// warp-multiple it would become possible to use wider fetches and
+// a few other tricks to improve global memory bandwidth
 __device__ float sampleVolume(const float * RESTRICT data,
                               uint3 p, uint3 gridSize) {
     return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
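
For context on the coalescing remark in the sampleVolume() hunk above, here
is a minimal sketch (illustrative only, not VMD code; the padded-layout
function and its paddedX parameter are hypothetical) of the idea the
comment alludes to: padding each X-row to a warp multiple keeps row starts
aligned, so wider vectorized fetches become possible.

    // Illustrative sketch, not VMD code: pad each X-row of the volume to
    // a multiple of the warp size (e.g. 32) so a warp reading consecutive
    // x values issues fully coalesced loads; if paddedX is also a multiple
    // of 4, float4-wide fetches become legal.
    __device__ float sampleVolumePadded(const float *data, uint3 p,
                                        uint3 gridSize, unsigned int paddedX) {
      // paddedX >= gridSize.x, rounded up to a warp multiple
      return data[(p.z * paddedX * gridSize.y) + (p.y * paddedX) + p.x];
    }
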
@@ -592,6 +600,30 @@
     cudaBindTextureToArray(volumeTex, d_vol, desc);
 }
 
+#if CUDART_VERSION >= 9000
+//
+// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when
+//     used with the built-in uint2 vector integer types.  To work around
+//     the problem we have to define our own type and associated conversion
+//     routines etc.
+//
+
+// XXX workaround for uint2 breakage in CUDA 9.0RC
+struct myuint2 : uint2 {
+  __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {}
+  __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {}
+  __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {}
+};
+
+void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
+  const uint2 zero = make_uint2(0, 0);
+  thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2 *) input),
+                         thrust::device_ptr<myuint2>((myuint2 *) input + numElements),
+                         thrust::device_ptr<myuint2>((myuint2 *) output),
+                         (myuint2) zero);
+}
+
+#else
 void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
     const uint2 zero = make_uint2(0, 0);
@@ -601,6 +633,7 @@
                            zero);
 }
+#endif
 
 void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) {
     thrust::inclusive_scan(thrust::device_ptr<float>(input),
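
To see the pattern the wrapper above relies on in isolation, here is a
self-contained sketch (illustrative, not VMD code; the pair2 type and the
values are hypothetical) of a Thrust exclusive scan over a two-component
integer type, the same thing ThrustScanWrapperUint2() does for uint2:

    // Illustrative sketch, not VMD code: exclusive prefix sum over a
    // two-component counter type.  thrust::exclusive_scan defaults to
    // thrust::plus<T>, so the type needs a component-wise operator+.
    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <cstdio>

    struct pair2 {
      unsigned int x, y;
      __host__ __device__ pair2() : x(0), y(0) {}
      __host__ __device__ pair2(unsigned int a, unsigned int b) : x(a), y(b) {}
      __host__ __device__ pair2 operator+(const pair2 &o) const {
        return pair2(x + o.x, y + o.y);  // component-wise sum
      }
    };

    int main() {
      thrust::device_vector<pair2> in(4, pair2(1, 2));  // per-cell counts
      thrust::device_vector<pair2> out(4);
      // out[i] = sum of in[0..i-1]; out[0] is the supplied identity
      thrust::exclusive_scan(in.begin(), in.end(), out.begin(), pair2(0, 0));
      pair2 last = out[3];
      printf("%u %u\n", last.x, last.y);  // prints: 3 6
      return 0;
    }
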
@@ -639,11 +672,9 @@
 }
 
-///////////////////////////////////////////////////////////////////////////////
 //
 // class CUDAMarchingCubes
 //
-///////////////////////////////////////////////////////////////////////////////
 
 CUDAMarchingCubes::CUDAMarchingCubes() {
     // initialize values
@@ -713,9 +744,6 @@
 }
 
-////////////////////////////////////////////////////////////////////////////////
-//! Run the Cuda part of the computation
-////////////////////////////////////////////////////////////////////////////////
 void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) {
     // check if data is available
     if (!this->setdata)
--- a/src/CUDAMDFF.cu	2016-12-01 10:11:56.000000000 +0300
+++ b/src/CUDAMDFF.cu	2018-03-30 18:56:44.352937599 +0300
@@ -11,7 +11,7 @@
  *
  *      $RCSfile: CUDAMDFF.cu,v $
  *      $Author: johns $      $Locker:  $      $State: Exp $
- *      $Revision: 1.75 $      $Date: 2015/04/07 20:41:26 $
+ *      $Revision: 1.78 $      $Date: 2018/02/19 07:10:37 $
  *
  ***************************************************************************
  * DESCRIPTION:
@@ -28,12 +28,16 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <float.h> // FLT_MAX etc
-
+#if CUDART_VERSION >= 9000
+#include <cuda_fp16.h>  // need to explicitly include for CUDA 9.0
+#endif
 #if CUDART_VERSION < 4000
 #error The VMD MDFF feature requires CUDA 4.0 or later
 #endif
 
+#include <float.h> // FLT_MAX etc
+
+
 #include "Inform.h"
 #include "utilities.h"
 #include "WKFThreads.h"
@@ -588,6 +592,43 @@
 }
 
 
+
+// #define VMDUSESHUFFLE 1
+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
+// New warp shuffle-based CC sum reduction for Kepler and later GPUs.
+inline __device__ void cc_sumreduction(int tid, int totaltb,
+                                       float4 &total_cc_sums,
+                                       float &total_lcc,
+                                       int &total_lsize,
+                                       float4 *tb_cc_sums,
+                                       float *tb_lcc,
+                                       int *tb_lsize) {
+  total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+  total_lcc = 0.0f;
+  total_lsize = 0;
+
+  // use precisely one warp to do the final reduction
+  if (tid < warpSize) {
+    for (int i=tid; i<totaltb; i+=warpSize) {
+      total_cc_sums.x += tb_cc_sums[i].x;
+      total_cc_sums.y += tb_cc_sums[i].y;
+      total_cc_sums.z += tb_cc_sums[i].z;
+      total_cc_sums.w += tb_cc_sums[i].w;
+      total_lcc += tb_lcc[i];
+      total_lsize += tb_lsize[i];
+    }
+    // intra-warp parallel sum reduction via XOR butterfly shuffles
+    for (int mask=warpSize/2; mask>0; mask>>=1) {
+      total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask);
+      total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask);
+      total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask);
+      total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask);
+      total_lcc += __shfl_xor_sync(0xffffffff, total_lcc, mask);
+      total_lsize += __shfl_xor_sync(0xffffffff, total_lsize, mask);
+    }
+  }
+}
+#else
+// shared memory based CC sum reduction
 inline __device__ void cc_sumreduction(int tid, int totaltb,
                                        float4 &total_cc_sums,
                                        float &total_lcc,
@@ -629,6 +670,7 @@
   total_lcc = tb_lcc[0];
   total_lsize = tb_lsize[0];
 }
+#endif
 
 
 inline __device__ void thread_cc_sum(float ref, float density,
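
The XOR-butterfly loop in the shuffle-based reduction above is worth seeing
in isolation; here is a self-contained sketch (illustrative, not VMD code)
in which each stage halves the number of distinct partial sums, so after
log2(warpSize) stages every lane holds the warp-wide total:

    // Illustrative sketch, not VMD code: warp-wide sum via __shfl_xor_sync.
    #include <cstdio>

    __inline__ __device__ float warp_sum(float v) {
      for (int mask = warpSize/2; mask > 0; mask >>= 1)
        v += __shfl_xor_sync(0xffffffff, v, mask);
      return v;  // every lane now holds the sum of all 32 lane values
    }

    __global__ void demo(float *out) {
      float total = warp_sum((float) threadIdx.x);  // 0+1+...+31 == 496
      if (threadIdx.x == 0)
        *out = total;
    }

    int main() {
      float *d_out, h_out = 0.0f;
      cudaMalloc(&d_out, sizeof(float));
      demo<<<1, 32>>>(d_out);
      cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      printf("warp sum: %.1f\n", h_out);  // expect 496.0
      cudaFree(d_out);
      return 0;
    }
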
@@ -750,6 +792,92 @@
 }
 
 
+#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
+  // all threads write their local sums to shared memory...
+  __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
+  __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
+  __shared__ float tb_lcc_s[TOTALBLOCKSZ];
+  __shared__ int tb_lsize_s[TOTALBLOCKSZ];
+
+  tb_cc_means_s[tid] = thread_cc_means;
+  tb_cc_squares_s[tid] = thread_cc_squares;
+  tb_lcc_s[tid] = thread_lcc;
+  tb_lsize_s[tid] = thread_lsize;
+  __syncthreads(); // all threads must hit syncthreads call...
+
+  // use precisely one warp to do the thread-block-wide reduction
+  if (tid < warpSize) {
+    float2 tmp_cc_means = make_float2(0.0f, 0.0f);
+    float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
+    float tmp_lcc = 0.0f;
+    int tmp_lsize = 0;
+    for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
+      tmp_cc_means.x += tb_cc_means_s[i].x;
+      tmp_cc_means.y += tb_cc_means_s[i].y;
+      tmp_cc_squares.x += tb_cc_squares_s[i].x;
+      tmp_cc_squares.y += tb_cc_squares_s[i].y;
+      tmp_lcc += tb_lcc_s[i];
+      tmp_lsize += tb_lsize_s[i];
+    }
+
+    // intra-warp parallel sum reduction via XOR butterfly shuffles
+    for (int mask=warpSize/2; mask>0; mask>>=1) {
+      tmp_cc_means.x += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask);
+      tmp_cc_means.y += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask);
+      tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask);
+      tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask);
+      tmp_lcc += __shfl_xor_sync(0xffffffff, tmp_lcc, mask);
+      tmp_lsize += __shfl_xor_sync(0xffffffff, tmp_lsize, mask);
+    }
+
+    // write per-thread-block partial sums to global memory,
+    // if a per-thread-block CC output array is provided, write the
+    // local CC for this thread block out, and finally, check if we
+    // are the last thread block to finish, and finalize the overall
+    // CC results for the entire grid of thread blocks.
+    if (tid == 0) {
+      unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
+                         blockIdx.y * gridDim.x + blockIdx.x;
+
+      tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y,
+                                    tmp_cc_squares.x, tmp_cc_squares.y);
+      tb_lcc[bid] = tmp_lcc;
+      tb_lsize[bid] = tmp_lsize;
+
+      if (tb_CC != NULL) {
+        float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
+                           tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
+                           tb_lsize_s[0], tb_lcc_s[0]);
+
+        // write local per-thread-block CC to global memory
+        tb_CC[bid] = cc;
+      }
+
+      __threadfence();
+
+      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
+      isLastBlockDone = (value == (totaltb - 1));
+    }
+  }
+  __syncthreads();
+
+  if (isLastBlockDone) {
+    float4 total_cc_sums;
+    float total_lcc;
+    int total_lsize;
+    cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
+                    tb_cc_sums, tb_lcc, tb_lsize);
+
+    if (tid == 0) {
+      tb_cc_sums[totaltb] = total_cc_sums;
+      tb_lcc[totaltb] = total_lcc;
+      tb_lsize[totaltb] = total_lsize;
+    }
+
+    reset_atomic_counter(&tbcatomic[0]);
+  }
+
+#else
+
 // all threads write their local sums to shared memory...
 __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
 __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
@@ -794,6 +922,7 @@
   }
   __syncthreads(); // all threads must hit syncthreads call...
 }
+//#endif
 
 // write per-thread-block partial sums to global memory,
 // if a per-thread-block CC output array is provided, write the
@@ -847,6 +976,7 @@
     }
 #endif
   }
+#endif
 }
--- a/src/CUDAQuickSurf.cu	2016-12-01 10:11:56.000000000 +0300
+++ b/src/CUDAQuickSurf.cu	2018-03-30 19:01:38.777196233 +0300
@@ -11,7 +11,7 @@
  *
  *      $RCSfile: CUDAQuickSurf.cu,v $
  *      $Author: johns $      $Locker:  $      $State: Exp $
- *      $Revision: 1.81 $      $Date: 2016/04/20 04:57:46 $
+ *      $Revision: 1.84 $      $Date: 2018/02/15 04:59:15 $
  *
  ***************************************************************************
  * DESCRIPTION:
@@ -22,6 +22,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#if CUDART_VERSION >= 9000
+#include <cuda_fp16.h>  // need to explicitly include for CUDA 9.0
+#endif
 
 #if CUDART_VERSION < 4000
 #error The VMD QuickSurf feature requires CUDA 4.0 or later
@@ -130,14 +133,14 @@
 #define GUNROLL   1
 #endif
 
-#if __CUDA_ARCH__ >= 300
 #define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
-#define MINBLOCKDENS 1
+#if __CUDA_ARCH__ >= 600
+#define MINBLOCKDENS 16
+#elif __CUDA_ARCH__ >= 300
+#define MINBLOCKDENS 16
 #elif __CUDA_ARCH__ >= 200
-#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
 #define MINBLOCKDENS 1
 #else
-#define MAXTHRDENS ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
 #define MINBLOCKDENS 1
 #endif
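
The MINBLOCKDENS changes above pair with enabling __launch_bounds__ on the
density kernels below. For readers unfamiliar with the qualifier, a brief
sketch (illustrative values and kernel, not VMD's) of what it communicates
to the compiler:

    // Illustrative sketch, not VMD code: __launch_bounds__ promises the
    // compiler the kernel never launches with more than MAX_THREADS per
    // block and asks that at least MIN_BLOCKS blocks fit per SM, letting
    // it cap register usage so that occupancy is actually achievable.
    #define MAX_THREADS 256
    #define MIN_BLOCKS  16

    __global__ void __launch_bounds__(MAX_THREADS, MIN_BLOCKS)
    scale_kernel(float *data, float s, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        data[i] *= s;  // trivial body; the qualifier is the point here
    }
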
@@ -150,7 +153,7 @@
 //
 template
 __global__ static void
-// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
 gaussdensity_fast_tex_norm(int natoms,
                            const float4 * RESTRICT sorted_xyzr,
                            const float4 * RESTRICT sorted_color,
@@ -217,6 +220,8 @@
   for (yab=yabmin; yab<=yabmax; yab++) {
     for (xab=xabmin; xab<=xabmax; xab++) {
       int abcellidx = zab * acplanesz + yab * acncells.x + xab;
+      // this is the biggest latency hotspot in the kernel; if we could
+      // improve packing of the grid cell map, we'd likely improve performance
       uint2 atomstartend = cellStartEnd[abcellidx];
       if (atomstartend.x != GRID_CELL_EMPTY) {
         unsigned int atomid;
@@ -296,7 +301,7 @@
 
 __global__ static void
-// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
+__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
 gaussdensity_fast_tex3f(int natoms,
                         const float4 * RESTRICT sorted_xyzr,
                         const float4 * RESTRICT sorted_color,
@@ -363,6 +368,8 @@
   for (yab=yabmin; yab<=yabmax; yab++) {
     for (xab=xabmin; xab<=xabmax; xab++) {
       int abcellidx = zab * acplanesz + yab * acncells.x + xab;
+      // this is the biggest latency hotspot in the kernel; if we could
+      // improve packing of the grid cell map, we'd likely improve performance
       uint2 atomstartend = cellStartEnd[abcellidx];
       if (atomstartend.x != GRID_CELL_EMPTY) {
         unsigned int atomid;
@@ -550,7 +557,6 @@
 
 // per-GPU handle with various memory buffer pointers, etc.
 typedef struct {
-  /// max grid sizes and attributes the current allocations will support
   int verbose;
   long int natoms;
   int colorperatom;
@@ -561,18 +567,18 @@
   int gy;
   int gz;
 
-  CUDAMarchingCubes *mc;     ///< Marching cubes class used to extract surface
+  CUDAMarchingCubes *mc;
 
-  float *devdensity;         ///< density map stored in GPU memory
-  void *devvoltexmap;        ///< volumetric texture map
-  float4 *xyzr_d;            ///< atom coords and radii
-  float4 *sorted_xyzr_d;     ///< cell-sorted coords and radii
-  float4 *color_d;           ///< colors
-  float4 *sorted_color_d;    ///< cell-sorted colors
-
-  unsigned int *atomIndex_d; ///< cell index for each atom
-  unsigned int *atomHash_d;  ///<
-  uint2 *cellStartEnd_d;     ///< cell start/end indices
+  float *devdensity;
+  void *devvoltexmap;
+  float4 *xyzr_d;
+  float4 *sorted_xyzr_d;
+  float4 *color_d;
+  float4 *sorted_color_d;
+
+  unsigned int *atomIndex_d;
+  unsigned int *atomHash_d;
+  uint2 *cellStartEnd_d;
 
   void *safety;
   float3 *v3f_d;
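
For reference, a sketch (illustrative, not VMD code; the function name and
the gaussian form are simplified stand-ins) of the access pattern behind
the "latency hotspot" comments above: each probed grid cell costs one
dependent uint2 load before the inner atom loop can begin, which is why
repacking the cell map could help.

    // Illustrative sketch, not VMD code: spatial-hash traversal in the
    // style of the density kernels.  One uint2 per cell holds [start,end)
    // indices into a cell-sorted atom array; the dependent load of
    // cellStartEnd[cellidx] gates the entire inner loop.
    #define GRID_CELL_EMPTY 0xffffffffU

    __device__ float accumulate_cell(const uint2 *cellStartEnd,
                                     const float4 *sorted_xyzr,
                                     int cellidx, float3 p) {
      float density = 0.0f;
      uint2 se = cellStartEnd[cellidx];     // latency-bound dependent load
      if (se.x != GRID_CELL_EMPTY) {
        for (unsigned int a = se.x; a < se.y; a++) {
          float4 atom = sorted_xyzr[a];     // coords + per-atom scale factor
          float dx = p.x - atom.x, dy = p.y - atom.y, dz = p.z - atom.z;
          float r2 = dx*dx + dy*dy + dz*dz;
          density += __expf(-r2 * atom.w);  // gaussian-style contribution
        }
      }
      return density;
    }
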