TensorReductionCuda.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple CUDA threads to safely accumulate at the same
// output address. It reads the current value of the output variable, combines it with
// the new value, and attempts to write the result back. If another CUDA thread updated
// the content of the output address in the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4)
  {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
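// Note: the kernels below call atomicReduce once per warp: partial results are first
// combined within a warp with __shfl_down, and only lane 0 of each warp touches the
// output address, which keeps contention on the CAS loop low.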

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
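// CUDA provides no atomicExch overload for double, so the specialization above exchanges
// the value through its 64-bit integer bit pattern using __double_as_longlong and
// __longlong_as_double.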

#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#endif

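// For float sums the generic CAS loop is unnecessary: 32-bit floating point atomicAdd is
// available in hardware on all supported architectures, so use it directly.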
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}


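// Fills the output buffer with the reducer's identity value using a grid-stride loop.
// The launchers below run this kernel before a multi-block reduction so that every
// block can safely accumulate into an already initialized output.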
template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}


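// Reduces all the coefficients of the input to a single value. Each block handles
// BlockSize * NumPerThread consecutive inputs: every thread accumulates NumPerThread of
// them (strided by BlockSize), the warp-level partials are folded into lane 0 with
// __shfl_down, and lane 0 merges its result into *output with atomicReduce. When more
// than one block is launched, the semaphore lets the first block initialize *output
// while the other blocks spin until that value is visible; the last block to finish
// resets the semaphore to 0 for the next launch.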
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        }
        while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i+=BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}


#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  if (num_coeffs % 2 != 0) {
    half last = input.m_impl.coeff(num_coeffs-1);
    *scratch = __halves2half2(last, reducer.initialize());
  } else {
    *scratch = reducer.template initializePacket<half2>();
  }
}

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index num_packets = num_coeffs / 2;
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    ((half2*)output)[i] = reducer.template initializePacket<half2>();
  }

  if (thread_id == 0 && num_coeffs % 2 != 0) {
    output[num_coeffs-1] = reducer.initialize();
  }
}

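// fp16 variant of the full reduction. Coefficients are consumed two at a time as half2
// packets and accumulated into the half2 scratch value; when num_coeffs is odd, the
// trailing element is folded into the scratch initializer (see
// ReductionInitFullReduxKernelHalfFloat above), so the main loop can assume an even
// count. The two halves of the scratch value are combined into the final half output
// either here (single-block launch) or by ReductionCleanupKernelHalfFloat below.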
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                    half* output, half2* scratch) {
  eigen_assert(NumPerThread % 2 == 0);

  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1 && first_index == 0) {
    if (num_coeffs % 2 != 0) {
      half last = input.m_impl.coeff(num_coeffs-1);
      *scratch = __halves2half2(last, reducer.initialize());
    } else {
      *scratch = reducer.template initializePacket<half2>();
    }
    __syncthreads();
  }

  half2 accum = reducer.template initializePacket<half2>();
  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + 2*i;
    eigen_assert(index + 1 < num_coeffs);
    half2 val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();

  if (gridDim.x == 1 && first_index == 0) {
    half tmp = __low2half(*scratch);
    reducer.reduce(__high2half(*scratch), &tmp);
    *output = tmp;
  }
}

template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  eigen_assert(threadIdx.x == 0);
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}

#endif

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on doubles, floats and half floats");
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename Self::CoeffReturnType Scalar;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we
      // can't be sure that there won't be race conditions between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif


template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia GPUs don't handle exotic types such as complex well,
  // so restrict the optimized code path to the simple cases of doubles, floats
  // and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};

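// Illustrative use (not part of this header): a full reduction is typically expressed
// through the Tensor API, which dispatches to FullReducer::run() above for float, double
// and half outputs, e.g.
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in_ptr, 1000, 1000);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(gpu_out_ptr);
//   out.device(gpu_device) = in.sum();
// where gpu_in_ptr and gpu_out_ptr are device pointers (the names are illustrative).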

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                         typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

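  // Each output coefficient ("row") is produced by reducing num_coeffs_to_reduce
  // contiguous inputs. Blocks grid-stride over (row, column-block) pairs; within a pair
  // each thread accumulates NumPerThread values, the warp partials are folded together
  // with __shfl_down, and lane 0 of each warp merges its result into output[row].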
  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

#ifdef EIGEN_HAS_CUDA_FP16

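// fp16 variant of the inner reduction. It processes two adjacent rows at a time so that
// the two partial results can be kept in a single half2 and written back with one half2
// atomicReduce, which is why the launcher below falls back to the generic code path when
// num_preserved_vals is odd.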
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = 2*thread_id;
    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
      half* loc = output + i;
      *((half2*)loc) = reducer.template initializePacket<half2>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      half2 reduced_val1 = reducer.template initializePacket<half2>();
      half2 reduced_val2 = reducer.template initializePacket<half2>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            // Peel the last, odd column of each of the two rows.
            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            const half2 val1 = __halves2half2(last1, reducer.initialize());
            reducer.reducePacket(val1, &reduced_val1);
            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
            const half2 val2 = __halves2half2(last2, reducer.initialize());
            reducer.reducePacket(val2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * 2;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
      }

      half val1 = __low2half(reduced_val1);
      reducer.reduce(__high2half(reduced_val1), &val1);
      half val2 = __low2half(reduced_val2);
      reducer.reduce(__high2half(reduced_val2), &val2);
      half2 val = __halves2half2(val1, val2);

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif

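// The launchers below return false when the reduction was performed on the device, and
// true when the caller should fall back to the generic reduction code path (unsupported
// output type, missing packet accessor, or an unsupported problem shape).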
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
  Self, Op, OutputType, PacketAccess,
  typename internal::enable_if<
    internal::is_same<float, OutputType>::value ||
    internal::is_same<double, OutputType>::value,
  void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = /*256*/128;
    const int num_per_thread = /*128*/64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif


template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia GPUs don't handle exotic types such as complex well,
  // so restrict the optimized code path to the simple cases of floats, doubles
  // and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};

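// Reduces along the outer (non-contiguous) dimension. Threads grid-stride over
// (preserved column, row chunk) pairs, so consecutive threads read consecutive preserved
// coefficients and the loads stay coalesced; each thread reduces up to NumPerThread rows
// for its column and merges the partial result into output[input_col] with atomicReduce.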
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}


template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia GPUs don't handle exotic types such as complex well,
  // so restrict the optimized code path to the simple case of floats.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be race conditions between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif


} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H