TensorDeviceCuda.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

static const int kCudaScratchSize = 1024;

// This defines an interface that GpuDevice can take to use
// CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kCudaScratchSize bytes.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each
  // kernel using it is responsible for resetting it to 0 upon completion,
  // maintaining the invariant that the semaphore always equals 0 when a
  // kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
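
// Illustrative device-side sketch of the semaphore protocol described
// above (not part of this header; `my_kernel` is hypothetical). atomicInc
// wraps back to 0 once every block has checked in, which restores the
// "semaphore == 0" invariant for the next launch:
//
//   __global__ void my_kernel(unsigned int* semaphore, float* out) {
//     // ... per-block work writing partial results to out ...
//     if (threadIdx.x == 0) {
//       // Returns the previous value; stores 0 when it reaches gridDim.x - 1.
//       unsigned int ticket = atomicInc(semaphore, gridDim.x - 1);
//       if (ticket == gridDim.x - 1) {
//         // We're the last block to finish; the semaphore is back to 0 and
//         // a final combination step could safely run here.
//       }
//     }
//   }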

static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    // Attempts to ensure proper behavior in the case of multiple threads
    // calling this function simultaneously. This would be trivial to
    // implement if we could use std::mutex, but unfortunately mutexes don't
    // compile with nvcc, so we resort to atomics and thread fences instead.
    // Note that if the caller uses a compiler that doesn't support C++11 we
    // can't ensure that the initialization is thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to reach this point.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
        std::cerr << "Failed to get the number of CUDA devices: "
                  << cudaGetErrorString(status)
                  << std::endl;
        assert(status == cudaSuccess);
      }
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        if (status != cudaSuccess) {
          std::cerr << "Failed to initialize CUDA device #"
                    << i
                    << ": "
                    << cudaGetErrorString(status)
                    << std::endl;
          assert(status == cudaSuccess);
        }
      }

#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Wait for the other thread to initialize the properties.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
#endif
        sleep(1);
      }
    }
  }
}
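
// For reference, this is the shape the initialization would take with
// std::call_once if it were usable under nvcc (hypothetical sketch only,
// not used by this header):
//
//   #include <mutex>
//
//   static std::once_flag device_prop_flag;
//   static void initializeDevicePropOnce() {
//     std::call_once(device_prop_flag, [] {
//       int num_devices;
//       cudaGetDeviceCount(&num_devices);
//       m_deviceProperties = new cudaDeviceProp[num_devices];
//       for (int i = 0; i < num_devices; ++i) {
//         cudaGetDeviceProperties(&m_deviceProperties[i], i);
//       }
//     });
//   }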

static const cudaStream_t default_stream = cudaStreamDefault;

class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. Note that it's the caller's responsibility
  // to ensure that the stream can run on the specified device. If no
  // device is specified, the code assumes that the stream is associated
  // with the current GPU device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      // Allocate room for the scratchpad plus the trailing semaphore word.
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      // The semaphore lives in the word just past the scratchpad buffer.
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }

 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
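
// Typical ways to construct a CudaStreamDevice (illustrative sketch; note
// that the class stores only a pointer, so a user-provided stream must
// outlive the CudaStreamDevice that wraps it):
//
//   Eigen::CudaStreamDevice dev0;          // default stream, current device
//   Eigen::CudaStreamDevice dev1(1);       // default stream on device 1
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::CudaStreamDevice dev2(&stream, 0);  // user-managed stream on device 0
//   // ... use dev2 ...
//   cudaStreamDestroy(stream);             // only after dev2 is done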

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err)
                << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the CUDA runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
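
// Putting the pieces together: a host-side round trip through GpuDevice
// (illustrative sketch; the buffer size is arbitrary):
//
//   Eigen::CudaStreamDevice stream_device;
//   Eigen::GpuDevice device(&stream_device);
//
//   const size_t bytes = 1024 * sizeof(float);
//   std::vector<float> host(1024, 1.0f);
//   float* gpu_buf = static_cast<float*>(device.allocate(bytes));
//   device.memcpyHostToDevice(gpu_buf, host.data(), bytes);
//   // ... launch kernels on device.stream() ...
//   device.memcpyDeviceToHost(host.data(), gpu_buf, bytes);
//   device.synchronize();  // the copies above are asynchronous
//   device.deallocate(gpu_buf);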

#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
  assert(cudaGetLastError() == cudaSuccess);

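// Example launch through the macro (sketch; `scale_kernel` is a
// hypothetical __global__ function, not part of Eigen):
//
//   __global__ void scale_kernel(float* data, int n, float factor) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) data[i] *= factor;
//   }
//
//   // 32 blocks of 256 threads, no dynamic shared memory, on device's stream:
//   LAUNCH_CUDA_KERNEL(scale_kernel, 32, 256, 0, device, gpu_buf, 1024, 2.0f);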

// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
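
// Example (sketch): request eight-byte shared-memory banks before running
// kernels that mostly access doubles:
//
//   Eigen::setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);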

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H