TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This file contains the SYCL-specific specialisations of the tensor
 *  reduction (FullReducer and InnerReducer).
 *
*****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

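// Generic tree reduction over a SYCL buffer. Each pass launches one kernel:
// every work-group copies its chunk of the input buffer into local memory,
// repeatedly halves the active range with barrier-synchronised pairwise
// additions, and writes its partial result back into the input buffer at the
// group index. Passes repeat until a single value remains, which is written to
// the output buffer. Note that the combine step is a plain +=, so this helper
// assumes an additive accumulation, and KernelName must be a distinct type per
// instantiation because it is used as the SYCL kernel name.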
template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      /* Two accessors are used: one to the buffer that is being reduced,
       * and a second to local memory, used to store intermediate data. */
      auto aI =
          bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut =
          bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      /* The parallel_for invocation chosen is the variant with an nd_item
       * parameter, since the code requires barriers for correctness. */
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            /* All threads collectively read from global memory into local.
             * The barrier ensures all threads' IO is resolved before
             * execution continues (strictly speaking, all threads within
             * a single work-group - there is no co-ordination between
             * work-groups, only work-items). */
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);

            /* Apply the reduction operation between the current local
             * id and the one on the other half of the vector. */
            if (globalid < length) {
              int min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              /* The final result will be stored in local id 0. */
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    /* At this point, you could queue::wait_and_throw() to ensure that
     * errors are caught quickly. However, this would likely impact
     * performance negatively. */
    length = length / local;

  } while (length > 1);
}

};

// Full reducer: during expression construction the reduction is treated as a
// leaf node, so the reducer is applied to the child of the reduction
// expression; once that is done the reduction node itself acts as a leafNode.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr;
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256;
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // number of full red_factor-sized chunks in the input
    size_t remaining = inputSize % red_factor;
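    // When the input holds fewer than red_factor coefficients, rng is zero:
    // reset red_factor to 1 and let the "remaining" branch of the kernel below
    // fold the whole input in from offset red_factor * rng == 0.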
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to the next power of two for the tree reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
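    // The decrement / bit-smear / increment sequence above rounds GRange up to
    // the next power of two (e.g. 300 -> 512, 1 -> 1); the extra 32-bit shift
    // is only required on targets where size_t is 64 bits wide.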
    size_t outTileSize = tileSize;
    if (GRange < outTileSize) outTileSize = GRange;
    // Get a SYCL buffer over the final output; the reduction writes into it
    // directly, so no separate assignment expression is needed.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
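    // Two-stage reduction: the kernel below (stage 1) lets each work-item reduce
    // a contiguous block of red_factor coefficients of the evaluated expression
    // into one slot of temp_global_buffer, with work-item 0 also folding in any
    // remainder; syclGenericBufferReducer (stage 2) then collapses
    // temp_global_buffer into the single output value.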
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from Evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        // Fold in the remainder when the input size is not divisible by red_factor.
        if (remaining != 0 && globalid == 0)
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};

template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr;
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
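    // GRange is rounded up to a multiple of tileSize so that the nd_range below
    // is valid; surplus work-items are masked out inside the kernel by the
    // globalid < num_coeffs_to_preserve check.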
    // The output accessor is created inside the command group below; the kernel
    // writes each preserved coefficient into it directly, so no separate assign
    // expression is needed.
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

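    // One work-item is launched per preserved output coefficient: each work-item
    // rebuilds the expression on the device and runs GenericDimReducer over the
    // reduced dimensions belonging to that coefficient, writing the result
    // directly through the output accessor.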
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from Evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
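
// A minimal host-side sketch of how these specialisations are reached, assuming
// an already constructed Eigen::SyclDevice named sycl_device and device-visible
// float buffers in_data / out_data (all names here are illustrative, not part
// of this file):
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(in_data, 100, 100);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> full_out(out_data);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> partial_out(out_data, 100);
//
//   // Full reduction to a single scalar: handled by FullReducer above.
//   full_out.device(sycl_device) = in.sum();
//
//   // Reduction over one dimension; whether InnerReducer is used depends on
//   // the layout and on which dimension is reduced.
//   Eigen::array<int, 1> red_dim{{0}};
//   partial_out.device(sycl_device) = in.sum(red_dim);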

}  // end namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP