Eigen-unsupported 3.3.3
TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  SYCL-device specialisations of the tensor reduction evaluators:
 *  a generic buffer reducer plus FullReducer and InnerReducer.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
  do {
    auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
      cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                              cl::sycl::range<1>{std::min(length, local)}};
      /* Two accessors are used: one to the buffer that is being reduced,
       * and a second to local memory, used to store intermediate data. */
      auto aI =
          bufI.template get_access<cl::sycl::access::mode::read_write>(h);
      auto aOut =
          bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
      cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                         cl::sycl::access::target::local>
          scratch(cl::sycl::range<1>(local), h);

      /* The parallel_for invocation chosen is the variant with an nd_item
       * parameter, since the code requires barriers for correctness. */
      h.parallel_for<KernelName>(
          r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
            size_t globalid = id.get_global(0);
            size_t localid = id.get_local(0);
            /* All threads collectively read from global memory into local.
             * The barrier ensures all threads' IO is resolved before
             * execution continues (strictly speaking, all threads within
             * a single work-group - there is no coordination between
             * work-groups, only work-items). */
            if (globalid < length) {
              scratch[localid] = aI[globalid];
            }
            id.barrier(cl::sycl::access::fence_space::local_space);

            /* Apply the reduction operation between the current local
             * id and the one on the other half of the vector. */
            if (globalid < length) {
              int min = (length < local) ? length : local;
              for (size_t offset = min / 2; offset > 0; offset /= 2) {
                if (localid < offset) {
                  scratch[localid] += scratch[localid + offset];
                }
                id.barrier(cl::sycl::access::fence_space::local_space);
              }
              /* The final result will be stored in local id 0. */
              if (localid == 0) {
                aI[id.get_group(0)] = scratch[localid];
                if ((length <= local) && globalid == 0) {
                  aOut[globalid] = scratch[localid];
                }
              }
            }
          });
    };
    dev.m_queue.submit(f);
    dev.m_queue.throw_asynchronous();

    /* At this point, you could queue::wait_and_throw() to ensure that
     * errors are caught quickly. However, this would likely impact
     * performance negatively. */
    length = length / local;

  } while (length > 1);
}
};
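The scratch-memory loop above is a classic two-level tree reduction, but it is easy to lose inside the SYCL boilerplate. Below is a standalone host-side sketch of the same algorithm in plain C++ (no SYCL; reduceRound and the sample data are illustrative names, not part of Eigen): each work-group loads its slice into scratch storage, repeatedly folds the upper half onto the lower half, and emits one partial sum per group; the outer while loop mirrors the do/while over length = length / local in run().

#include <cstddef>
#include <iostream>
#include <vector>

// One host-side round of the kernel above: every work-group of size `local`
// reduces its slice of `data` into a single partial sum, like the
// scratch-memory loop in syclGenericBufferReducer. Assumes `local` is a
// power of two, so that padding the last group with zeros keeps the sum exact.
std::vector<float> reduceRound(const std::vector<float>& data, std::size_t local) {
  std::size_t groups = (data.size() + local - 1) / local;
  std::vector<float> partials(groups, 0.0f);
  for (std::size_t g = 0; g < groups; ++g) {
    // "scratch" plays the role of the local-memory accessor.
    std::vector<float> scratch(local, 0.0f);
    for (std::size_t l = 0; l < local && g * local + l < data.size(); ++l)
      scratch[l] = data[g * local + l];
    // Tree reduction: halve the active range until one element remains.
    for (std::size_t offset = local / 2; offset > 0; offset /= 2)
      for (std::size_t l = 0; l < offset; ++l)
        scratch[l] += scratch[l + offset];
    partials[g] = scratch[0];  // local id 0 holds the group's result
  }
  return partials;
}

int main() {
  std::vector<float> v(1000, 1.0f);
  // Matching the do/while in run(): repeat rounds until one value remains.
  while (v.size() > 1) v = reduceRound(v, 8);
  std::cout << v[0] << "\n";  // prints 1000
}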
// Full reducer specialisation for the SYCL device. In expression
// construction the reduction is treated as a leafNode: the reduction
// operation is applied to the child expression and the result is written
// directly into the output buffer.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr;
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256;
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // initial work-item count: each one pre-reduces red_factor consecutive elements
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max((size_t)1, rng);

    // round the global range up to the next power of two for the tree reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
    size_t outTileSize = tileSize;
    if (GRange < outTileSize) outTileSize = GRange;
    // get the final output buffer; the kernel writes the result into it
    // directly, so no separate assignment expression is needed
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    // temporary global buffer holding one partial result per work-item
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        // rebuild the expression tree on the device from the extracted
        // functors and accessors, then evaluate the reduction expression
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        // fold in the remainder of the input when its size is not divisible by red_factor
        if (remaining != 0 && globalid == 0)
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    // second phase: reduce the per-work-item partials down to a single value
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }
};
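The bit-smearing block in FullReducer::run rounds the launch range up to the next power of two so that the halving loop in syclGenericBufferReducer always terminates with a single element. A minimal sketch of that trick in isolation (nextPowerOfTwo is an illustrative name, not Eigen's):

#include <cstddef>
#include <iostream>

// Round n up to the next power of two by smearing the highest set bit into
// every lower position, then adding one. n == 0 yields 1, matching
// GRange = std::max((size_t)1, rng) above.
std::size_t nextPowerOfTwo(std::size_t n) {
  if (n == 0) return 1;
  n--;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
  n |= n >> 32;  // only needed when size_t is 64 bits wide
#endif
  return n + 1;
}

int main() {
  for (std::size_t n : {1u, 5u, 64u, 1000u})
    std::cout << n << " -> " << nextPowerOfTwo(n) << "\n";
  // 1 -> 1, 5 -> 8, 64 -> 64, 1000 -> 1024
}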
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr;
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // round GRange up to the next multiple of tileSize; surplus work-items
      // are masked out inside the kernel
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
    // the output buffer is written to directly, so no assignment expression is needed
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        // rebuild the expression tree on the device, as in FullReducer
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        auto globalid = itemID.get_global_linear_id();
        // each work-item accumulates every coefficient that collapses onto
        // one preserved output coefficient
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};
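InnerReducer pads its global range up to a multiple of the work-group size and relies on the globalid < num_coeffs_to_preserve guard to mask the surplus work-items. The launch geometry can be sketched on its own as follows (launchGeometry is an illustrative name, not Eigen's):

#include <cstddef>
#include <iostream>

// Mirror of the launch-geometry logic in InnerReducer::run: shrink the
// work-group to the problem size when the problem is tiny, otherwise pad
// the global range up to the next multiple of the work-group size.
void launchGeometry(std::size_t preserved, std::size_t maxWorkGroup) {
  std::size_t tileSize = maxWorkGroup / 2;  // as in the code above
  std::size_t GRange = preserved;
  if (tileSize > GRange) tileSize = GRange;
  else {
    std::size_t xMode = GRange % tileSize;
    if (xMode != 0) GRange += (tileSize - xMode);
  }
  std::cout << "preserved=" << preserved << "  GRange=" << GRange
            << "  tileSize=" << tileSize
            << "  masked work-items=" << (GRange - preserved) << "\n";
}

int main() {
  launchGeometry(10, 256);    // tiny problem: tileSize shrinks to 10
  launchGeometry(1000, 256);  // padded: GRange becomes 1024, 24 items masked
}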
} // end namespace internal
} // end namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
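For context, a usage sketch of how these specialisations are reached from user code. It follows the style of the Eigen 3.3.x SYCL tests; the SyclDevice construction and the use of plain host tensors are assumptions and may not match every 3.3.x snapshot.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Device setup in the style of the Eigen 3.3.x SYCL tests (assumed API;
  // later versions construct SyclDevice from a QueueInterface instead).
  cl::sycl::gpu_selector selector;
  Eigen::SyclDevice sycl_device(selector);

  Eigen::Tensor<float, 2> in(452, 765);
  in.setRandom();

  Eigen::Tensor<float, 0> full_redux;          // scalar result
  Eigen::Tensor<float, 1> partial_redux(765);  // one value per column

  // Full reduction: evaluated by FullReducer<..., const Eigen::SyclDevice, ...>.
  full_redux.device(sycl_device) = in.sum();

  // Reduction over dimension 0 only: evaluated by InnerReducer.
  Eigen::array<int, 1> red_dim{{0}};
  partial_redux.device(sycl_device) = in.sum(red_dim);
  return 0;
}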