ndarray.hpp
1 #ifndef __CUV_NDARRAY_HPP__
2 #define __CUV_NDARRAY_HPP__
3 
4 #include <boost/multi_array/extent_gen.hpp>
5 #include <boost/multi_array/index_gen.hpp>
6 #include <boost/shared_ptr.hpp>
7 #include <iostream>
8 #include <limits>
9 #include <numeric>
10 #include <stdexcept>
11 #include <vector>
12 
13 #include "allocators.hpp"
14 #include "memory.hpp"
15 #include "meta_programming.hpp"
16 
17 namespace cuv {
18 
22 static inline void cuvAssertFailed(const char *msg) {
23  throw std::runtime_error(std::string(msg));
24 }
25 
34 #define cuvAssert(X) \
35  if(!(X)){ cuv::cuvAssertFailed(#X); }
36 
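/**
 * Example (illustrative sketch): cuvAssert throws a std::runtime_error whose
 * message is the stringified expression, so it can be used in host code like:
 * @code
 * int n = 0;
 * cuvAssert(n == 0);      // passes, no effect
 * // cuvAssert(n == 1);   // would throw std::runtime_error("n == 1")
 * @endcode
 */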
37 using boost::detail::multi_array::extent_gen;
38 using boost::detail::multi_array::index_gen;
39 
49 typedef boost::detail::multi_array::index_range<boost::detail::multi_array::index, boost::detail::multi_array::size_type> index_range;
50 
54 typedef index_range::index index;
55 
56 #ifndef CUV_DONT_CREATE_EXTENTS_OBJ
57 
58 namespace {
69 extent_gen<0> extents;
70 
83 index_gen<0, 0> indices;
84 }
85 #endif
86 
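/**
 * Example (illustrative sketch): the global extents/indices objects are used in
 * Boost.MultiArray style to describe shapes and slices, e.g. with a float host ndarray:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[3][4]);  // 3x4 array
 * cuv::ndarray_view<float, cuv::host_memory_space> row1 =
 *         a[cuv::indices[1][cuv::index_range()]];                     // 1-D view of row 1
 * @endcode
 */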
92 template<class V, class M, class L> class ndarray;
93 template<class V, class M, class L> class ndarray_view;
94 
96 template<class V, class M, class L>
97 void fill(ndarray<V, M, L>& v, const V& p);
98 
99 namespace detail {
100 
107 template<class index_type, class size_type>
108 void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch,
109  const linear_memory<size_type, host_memory_space>& shape,
110  const linear_memory<index_type, host_memory_space>& stride, row_major) {
111  // strided dimension is the LAST one
112  rows = std::accumulate(shape[0].ptr, shape[0].ptr + shape.size() - 1, 1, std::multiplies<index_type>());
113  cols = shape[shape.size() - 1];
114  pitch = stride[shape.size() - 2];
115 }
116 
120 template<class index_type, class size_type>
121 void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch,
122  const linear_memory<size_type, host_memory_space>& shape,
123  const linear_memory<index_type, host_memory_space>& stride, column_major) {
124  // strided dimension is the FIRST one
125  rows = std::accumulate(shape[0].ptr + 1, shape[0].ptr + shape.size(), 1, std::multiplies<index_type>());
126  cols = shape[0];
127  pitch = stride[1];
128 }
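/*
 * Worked example (illustrative): for a row-major array of shape {2, 3, 4} with
 * strides {12, 4, 1}, all leading dimensions are folded into the rows, i.e.
 * rows = 2*3 = 6, cols = 4 and pitch = stride[ndim-2] = 4.
 */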
129 
130 }
131 
135 template<class M, class L>
136 class ndarray_info {
137 
138 public:
139 
140  typedef unsigned int size_type;
141  typedef int index_type;
142  typedef M data_memory_space;
143 
144  boost::shared_ptr<allocator> m_allocator;
145 
148  linear_memory<size_type, host_memory_space> host_shape;    // shape of the ndarray as seen on the host
151  linear_memory<index_type, host_memory_space> host_stride;  // strides of the ndarray as seen on the host
154  linear_memory<size_type, host_memory_space> data_shape;    // shape of the underlying data allocation
157  linear_memory<index_type, host_memory_space> data_stride;  // strides of the underlying data allocation
159  ndarray_info(const boost::shared_ptr<allocator>& _allocator) :
160  m_allocator(_allocator), host_shape(_allocator), host_stride(_allocator),
161  data_shape(_allocator), data_stride(_allocator)
162  {
163  }
164 
166  size_type size() const {   // number of dimensions stored in this info object
167  return host_shape.size();
168  }
169 
171  ndarray_info(size_type s, const boost::shared_ptr<allocator>& _allocator) :
172  m_allocator(_allocator), host_shape(_allocator), host_stride(_allocator),
173  data_shape(_allocator), data_stride(_allocator)
174  {
175  resize(s);
176  }
177 
179  void resize(size_type s) {
180  host_shape.set_size(s);
181  host_stride.set_size(s);
182  }
183 
185  ndarray_info(const ndarray_info& o) :
186  m_allocator(o.m_allocator), host_shape(o.host_shape), host_stride(o.host_stride),
187  data_shape(m_allocator), data_stride(m_allocator)
188  {
189  }
190 
192  template<class OM>
193  ndarray_info(const ndarray_info<OM, L>& o) :
194  m_allocator(o.m_allocator), host_shape(o.host_shape), host_stride(o.host_stride),
195  data_shape(m_allocator), data_stride(m_allocator)
196  {
197  }
198 
199 };
200 
204 template<class V, class M, class L = row_major>
205 class ndarray {
206 
207 public:
208 
209  typedef memory<V, M> memory_type;                 // type of the underlying memory container
210  typedef V value_type;                             // element type
211  typedef M memory_space_type;                      // host_memory_space or dev_memory_space
212  typedef unsigned int size_type;                   // type used for shapes and sizes
213  typedef int index_type;                           // type used for indices and strides
214  typedef typename memory_type::reference_type reference_type;              // element reference (assumed to come from memory.hpp)
215  typedef typename memory_type::const_reference_type const_reference_type;  // const element reference (assumed)
216  typedef L memory_layout_type;                     // row_major or column_major
217 
218  typedef ndarray_info<M, L> info_type;             // shape/stride bookkeeping
220 
221 public:
222  boost::shared_ptr<allocator> m_allocator;
223 
224 private:
225  void check_size_limit(size_t size) const {
226  if (size > static_cast<size_t>(std::numeric_limits<index_type>::max())) {
227  throw std::runtime_error("maximum ndarray size exceeded");
228  }
229  }
230 
232  template<class _V, class _M, class _L>
233  friend class ndarray_view;
234 
235 protected:
236 
238  info_type m_info;                                 // shape and stride information
239 
241  boost::shared_ptr<memory_type> m_memory;
242 
244  V* m_ptr;
245 
258  size_type index_of(int D, index_type* arr) const {
259  index_type pos = 0;
260  for (int i = 0; i < D; i++) {
261  index_type temp = arr[i];
262  if (temp < 0)
263  temp = m_info.host_shape[i] + temp;
264  pos += temp * m_info.host_stride[i];
265  }
266  return pos;
267  }
268 
274  void allocate(ndarray& t, linear_memory_tag) {     // allocate contiguous (linear) memory for t
275  linear_memory<V, M> mem(t.size(), t.m_allocator);
276  mem.set_strides(t.m_info.host_stride, t.m_info.host_shape, L());
277  t.m_ptr = mem.ptr();
278  t.m_memory.reset(new memory<V, M>(mem.release(), mem.size(), t.m_allocator));
279  }
280 
286  void allocate(ndarray& t, pitched_memory_tag) {    // allocate 2D-pitched memory for t
287  typename ndarray<V, M, L>::size_type row, col, pitch;
288  detail::get_pitched_params(row, col, pitch, t.m_info.host_shape, t.m_info.host_stride, L());
289  pitched_memory<V, M> d(row, col);
290  d.set_strides(t.m_info.host_stride, t.m_info.host_shape, L());
291  t.m_ptr = d.ptr();
292  t.m_memory.reset(new memory<V, M>(d.release(), d.size(), t.m_allocator));
293  }
294 
295 public:
296 
309  template<size_t D>
310  size_type index_of(const extent_gen<D>& eg) const {
311  index_type pos = 0;
312  for (size_t i = 0; i < D; i++) {
313  index_type temp = eg.ranges_[i].finish();
314  if (temp < 0)
315  temp = m_info.host_shape[i] + temp;
316  pos += temp * m_info.host_stride[i];
317  }
318  return pos;
319  }
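/**
 * Example (illustrative sketch): index_of() maps a multi-dimensional index
 * written with extents to a flat offset using the host strides; for a
 * row-major 3x4 array the strides are {4, 1}, so (2,1) maps to 2*4 + 1 = 9:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[3][4]);
 * unsigned int off = a.index_of(cuv::extents[2][1]);   // 9
 * @endcode
 */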
320 
325 
326  index_type ndim() const {
327  return m_info.host_shape.size();
328  }
329 
333  size_type shape(const size_t i) const {
334  return m_info.host_shape[i];
335  }
336 
340  index_type stride(const size_t i) const {
341  return m_info.host_stride[i];
342  }
343 
345  V* ptr() {
346  return m_ptr;
347  }
348 
353  const V* ptr() const {
354  return m_ptr;
355  }
356 
358  void set_ptr_offset(long int i) {
359  m_ptr = m_memory->ptr() + i;
360  }
361 
363  boost::shared_ptr<memory_type>& mem() {
364  return m_memory;
365  }
370  const boost::shared_ptr<memory_type>& mem() const {
371  return m_memory;
372  }
373 
376  size_type size() const {
377  size_t size = std::accumulate(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 1,
378  std::multiplies<size_t>());
379 
380  check_size_limit(size);
381 
382  return static_cast<size_type>(size);
383  }
384 
392  size_type memsize() const {
393 #ifndef NDEBUG
394  cuvAssert(is_c_contiguous());
395 #endif
396  size_t size = std::accumulate(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 1,
397  std::multiplies<size_type>());
398 
399  check_size_limit(size);
400 
401  return static_cast<size_type>(size);
402  }
403 
405  std::vector<size_type> shape() const {
406  if (ndim() == 0)
407  return std::vector<size_type>();
408  return std::vector<size_type>(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size());
409  }
410 
416  std::vector<size_type> effective_shape() const {
417  std::vector<size_type> shape;
418  shape.reserve(ndim());
419  if (ndim() == 0)
420  return shape;
421  std::remove_copy_if(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(),
422  std::back_inserter(shape), std::bind2nd(std::equal_to<size_type>(), 1));
423  return shape;
424  }
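/**
 * Example (illustrative sketch): shape() keeps singleton dimensions,
 * effective_shape() drops them:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[1][5][1]);
 * a.shape();             // {1, 5, 1}
 * a.effective_shape();   // {5}
 * @endcode
 */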
425 
427  const info_type& info() const {
428  return m_info;
429  }
430 
432  info_type& info() {
433  return m_info;
434  }
435 
437  bool is_c_contiguous() const {
439  }
440 
442  bool is_2dcopyable() const {
444  }
445  // accessors
455  reference_type operator[](index_type idx) {         // "flat" access as if the memory were linear
456  size_type ndim = this->ndim();
457  size_type* virtualstride = new size_type[ndim];
458  size_type pos = 0;
459  if (IsSame<L, row_major>::Result::value) {   // layout dispatch via meta_programming.hpp
460  // row major
461  {
462  size_type virt_size = 1;
463  for (int i = ndim - 1; i >= 0; --i) {
464  virtualstride[i] = virt_size;
465  virt_size *= m_info.host_shape[i];
466  }
467  }
468  for (size_type i = 0; i < ndim; ++i) {
469  pos += (idx / virtualstride[i]) * m_info.host_stride[i];
470  idx -= (idx / virtualstride[i]) * virtualstride[i];
471  }
472  } else {
473  // column major
474  {
475  size_type virt_size = 1;
476  for (unsigned int i = 0; i < ndim; ++i) {
477  virtualstride[i] = virt_size;
478  virt_size *= m_info.host_shape[i];
479  }
480  }
481  for (int i = ndim - 1; i >= 0; --i) {
482  pos += (idx / virtualstride[i]) * m_info.host_stride[i];
483  idx -= (idx / virtualstride[i]) * virtualstride[i];
484  }
485  }
486  delete[] virtualstride;
487  return reference_type(m_ptr + pos);
488  }
489 
491  const_reference_type operator[](index_type idx) const {
492  return const_cast<ndarray&>(*this)[idx];
493  }
494 
500  reference_type operator()(index_type i0) {
501 #ifndef NDEBUG
502  cuvAssert(ndim()==1);
503  cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)(-i0)<shape(0)+1))
504 #endif
505  if (i0 >= 0) {
506  return reference_type(m_ptr + i0);
507  } else {
508  return reference_type(m_ptr + shape(0) + i0);   // i0 < 0: count from the end
509  }
510  }
511 
513  const_reference_type operator()(index_type i0) const {
514  return const_cast<ndarray&>(*this)(i0);
515  }
516 
518  const_reference_type operator()(index_type i0, index_type i1) const {
519  return const_cast<ndarray&>(*this)(i0, i1);
520  }
521 
523  reference_type operator()(index_type i0, index_type i1) {
524 #ifndef NDEBUG
525  cuvAssert(ndim()==2);
526  cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)(-i0)<shape(0)+1))
527  cuvAssert((i1>=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)(-i1)<shape(1)+1))
528 #endif
529  index_type arr[2] = { i0, i1 };
530  return reference_type(m_ptr + index_of(2, arr));
531  }
532 
534  const_reference_type operator()(index_type i0, index_type i1, index_type i2) const {
535  return const_cast<ndarray&>(*this)(i0, i1, i2);
536  }
537 
539  reference_type operator()(index_type i0, index_type i1, index_type i2) {
540 #ifndef NDEBUG
541  cuvAssert(ndim()==3);
542  cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0<shape(0)+1))
543  cuvAssert((i1>=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1<shape(1)+1))
544  cuvAssert((i2>=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2<shape(2)+1))
545 #endif
546  index_type arr[3] = { i0, i1, i2 };
547  return reference_type(m_ptr + index_of(3, arr));
548  }
549 
551  const_reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3) const {
552  return const_cast<ndarray&>(*this)(i0, i1, i2, i3);
553  }
554 
556  reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3) {
557 #ifndef NDEBUG
558  cuvAssert(ndim()==4);
559  cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0<shape(0)+1))
560  cuvAssert((i1>=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1<shape(1)+1))
561  cuvAssert((i2>=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2<shape(2)+1))
562  cuvAssert((i3>=0 && (size_type)i3 < shape(3)) || (i3<0 && (size_type)-i3<shape(3)+1))
563 #endif
564  index_type arr[4] = { i0, i1, i2, i3 };
565  return reference_type(m_ptr + index_of(4, arr));
566  }
567 
569  const_reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3,
570  index_type i4) const {
571  return const_cast<ndarray&>(*this)(i0, i1, i2, i3, i4);
572  }
573 
575  reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3, index_type i4) {
576 #ifndef NDEBUG
577  cuvAssert(ndim()==5);
578  cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0<shape(0)+1))
579  cuvAssert((i1>=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1<shape(1)+1))
580  cuvAssert((i2>=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2<shape(2)+1))
581  cuvAssert((i3>=0 && (size_type)i3 < shape(3)) || (i3<0 && (size_type)-i3<shape(3)+1))
582  cuvAssert((i4>=0 && (size_type)i4 < shape(4)) || (i4<0 && (size_type)-i4<shape(4)+1))
583 #endif
584  index_type arr[5] = { i0, i1, i2, i3, i4 };
585  return reference_type(m_ptr + index_of(5, arr));
586  }
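/**
 * Example (illustrative sketch): element access with operator(), where negative
 * indices count backwards from the end of the corresponding dimension:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[3][4]);
 * a(0, 0) = 1.f;     // first element
 * a(2, 3) = 2.f;     // last element
 * a(-1, -1) = 3.f;   // same element as a(2, 3)
 * @endcode
 */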
587  // accessing stored values
596  ndarray(const boost::shared_ptr<allocator> _allocator = boost::make_shared<default_allocator>()) :
597  m_allocator(_allocator), m_info(_allocator), m_ptr(NULL) {
598  }
599 
600  // ****************************************************************
601  // Constructing from other ndarray
602  // ****************************************************************
603 
609  ndarray(const ndarray& o) :
610  m_allocator(o.m_allocator),
611  m_info(o.m_info), // copy only shape
612  m_memory(o.m_memory), // increase ref counter
613  m_ptr(o.m_ptr) {
614  } // same pointer in memory
615 
620  template<class OM>
621  ndarray(const ndarray<value_type, OM, L>& o, cudaStream_t stream = 0) :
622  m_allocator(o.m_allocator),
623  m_info(o.info()), // primarily to copy shape
624  m_ptr(NULL) {
625  copy_memory(o, linear_memory_tag(), stream);
626  m_ptr = m_memory->ptr();
627  }
628 
633  explicit ndarray(const ndarray& o, pitched_memory_tag, cudaStream_t stream = 0) :
634  m_allocator(o.m_allocator),
635  m_info(o.m_info), // primarily to copy shape
636  m_ptr(NULL) {
637  copy_memory(o, pitched_memory_tag(), stream);
638  m_ptr = m_memory->ptr();
639  }
640 
645  template<class OM>
646  explicit ndarray(const ndarray<value_type, OM, L>& o, pitched_memory_tag, cudaStream_t stream = 0) :
647  m_allocator(o.m_allocator),
648  m_info(o.info()), // primarily to copy shape
649  m_ptr(NULL) {
650  copy_memory(o, pitched_memory_tag(), stream);
651  m_ptr = m_memory->ptr();
652  }
653 
658  explicit ndarray(const ndarray& o, linear_memory_tag, cudaStream_t stream = 0) :
659  m_allocator(o.m_allocator),
660  m_info(o.m_info), // primarily to copy shape
661  m_ptr(NULL) {
662  copy_memory(o, linear_memory_tag(), stream);
663  m_ptr = m_memory->ptr();
664  }
665 
670  template<class OM>
671  explicit ndarray(const ndarray<value_type, OM, L>& o, linear_memory_tag, cudaStream_t stream = 0) :
672  m_allocator(o.m_allocator),
673  m_info(o.info()), // primarily to copy shape
674  m_ptr(NULL) {
675  copy_memory(o, linear_memory_tag(), stream);
676  m_ptr = m_memory->ptr();
677  }
678 
685  template<class OL>
686  explicit ndarray(const ndarray<value_type, M, OL>& o) :
687  m_allocator(o.m_allocator),
688  m_info(o.m_allocator),
689  m_memory(o.mem()), // increase ref counter
690  m_ptr(const_cast<V*>(o.ptr())) { // same pointer in memory
691  m_info.host_shape = o.info().host_shape;
692  m_info.host_shape.reverse();       // the other layout lists dimensions in reverse order
693  m_info.host_stride = o.info().host_stride;
694  m_info.host_stride.reverse();
695  }
696 
697  // ****************************************************************
698  // Constructing from SHAPE
699  // ****************************************************************
700 
704  explicit ndarray(const size_type i,
705  const boost::shared_ptr<allocator> _allocator = boost::make_shared<default_allocator>()) :
706  m_allocator(_allocator),
707  m_info(_allocator),
708  m_ptr(NULL) {
709  m_info.resize(1);
710  m_info.host_shape[0] = i;
711  allocate(*this, linear_memory_tag());
712  }
713 
717  explicit ndarray(const size_type i, const int j, const boost::shared_ptr<allocator> _allocator =
718  boost::make_shared<default_allocator>()) :
719  m_allocator(_allocator),
720  m_info(_allocator),
721  m_ptr(NULL) {
722  m_info.resize(2);
723  m_info.host_shape[0] = i;
724  m_info.host_shape[1] = j;
725  allocate(*this, linear_memory_tag());
726  }
727 
731  template<size_t D>
732  explicit ndarray(const extent_gen<D>& eg,
733  const boost::shared_ptr<allocator> _allocator = boost::make_shared<default_allocator>()) :
734  m_allocator(_allocator),
735  m_info(_allocator),
736  m_ptr(NULL) {
737  m_info.resize(D);
738  for (size_t i = 0; i < D; i++)
739  m_info.host_shape[i] = eg.ranges_[i].finish();
740  allocate(*this, linear_memory_tag());
741  }
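/**
 * Example (illustrative sketch): constructing from extents allocates contiguous
 * (linear) memory in the requested memory space:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> host_arr(cuv::extents[10][20]);
 * cuv::ndarray<float, cuv::dev_memory_space>  dev_arr(cuv::extents[10][20]);
 * @endcode
 */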
742 
748  explicit ndarray(const std::vector<size_type>& eg,
749  const boost::shared_ptr<allocator> _allocator = boost::make_shared<default_allocator>()) :
750  m_allocator(_allocator),
751  m_info(_allocator),
752  m_ptr(NULL) {
753  m_info.resize(eg.size());
754  for (size_t i = 0; i < eg.size(); i++)
755  m_info.host_shape[i] = eg[i];
756  allocate(*this, linear_memory_tag());
757  }
758 
764  explicit ndarray(const std::vector<size_type>& eg, pitched_memory_tag,
765  const boost::shared_ptr<allocator> _allocator = boost::make_shared<default_allocator>()) :
766  m_allocator(_allocator),
767  m_info(_allocator),
768  m_ptr(NULL) {
769  m_info.resize(eg.size());
770  for (size_t i = 0; i < eg.size(); i++)
771  m_info.host_shape[i] = eg[i];
772  allocate(*this, pitched_memory_tag());
773  }
774 
778  template<size_t D>
779  explicit ndarray(const extent_gen<D>& eg, pitched_memory_tag, const boost::shared_ptr<allocator> _allocator =
780  boost::make_shared<default_allocator>()) :
781  m_allocator(_allocator),
782  m_info(_allocator),
783  m_ptr(NULL) {
784  m_info.resize(D);
785  for (size_t i = 0; i < D; i++)
786  m_info.host_shape[i] = eg.ranges_[i].finish();
787  allocate(*this, pitched_memory_tag());
788  }
789 
790  // ****************************************************************
791  // Constructing from shape and raw pointer
792  // ****************************************************************
793 
799  template<size_t D>
800  explicit ndarray(const extent_gen<D>& eg, value_type* ptr, const boost::shared_ptr<allocator> _allocator =
801  boost::make_shared<default_allocator>()) :
802  m_allocator(_allocator),
803  m_info(_allocator),
804  m_ptr(ptr) {
805  m_info.resize(D);
806  size_t size = 1;
807  if (IsSame<L, row_major>::Result::value) {
808  for (int i = D - 1; i >= 0; i--) {
809  m_info.host_shape[i] = eg.ranges_[i].finish();
810  m_info.host_stride[i] = size;
811  size *= eg.ranges_[i].finish();
812  }
813  } else {
814  for (size_t i = 0; i < D; i++) {
815  m_info.host_shape[i] = eg.ranges_[i].finish();
816  m_info.host_stride[i] = size;
817  size *= eg.ranges_[i].finish();
818  }
819  }
820  m_memory.reset(new memory<V, M>(ptr, size, m_allocator, false));
821  }
822 
823  explicit ndarray(const std::vector<size_type>& shape, value_type* ptr,
824  const boost::shared_ptr<allocator> _allocator = boost::make_shared<default_allocator>()) :
825  m_allocator(_allocator),
826  m_info(_allocator),
827  m_ptr(ptr) {
828  unsigned int D = shape.size();
829  m_info.resize(D);
830  size_type size = 1;
831  if (IsSame<L, row_major>::Result::value)
832  for (int i = D - 1; i >= 0; i--) {
833  m_info.host_shape[i] = shape[i];
834  m_info.host_stride[i] = size;
835  size *= shape[i];
836  }
837  else
838  for (size_t i = 0; i < D; i++) {
839  m_info.host_shape[i] = shape[i];
840  m_info.host_stride[i] = size;
841  size *= shape[i];
842  }
843  }
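/**
 * Example (illustrative sketch): an ndarray can wrap memory it does not own,
 * e.g. the buffer of a std::vector; no copy is made, so the buffer must outlive
 * the ndarray:
 * @code
 * std::vector<float> buf(6, 0.f);
 * std::vector<unsigned int> shape;               // ndarray::size_type is unsigned int
 * shape.push_back(2); shape.push_back(3);
 * cuv::ndarray<float, cuv::host_memory_space> wrapped(shape, &buf[0]);
 * wrapped(1, 2) = 42.f;                          // writes buf[5]
 * @endcode
 */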
850  template<int D, int E>
851  explicit ndarray(const index_gen<D, E>& idx, value_type* ptr, const boost::shared_ptr<allocator> _allocator =
852  boost::make_shared<default_allocator>()) :
853  m_allocator(_allocator),
854  m_info(_allocator),
855  m_ptr(ptr) {
856  m_info.resize(D);
857  size_type size = 1;
858  if (IsSame<L, row_major>::Result::value)
859  for (int i = D - 1; i >= 0; i--) {
860  m_info.host_shape[i] = idx.ranges_[i].finish();
861  m_info.host_stride[i] = size;
862  size *= idx.ranges_[i].finish();
863  }
864  else
865  for (size_t i = 0; i < D; i++) {
866  m_info.host_shape[i] = idx.ranges_[i].finish();
867  m_info.host_stride[i] = size;
868  size *= idx.ranges_[i].finish();
869  }
870  }
871  // @} // constructors
872 
873  // ****************************************************************
874  // assignment operators (try not to reallocate if shapes match)
875  // ****************************************************************
876 
885  template<class _M, class _L>
886  ndarray& assign(const ndarray<V, _M, _L>& o, cudaStream_t stream = 0) {
887  if (!copy_memory(o, false, stream))
888  throw std::runtime_error("copying ndarray did not succeed. Maybe a shape mismatch?");
889  return *this;
890  }
891 
897  ndarray& operator=(const ndarray& o) {
898  if (this == &o)
899  return *this; // check for self-assignment
900 
901  // TODO make use of copy-and-swap idiom
902  m_memory = o.mem();
903  m_ptr = const_cast<V*>(o.ptr());
904  m_info = o.info();
905  return *this;
906  }
907 
911  template<class _V>
912  typename boost::enable_if_c<boost::is_convertible<_V, value_type>::value, ndarray&>::type operator=(
913  const _V& scalar) {
914  fill(*this, scalar);
915  return *this;
916  }
917 
925  template<class OM>
926  ndarray& assign(const ndarray<value_type, OM, L>& o, cudaStream_t stream = 0) {
927  if (!copy_memory(o, false, stream))
928  copy_memory(o, linear_memory_tag(), stream);
929  if (mem())
930  // if mem() does not exist, we're just wrapping a pointer
931  // of a std::vector or so -> simply keep it
932  m_ptr = mem()->ptr();
933  return *this;
934  }
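/**
 * Example (illustrative sketch): assignment across memory spaces copies the
 * data (host <-> device) and reuses the destination memory if the shapes match:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> h(cuv::extents[4][5]);
 * cuv::ndarray<float, cuv::dev_memory_space>  d(cuv::extents[4][5]);
 * d = h;   // host -> device copy
 * h = d;   // device -> host copy
 * @endcode
 */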
935 
943  template<class OM>
944  ndarray& operator=(const ndarray<value_type, OM, L>& o) {
945  return assign(o);
946  }
947 
953  template<class OL>
954  ndarray& operator=(const ndarray<value_type, M, OL>& o) {
955  return assign(o);
956  }
957  // assignment
962  template<class T>
963  ndarray copy(T tag = linear_memory_tag(), cudaStream_t stream = 0) const {
964  ndarray t(m_allocator);
965  const ndarray& o = *this;
966  t.m_info = o.info();
967  t.copy_memory(o, tag, stream);
968  t.m_ptr = t.mem()->ptr();
969  return t;
970  }
971 
975  ndarray copy() const {
976  return copy(linear_memory_tag());
977  }
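/**
 * Example (illustrative sketch): copy() creates a deep copy with its own
 * memory, whereas operator= within the same memory space only shares the
 * underlying buffer:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[2][3]);
 * cuv::ndarray<float, cuv::host_memory_space> shared = a;          // same memory
 * cuv::ndarray<float, cuv::host_memory_space> deep   = a.copy();   // new memory
 * @endcode
 */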
978 
984  template<int D, int E>
985  ndarray_view<V, M, L> operator[](const index_gen<D, E>& idx) const {
986 
987  ndarray_view<V, M, L> t(m_allocator);
988  const ndarray& o = *this;
989  t.m_memory = o.mem();
990  t.m_ptr = const_cast<V*>(o.ptr());
991 
992  std::vector<int> shapes;
993  std::vector<int> strides;
994  shapes.reserve(D);
995  strides.reserve(D);
996  cuvAssert(o.ndim()==D);
997 
998  for (size_t i = 0; i < D; i++) {
999  int start = idx.ranges_[i].get_start(0);
1000  int finish = idx.ranges_[i].get_finish(o.shape(i));
1001  int stride = idx.ranges_[i].stride();
1002  if (start < 0)
1003  start += o.shape(i);
1004  if (finish < 0)
1005  finish += o.shape(i);
1006 #ifndef NDEBUG
1007  cuvAssert(finish>start);
1008 #endif
1009  t.m_ptr += start * o.stride(i);
1010  if (idx.ranges_[i].is_degenerate()) {
1011  // skip dimension
1012  } else {
1013  shapes.push_back((finish - start) / stride);
1014  strides.push_back(o.stride(i) * stride);
1015  }
1016  }
1017 
1018  // store in m_info
1019  t.m_info.resize(shapes.size());
1020 
1021  std::copy(shapes.begin(), shapes.end(), t.m_info.host_shape[0].ptr);
1022  std::copy(strides.begin(), strides.end(), t.m_info.host_stride[0].ptr);
1023  return t; // should not copy mem, only m_info
1024  }
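/**
 * Example (illustrative sketch): operator[] with an index_gen selects a range
 * in every dimension and returns a view that shares memory with the original:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[4][6]);
 * cuv::ndarray_view<float, cuv::host_memory_space> block =
 *         a[cuv::indices[cuv::index_range(1, 3)][cuv::index_range(0, 4)]];
 * // block has shape {2, 4}; writing to block modifies a
 * @endcode
 */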
1025 
1033  template<size_t D>
1034  void reshape(const extent_gen<D>& eg) {
1035  std::vector<size_type> shape(D);
1036  for (size_t i = 0; i < D; i++)
1037  shape[i] = eg.ranges_[i].finish();
1038  reshape(shape);
1039  }
1047  void reshape(const std::vector<size_type>& shape) {
1048  size_type new_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_type>());
1049  if (!is_c_contiguous())
1050  throw std::runtime_error("cannot reshape: ndarray is not c_contiguous");
1051  if (size() != new_size)
1052  throw std::runtime_error("cannot reshape: products do not match");
1053  m_info.resize(shape.size());
1054  size_type size = 1;
1055  if (IsSame<L, row_major>::Result::value)
1056  for (int i = shape.size() - 1; i >= 0; i--) {
1057  m_info.host_shape[i] = shape[i];
1058  m_info.host_stride[i] = size;
1059  size *= shape[i];
1060  }
1061  else
1062  for (size_t i = 0; i < shape.size(); i++) {
1063  m_info.host_shape[i] = shape[i];
1064  m_info.host_stride[i] = size;
1065  size *= shape[i];
1066  }
1067  }
1073  void reshape(size_type r, size_type c) {            // convenience overload: reshape to r x c
1074  reshape(extents[r][c]);
1075  }
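/**
 * Example (illustrative sketch): reshape() changes the shape of a contiguous
 * ndarray without copying, as long as the element count stays the same:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[2][6]);
 * a.reshape(cuv::extents[3][4]);   // still 12 elements, no reallocation
 * a.reshape(4, 3);                 // two-argument convenience overload
 * @endcode
 */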
1076 
1082  void resize(const std::vector<size_type>& shape) {
1083  if (ndim() != 0) {
1084  size_type new_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_type>());
1085  if (is_c_contiguous() && size() == new_size) {
1086  reshape(shape);
1087  return;
1088  }
1089  }
1090 
1091  // free memory before we allocate new memory (important if pooling is active)
1092  m_memory.reset(new memory<V, M>(0, 0, m_allocator));
1093  *this = ndarray(shape, m_allocator);
1094  }
1102  template<size_t D>
1103  void resize(const extent_gen<D>& eg) {
1104  std::vector<size_type> shape(D);
1105  for (size_t i = 0; i < D; i++)
1106  shape[i] = eg.ranges_[i].finish();
1107  resize(shape);
1108  }
1109 
1114  void resize(size_type size) {
1115  resize(extents[size]);
1116  }
1117 
1123  void resize(size_type r, size_type c) {             // convenience overload: resize to r x c
1124  resize(extents[r][c]);
1125  }
1126 
1130  void dealloc() {
1131  m_memory.reset();
1132  m_ptr = NULL;
1133  m_info.resize(0);                                   // ndim() is 0 after dealloc()
1134  }
1135 
1137  template<class OM, class OL>
1138  bool copy_memory(const ndarray<V, OM, OL>& src, bool force_dst_contiguous, cudaStream_t stream) {
1139  if (effective_shape() != src.effective_shape() || !ptr()) {
1140  return false;
1141  }
1142 
1143  assert(m_memory.get());
1144  // ATTENTION: m_ptr might be different than m_memory->ptr()!
1145 
1146  // TODO: this could be probably implemented in the memory classes as well
1147 
1148  if (is_c_contiguous() && src.is_c_contiguous()) {
1149  // can copy w/o bothering about m_memory
1150  m_memory->copy_from(m_ptr, src.ptr(), src.size(), OM(), stream);
1151  } else if (is_c_contiguous() && src.is_2dcopyable()) {
1152  size_type row, col, pitch;
1153  detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL());
1154  m_memory->copy2d_from(m_ptr, src.ptr(), col, pitch, row, col, OM(), stream);
1155  } else if (!force_dst_contiguous && is_2dcopyable() && src.is_c_contiguous()) {
1156  size_type row, col, pitch;
1157  detail::get_pitched_params(row, col, pitch, info().host_shape, info().host_stride, L());
1158  m_memory->copy2d_from(m_ptr, src.ptr(), pitch, col, row, col, OM(), stream);
1159  } else if (!force_dst_contiguous && is_2dcopyable() && src.is_2dcopyable()) {
1160  size_type srow, scol, spitch;
1161  size_type drow, dcol, dpitch;
1162  detail::get_pitched_params(drow, dcol, dpitch, info().host_shape, info().host_stride, L());
1163  detail::get_pitched_params(srow, scol, spitch, src.info().host_shape, src.info().host_stride, OL());
1164  cuvAssert(scol==dcol);
1165  cuvAssert(srow==drow);
1166  m_memory->copy2d_from(m_ptr, src.ptr(), dpitch, spitch, srow, scol, OM(), stream);
1167  } else {
1168  throw std::runtime_error("copying of generic strides not implemented yet");
1169  }
1170 
1171  if (!IsSame<L, OL>::Result::value) {
1172  info().host_stride.reverse();
1173  info().host_shape.reverse();
1174  }
1175  return true;
1176  }
1177 
1179  template<class OM, class OL>
1180  void copy_memory(const ndarray<V, OM, OL>& src, linear_memory_tag, cudaStream_t stream) {
1181  if (copy_memory(src, true, stream)) // destination must be contiguous
1182  return;
1183  info().resize(src.ndim());
1184  info().host_shape = src.info().host_shape;
1185 
1186  // free old memory
1187  m_memory.reset(new memory<V, M>(m_allocator));
1188 
1189  linear_memory<V, M> d(src.size(), m_allocator);
1190  d.set_strides(info().host_stride, info().host_shape, L());
1191  if (src.is_c_contiguous()) {
1192  // easiest case: both linear, simply copy
1193  d.copy_from(src.ptr(), src.size(), OM(), stream);
1194  } else if (src.is_2dcopyable()) {
1195  // other memory is probably a pitched memory or some view onto an array
1196  size_type row, col, pitch;
1197  detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL());
1198  d.copy2d_from(src.ptr(), col, pitch, row, col, OM(), stream);
1199  } else {
1200  throw std::runtime_error("copying arbitrarily strided memory not implemented");
1201  }
1202  mem().reset(new memory<V, M>(d.release(), d.size(), m_allocator));
1203  if (!IsSame<L, OL>::Result::value) {
1204  info().host_stride.reverse();
1205  info().host_shape.reverse();
1206  }
1207  }
1208 
1210  template<class OM, class OL>
1211  void copy_memory(const ndarray<V, OM, OL>& src, pitched_memory_tag, cudaStream_t stream) {
1212  assert(src.ndim()>=2);
1213  if (copy_memory(src, false, stream)) // destination need not be contiguous
1214  return;
1215  info().resize(src.ndim());
1216  info().host_shape = src.info().host_shape;
1217  size_type row, col, pitch;
1218  detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL());
1219  pitched_memory<V, M> d(row, col);
1220  //dst.mem().reset(d);
1221  d.set_strides(info().host_stride, info().host_shape, L());   // d is an object, not a pointer
1222  if (src.is_2dcopyable()) {
1223  // other memory is probably a pitched memory or some view onto an array
1224  detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL());
1225  d.copy2d_from(src, stream);
1226  } else {
1227  throw std::runtime_error("copying arbitrarily strided memory not implemented");
1228  }
1229  mem().reset(new memory<V, M>(d.release(), d.size(), m_allocator));
1230 
1231  if (!IsSame<L, OL>::Result::value) {
1232  info().host_stride.reverse();
1233  info().host_shape.reverse();
1234  }
1235  }
1236 
1237 };
1238 
1242 template<class V, class M, class L = row_major>
1243 class ndarray_view: public ndarray<V, M, L>
1244 {
1245 private:
1246  typedef ndarray<V, M, L> super;
1247  using super::m_memory;
1248  using super::m_ptr;
1249  using super::m_info;
1250 
1251  template<class _V, class _M, class _L>
1252  friend class ndarray;
1253 
1254 public:
1255 
1257  ndarray_view(const boost::shared_ptr<allocator>& allocator) :
1258  ndarray<V, M, L>(allocator) {
1259  }
1260 
1264  ndarray_view& assign(const ndarray<V, M, L>& o, cudaStream_t stream = 0) {
1265  if (!this->copy_memory(o, false, stream))
1266  throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?");
1267  return *this;
1268  }
1269 
1273  ndarray_view& assign(const ndarray_view<V, M, L>& o, cudaStream_t stream = 0) {
1274  if (!this->copy_memory(o, false, stream))
1275  throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?");
1276  return *this;
1277  }
1278 
1284  template<class OM>
1285  ndarray_view& assign(const ndarray<V, OM, L>& o, cudaStream_t stream = 0) {
1286  if (!this->copy_memory(o, false, stream))
1287  throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?");
1288  return *this;
1289  }
1290 
1296  template<class OM>
1297  ndarray_view& assign(const ndarray_view<V, OM, L>& o, cudaStream_t stream = 0) {
1298  if (!this->copy_memory(o, false, stream))
1299  throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?");
1300  return *this;
1301  }
1302 
1306  ndarray_view& operator=(const ndarray<V, M, L>& o) {
1307  return assign(o);
1308  }
1309 
1313  ndarray_view& operator=(const ndarray_view<V, M, L>& o) {
1314  return assign(o);
1315  }
1316 
1322  template<class _V>
1323  typename boost::enable_if_c<boost::is_convertible<_V, V>::value, ndarray_view&>::type operator=(
1324  const _V& scalar) {
1325  super::operator=(scalar);
1326  return *this;
1327  }
1328 
1334  template<class OM>
1335  ndarray_view& operator=(const ndarray<V, OM, L>& o) {
1336  return assign(o);
1337  }
1338 
1344  template<class OM>
1345  ndarray_view& operator=(const ndarray_view<V, OM, L>& o) {
1346  return assign(o);
1347  }
1348 
1376  template<int D, int E>
1377  explicit ndarray_view(const ndarray<V, M, L>& o, const index_gen<D, E>& idx) :
1378  ndarray<V, M, L>(o.m_allocator)
1379  {
1380  m_memory = o.mem();
1381  m_ptr = const_cast<V*>(o.ptr());
1382  std::vector<int> shapes;
1383  std::vector<int> strides;
1384  shapes.reserve(D);
1385  strides.reserve(D);
1386  cuvAssert(o.ndim()==D);
1387  for (size_t i = 0; i < D; i++) {
1388  int start = idx.ranges_[i].get_start(0);
1389  int finish = idx.ranges_[i].get_finish(o.shape(i));
1390  int stride = idx.ranges_[i].stride();
1391  if (start < 0)
1392  start += o.shape(i);
1393  if (finish < 0)
1394  finish += o.shape(i);
1395 #ifndef NDEBUG
1396  cuvAssert(finish>start);
1397 #endif
1398  m_ptr += start * o.stride(i);
1399  if (idx.ranges_[i].is_degenerate()) {
1400  // skip dimension
1401  } else {
1402  shapes.push_back((finish - start) / stride);
1403  strides.push_back(o.stride(i) * stride);
1404  }
1405  }
1406  // store in m_info
1407  m_info.resize(shapes.size());
1408  std::copy(shapes.begin(), shapes.end(), m_info.host_shape[0].ptr);
1409  std::copy(strides.begin(), strides.end(), m_info.host_stride[0].ptr);
1410  }
1411 
1419  template<int D, int E>
1420  explicit ndarray_view(const index_gen<D, E>& idx, const ndarray<V, M, L>& o) :
1421  ndarray<V, M, L>(o.m_allocator)
1422  {
1423  m_memory = o.mem();
1424  m_ptr = const_cast<V*>(o.ptr());
1425  std::vector<int> shapes;
1426  std::vector<int> strides;
1427  shapes.reserve(D);
1428  strides.reserve(D);
1429  cuvAssert(o.ndim()==D);
1430  for (size_t i = 0; i < D; i++) {
1431  int start = idx.ranges_[i].get_start(0);
1432  int finish = idx.ranges_[i].get_finish(o.shape(i));
1433  int stride = idx.ranges_[i].stride();
1434  if (start < 0)
1435  start += o.shape(i);
1436  if (finish < 0)
1437  finish += o.shape(i);
1438 #ifndef NDEBUG
1439  cuvAssert(finish>start);
1440 #endif
1441  m_ptr += start * o.stride(i);
1442  if (idx.ranges_[i].is_degenerate()) {
1443  // skip dimension
1444  } else {
1445  shapes.push_back((finish - start) / stride);
1446  strides.push_back(o.stride(i) * stride);
1447  }
1448  }
1449  // store in m_info
1450  m_info.resize(shapes.size());
1451  std::copy(shapes.begin(), shapes.end(), m_info.host_shape[0].ptr);
1452  std::copy(strides.begin(), strides.end(), m_info.host_stride[0].ptr);
1453  }
1454 };
1455  // data_structures
1463 template<class V, class V2, class M, class M2, class L>
1464 bool equal_shape(const ndarray<V, M, L>& a, const ndarray<V2, M2, L>& b) {
1465  return a.effective_shape() == b.effective_shape();
1466 }
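/**
 * Example (illustrative sketch): equal_shape() compares effective shapes, so
 * singleton dimensions and differing memory spaces do not matter:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[5][1]);
 * cuv::ndarray<float, cuv::dev_memory_space>  b(cuv::extents[5]);
 * bool same = cuv::equal_shape(a, b);   // true, both have effective shape {5}
 * @endcode
 */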
1467 
1471 
1472 template<class Mat, class NewVT>
1473 struct switch_value_type {                          // same ndarray type, but with value type NewVT
1474  typedef ndarray<NewVT, typename Mat::memory_space_type, typename Mat::memory_layout_type> type;
1475 };
1477 template<class Mat, class NewML>
1478 struct switch_memory_layout_type {                  // same ndarray type, but with memory layout NewML
1479  typedef ndarray<typename Mat::value_type, typename Mat::memory_space_type, NewML> type;
1480 };
1482 template<class Mat, class NewMS>
1483 struct switch_memory_space_type {                   // same ndarray type, but with memory space NewMS
1484  typedef ndarray<typename Mat::value_type, NewMS, typename Mat::memory_layout_type> type;
1485 };
1486 
1489 }
1490 
1497 namespace std {
1498 
1504 template<class V>
1505 ostream& operator<<(ostream& o, const cuv::linear_memory<V, cuv::host_memory_space>& t) {
1506  o << "[ ";
1507  for (unsigned int i = 0; i < t.size(); i++)
1508  o << t[i] << " ";
1509  o << "]";
1510  return o;
1511 }
1512 
1518 template<class V>
1519 ostream& operator<<(ostream& o, const cuv::linear_memory<V, cuv::dev_memory_space>& t_) {
1520  cuv::linear_memory<V, cuv::host_memory_space> t = t_;   // copy device memory to host for printing
1521  o << "[ ";
1522  for (unsigned int i = 0; i < t.size(); i++)
1523  o << t[i] << " ";
1524  o << "]";
1525  return o;
1526 }
1527 
1533 template<class V>
1534 ostream& operator<<(ostream& o, const cuv::pitched_memory<V, cuv::host_memory_space>& t) {
1535  o << "[ ";
1536  for (unsigned int i = 0; i < t.rows(); i++) {
1537  for (unsigned int j = 0; j < t.cols(); j++) {   // iterate over columns, not rows
1538  o << t(i, j) << " ";
1539  }
1540  if (i < t.rows() - 1)
1541  o << std::endl;
1542  }
1543  o << "]";
1544  return o;
1545 }
1546 
1552 template<class V>
1553 ostream& operator<<(ostream& o, const cuv::pitched_memory<V, cuv::dev_memory_space>& t_) {
1554  cuv::pitched_memory<V, cuv::host_memory_space> t = t_;  // copy device memory to host for printing
1555  o << "[ ";
1556  for (unsigned int i = 0; i < t.rows(); i++) {
1557  for (unsigned int j = 0; j < t.cols(); j++) {   // iterate over columns, not rows
1558  o << t(i, j) << " ";
1559  }
1560  if (i < t.rows() - 1)
1561  o << std::endl;
1562  }
1563  o << "]";
1564  return o;
1565 }
1566 
1573 template<class V, class L>
1574 ostream& operator<<(ostream& o, const cuv::ndarray<V, cuv::dev_memory_space, L>& t) {
1575  return o << cuv::ndarray<V, cuv::host_memory_space, L>(t);
1576 }
1577 
1584 template<class V, class L>
1585 ostream& operator<<(ostream& o, const cuv::ndarray<V, cuv::host_memory_space, L>& t) {
1586  if (t.ndim() == 0)
1587  return o << "[]";
1588 
1589  if (t.ndim() == 1) {
1590  o << "[ ";
1591  for (unsigned int i = 0; i < t.shape(0); i++)
1592  o << t[i] << " ";
1593  return o << "]";
1594  }
1595  if (t.ndim() == 2) {
1596  o << "[";
1597  for (unsigned int i = 0; i < t.shape(0); ++i) {
1598  if (i > 0)
1599  o << " ";
1600  o << "[ ";
1601  for (unsigned int j = 0; j < t.shape(1); j++)
1602  o << t(i, j) << " ";
1603  o << "]";
1604  if (i != t.shape(0) - 1)
1605  o << std::endl;
1606  }
1607  return o << "]";
1608  }
1609  if (t.ndim() == 3) {
1610  o << "[" << std::endl;
1611  for (unsigned int l = 0; l < t.shape(0); l++) {
1612  o << "[";
1613  for (unsigned int i = 0; i < t.shape(1); ++i) {
1614  if (i > 0)
1615  o << " ";
1616  o << "[ ";
1617  //for(unsigned int j=0;j<t.shape(2);j++) o<< t(l,i,j)<<" ";
1618  for (unsigned int j = 0; j < t.shape(2); j++)
1619  o << t[l * t.shape(1) * t.shape(2) + i * t.shape(2) + j] << " ";
1620  o << "]";
1621  if (i != t.shape(1) - 1)
1622  o << std::endl;
1623  }
1624  o << "]";
1625  if (l < t.shape(0) - 1)
1626  o << std::endl;
1627  }
1628  return o << "]";
1629  }
1630  throw std::runtime_error("printing of ndarrays with >3 dimensions not implemented");
1631 }
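/**
 * Example (illustrative sketch): host and device ndarrays can be streamed
 * directly; device arrays are copied to the host before printing:
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> h(cuv::extents[2][3]);
 * for (unsigned int i = 0; i < h.size(); i++)
 *     h[i] = static_cast<float>(i);
 * std::cout << h << std::endl;                      // prints the 2x3 array
 * cuv::ndarray<float, cuv::dev_memory_space> d(h);  // copy to the device
 * std::cout << d << std::endl;                      // copied back to host for printing
 * @endcode
 */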
1632 } // namespace std
1633 
1634 #endif