diff options
author | Soumith Chintala <soumith@gmail.com> | 2016-11-17 00:55:01 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-11-17 00:55:01 +0300 |
commit | 6d6fe2061c79ada02ffd15e5314472d65640f498 (patch) | |
tree | dda7e84b1f32f9f2818573e66164028535731950 | |
parent | 7ed4888c337db9051bed4458e558733db5a8d48c (diff) | |
parent | 7914f6e3738c68d5f47d6b9d67c09685c9c543b0 (diff) |
Merge pull request #602 from killeent/magma
Magma functions to generic
-rw-r--r-- | TensorMath.lua | 126 | ||||
-rw-r--r-- | lib/THC/CMakeLists.txt | 3 | ||||
-rw-r--r-- | lib/THC/THCTensorMath.h | 17 | ||||
-rw-r--r-- | lib/THC/THCTensorMathMagma.cu | 564 | ||||
-rw-r--r-- | lib/THC/THCTensorMathMagma.cuh | 22 | ||||
-rw-r--r-- | lib/THC/generic/THCTensorMathMagma.cu | 650 | ||||
-rw-r--r-- | lib/THC/generic/THCTensorMathMagma.h | 23 | ||||
-rw-r--r-- | test/test.lua | 127 |
8 files changed, 908 insertions, 624 deletions
diff --git a/TensorMath.lua b/TensorMath.lua index f94154d..61cd4e9 100644 --- a/TensorMath.lua +++ b/TensorMath.lua @@ -1173,6 +1173,114 @@ for k, Tensor_ in pairs(handledTypenames) do end end + if real == 'float' or real == 'double' then + + for _,name in ipairs({"gesv", "gels"}) do + wrap(name, + cname(name), + {{name=Tensor, returned=true}, + {name=Tensor, returned=true}, + {name=Tensor}, + {name=Tensor}}, + cname(name), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name=Tensor}}) + end + + wrap("symeig", + cname("syev"), + {{name=Tensor, returned=true}, + {name=Tensor, returned=true}, + {name=Tensor}, + {name='charoption', values={'N', 'V'}, default='N'}, + {name='charoption', values={'U', 'L'}, default='U'}}, + cname("syev"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name='charoption', values={'N', 'V'}, default='N'}, + {name='charoption', values={'U', 'L'}, default='U'}}) + + wrap("eig", + cname("geev"), + {{name=Tensor, returned=true}, + {name=Tensor, returned=true}, + {name=Tensor}, + {name='charoption', values={'N', 'V'}, default='N'}}, + cname("geev"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name='charoption', values={'N', 'V'}, default='N'}}) + + wrap("svd", + cname("gesvd"), + {{name=Tensor, returned=true}, + {name=Tensor, returned=true}, + {name=Tensor, returned=true}, + {name=Tensor}, + {name='charoption', values={'A', 'S'}, default='S'}}, + cname("gesvd"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name='charoption', values={'A', 'S'}, default='S'}}) + + wrap("inverse", + cname("getri"), + {{name=Tensor, returned=true}, + {name=Tensor}}, + cname("getri"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}}) + + wrap("potri", + cname("potri"), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}, + cname("potri"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}) + + wrap("potrf", + cname("potrf"), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}, + cname("potrf"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}) + + wrap("potrs", + cname("potrs"), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}, + cname("potrs"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}) + + wrap("qr", + cname("qr"), + {{name=Tensor, returned=true}, + {name=Tensor, returned=true}, + {name=Tensor}}, + cname("qr"), + {{name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor, default=true, returned=true, invisible=true}, + {name=Tensor}}) + + end + wrap("dot", cname("dot"), {{name=Tensor}, @@ -1818,28 +1926,34 @@ wrap("inverse", wrap("potri", cname("potri"), {{name=Tensor, returned=true}, - {name=Tensor}}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}, cname("potri"), {{name=Tensor, default=true, returned=true, invisible=true}, - {name=Tensor}}) + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}) wrap("potrf", cname("potrf"), {{name=Tensor, returned=true}, - {name=Tensor}}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}, cname("potrf"), {{name=Tensor, default=true, returned=true, invisible=true}, - {name=Tensor}}) + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}) wrap("potrs", cname("potrs"), {{name=Tensor, returned=true}, {name=Tensor}, - {name=Tensor}}, + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}, cname("potrs"), {{name=Tensor, default=true, returned=true, invisible=true}, {name=Tensor}, - {name=Tensor}}) + {name=Tensor}, + {name='charoption', values={'U', 'L'}, default='U'}}) wrap("qr", cname("qr"), diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index 51f568a..c8916d3 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -239,6 +239,7 @@ INSTALL(FILES THCTensorInfo.cuh THCTensorTypeUtils.cuh THCTensorRandom.cuh + THCTensorMathMagma.cuh DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC") INSTALL(FILES @@ -264,6 +265,8 @@ INSTALL(FILES generic/THCTensorMathCompare.cu generic/THCTensorMathCompareT.h generic/THCTensorMathCompareT.cu + generic/THCTensorMathMagma.h + generic/THCTensorMathMagma.cu generic/THCTensorMathPairwise.h generic/THCTensorMathPairwise.cu generic/THCTensorMathPointwise.h diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index 7cbef32..19ae679 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -10,6 +10,9 @@ #include "generic/THCTensorMathBlas.h" #include "THCGenerateAllTypes.h" +#include "generic/THCTensorMathMagma.h" +#include "THCGenerateAllTypes.h" + #include "generic/THCTensorMathPairwise.h" #include "THCGenerateAllTypes.h" @@ -40,20 +43,6 @@ #include "generic/THCTensorSort.h" #include "THCGenerateAllTypes.h" -// MAGMA (i.e. CUDA implementation of LAPACK functions) -THC_API void THCudaTensor_gesv(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_); -THC_API void THCudaTensor_gels(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_); -THC_API void THCudaTensor_syev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobz, const char *uplo); -THC_API void THCudaTensor_geev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobvr); -THC_API void THCudaTensor_gesvd(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *a, const char *jobu); -THC_API void THCudaTensor_gesvd2(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *ra_, THCudaTensor *a, const char *jobu); -THC_API void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a); -THC_API void THCudaTensor_potri(THCState *state, THCudaTensor *ra_, THCudaTensor *a); -THC_API void THCudaTensor_potrf(THCState *state, THCudaTensor *ra_, THCudaTensor *a); -THC_API void THCudaTensor_potrs(THCState *state, THCudaTensor *rb_, THCudaTensor *a, THCudaTensor *b); -THC_API void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *rr_, THCudaTensor *a); - - THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self); THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self); diff --git a/lib/THC/THCTensorMathMagma.cu b/lib/THC/THCTensorMathMagma.cu index d599722..cac5d73 100644 --- a/lib/THC/THCTensorMathMagma.cu +++ b/lib/THC/THCTensorMathMagma.cu @@ -1,6 +1,7 @@ #include "THCGeneral.h" #include "THCTensorMath.h" #include "THCTensorCopy.h" +#include "THCTensorMathMagma.cuh" #include <algorithm> #ifdef USE_MAGMA @@ -22,564 +23,5 @@ void THCMagma_init(THCState *state) #endif } -#ifdef USE_MAGMA -static inline float* th_magma_smalloc_pinned(size_t n) -{ - float* ptr; - if (MAGMA_SUCCESS != magma_smalloc_pinned(&ptr, n)) - THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456); - return ptr; -} - -static inline int* th_magma_imalloc_pinned(size_t n) -{ - int* ptr; - if (MAGMA_SUCCESS != magma_imalloc_pinned(&ptr, n)) - THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456); - return ptr; -} - -static void THCudaTensor_copyArray1d(THCState *state, THCudaTensor *self, float *src, int k) -{ - long size[1] = { k }; - long stride[1] = { 1 }; - THCudaTensor_rawResize(state, self, 1, size, stride); - size_t len = k * sizeof(float); - THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice)); -} - -static void THCudaTensor_copyArray2d(THCState *state, THCudaTensor *self, float *src, int m, int n) -{ - long size[2] = { m, n }; - long stride[2] = { 1, m }; - THCudaTensor_rawResize(state, self, 2, size, stride); - size_t len = m * n * sizeof(float); - THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice)); -} - -static void THCudaTensor_copyTensor2d(THCState *state, float *dst, THCudaTensor *self) -{ - THAssert(self->nDimension == 2); - size_t len = THCudaTensor_nElement(state, self)*sizeof(float); - THCudaTensor *temp = THCudaTensor_newTranspose(state, self, 0, 1); - THCudaTensor *selfc = THCudaTensor_newContiguous(state, temp); - THCudaCheck(cudaMemcpy(dst, selfc->storage->data + selfc->storageOffset, len, cudaMemcpyDeviceToHost)); - THCudaTensor_free(state, temp); - THCudaTensor_free(state, selfc); -} - -#endif - -static THCudaTensor* THCudaTensor_newColumnMajor(THCState *state, THCudaTensor *self, THCudaTensor *src) -{ - THAssert(src->nDimension == 2); - if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0]) - { - THCudaTensor_retain(state, self); - return self; - } - - if (self == src) - self = THCudaTensor_new(state); - else - THCudaTensor_retain(state, self); - - long size[2] = { src->size[0], src->size[1] }; - long stride[2] = { 1, src->size[0] }; - - THCudaTensor_rawResize(state, self, 2, size, stride); - THCudaTensor_copy(state, self, src); - return self; -} - - -void THCudaTensor_gesv(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_) -{ -#ifdef USE_MAGMA - THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(b_->nDimension == 2, 2, "b should be 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); - THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); - - int n = a_->size[0]; - int nrhs = b_->size[1]; - - THCudaTensor *a = THCudaTensor_newColumnMajor(state, ra_, a_); - THCudaTensor *b = THCudaTensor_newColumnMajor(state, rb_, b_); - float *a_data = THCudaTensor_data(state, a); - float *b_data = THCudaTensor_data(state, b); - - int *ipiv = th_magma_imalloc_pinned(n); - - int info; - magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); - - if (info < 0) - THError("MAGMA gesv : Argument %d : illegal value", -info); - else if (info > 0) - THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info); - - magma_free_pinned(ipiv); - THCudaTensor_freeCopyTo(state, a, ra_); - THCudaTensor_freeCopyTo(state, b, rb_); -#else - THError(NoMagma(gesv)); -#endif -} - -void THCudaTensor_gels(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_) -{ -#ifdef USE_MAGMA - THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(b_->nDimension == 2, 1, "b should be 2 dimensional"); - THArgCheck(a_->size[0] == b_->size[0], 2, "size incompatible A,b"); - THArgCheck(a_->size[0] >= a_->size[1], 2, "A should have m >= n"); - - THCudaTensor *a = THCudaTensor_newColumnMajor(state, ra_, a_); - THCudaTensor *b = THCudaTensor_newColumnMajor(state, rb_, b_); - float *a_data = THCudaTensor_data(state, a); - float *b_data = THCudaTensor_data(state, b); - - int m = a->size[0]; - int n = a->size[1]; - int nrhs = b->size[1]; - float wkopt; - - int info; - magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); - - float *hwork = th_magma_smalloc_pinned((size_t)wkopt); - magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); - magma_free_pinned(hwork); - - if (info != 0) - THError("MAGMA gels : Argument %d : illegal value", -info); - - THCudaTensor_freeCopyTo(state, a, ra_); - THCudaTensor_freeCopyTo(state, b, rb_); -#else - THError(NoMagma(gels)); -#endif -} - -void THCudaTensor_syev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a, const char *jobzs, const char *uplos) -{ -#ifdef USE_MAGMA - int n = a->size[0]; - int lda = n; - - magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower; - magma_vec_t jobz = jobzs[0] == 'N' ? MagmaNoVec : MagmaVec; - - THCudaTensor *input = THCudaTensor_newColumnMajor(state, rv_, a); - float *input_data = THCudaTensor_data(state, input); - - // eigen values and workspace - float *w = th_magma_smalloc_pinned(n); - float *wA = th_magma_smalloc_pinned(lda); - - // compute optimal size of work array - int info; - float lwork; - int liwork; - magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); - - float *work = th_magma_smalloc_pinned((size_t)lwork); - int *iwork = th_magma_imalloc_pinned(liwork); - - // compute eigenvalues and, optionally, eigenvectors - magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); - - // copy eigen values from w to re_ - if (info == 0) - THCudaTensor_copyArray1d(state, re_, w, n); - - magma_free_pinned(iwork); - magma_free_pinned(work); - magma_free_pinned(wA); - magma_free_pinned(w); - - // check error value - if (info > 0) - THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); - else if (info < 0) - THError("MAGMA syev : Argument %d : illegal value", -info); - - THCudaTensor_freeCopyTo(state, input, rv_); -#else - THError(NoMagma(syev)); -#endif -} - -void THCudaTensor_geev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobvrs) -{ -#ifdef USE_MAGMA - THArgCheck(a_->nDimension == 2, 3, "A should be 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square"); - - magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; - int n = a_->size[0]; - - float *a_data = th_magma_smalloc_pinned(n * n); - THCudaTensor_copyTensor2d(state, a_data, a_); - - float *wr = th_magma_smalloc_pinned(n); - float *wi = th_magma_smalloc_pinned(n); - - float *vr_data = NULL; - int ldvr = 1; - if (jobvr == MagmaVec) - { - vr_data = th_magma_smalloc_pinned(n * n); - ldvr = n; - } - - float wkopt; - int info; - - magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); - - int lwork = (int) wkopt; - float *work_data = th_magma_smalloc_pinned(lwork); - - magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); - - if (info > 0) - THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); - else if (info < 0) - THError("MAGMA geev : Argument %d : illegal value", -info); - - { - THCudaTensor_resize2d(state, re_, 2, n); - THCudaTensor *re = THCudaTensor_newContiguous(state, re_); - THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset, wr, n*sizeof(float), cudaMemcpyHostToDevice)); - THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset + n, wi, n*sizeof(float), cudaMemcpyHostToDevice)); - THCudaTensor_freeCopyTo(state, re, re_); - THCudaTensor_transpose(state, re_, NULL, 0, 1); - } - - if (jobvr == MagmaVec) - THCudaTensor_copyArray2d(state, rv_, vr_data, n, n); - - magma_free_pinned(work_data); - magma_free_pinned(vr_data); - magma_free_pinned(wi); - magma_free_pinned(wr); - magma_free_pinned(a_data); - -#else - THError(NoMagma(geev)); -#endif -} - -void THCudaTensor_gesvd(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *a, const char *jobu) -{ -#ifdef USE_MAGMA - THCudaTensor *ra_ = THCudaTensor_new(state); - THCudaTensor_gesvd2(state, ru_, rs_, rv_, ra_, a, jobu); - THCudaTensor_free(state, ra_); -#else - THError(NoMagma(gesvd)); -#endif -} - -void THCudaTensor_gesvd2(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *ra_, THCudaTensor *a, const char *jobus) -{ -#ifdef USE_MAGMA - THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); - - magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec; - magma_vec_t jobvt = jobu; - - int m = a->size[0]; - int n = a->size[1]; - int k = m < n ? m : n; - int j = (jobu == MagmaAllVec) ? m : k; - - float *a_data = th_magma_smalloc_pinned(m * n); - THCudaTensor_copyTensor2d(state, a_data, a); - - float *rs_data = th_magma_smalloc_pinned(k); - float *ru_data = th_magma_smalloc_pinned(m * j); - float *rv_data = th_magma_smalloc_pinned(n * n); - - float wkopt; - int info; - magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info); - - int lwork = (int) wkopt; - float *work_data = th_magma_smalloc_pinned(lwork); - - magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info); - - if (info > 0) - THError("MAGMA gesvd : %d superdiagonals failed to converge", info); - else if (info < 0) - THError("MAGMA gesvd : Argument %d : illegal value", -info); - - THCudaTensor_copyArray2d(state, rv_, rv_data, n, n); - THCudaTensor_transpose(state, rv_, NULL, 0, 1); - THCudaTensor_copyArray2d(state, ru_, ru_data, m, j); - THCudaTensor_copyArray1d(state, rs_, rs_data, k); - THCudaTensor_copyArray2d(state, ra_, a_data, m, n); - - magma_free_pinned(work_data); - magma_free_pinned(rv_data); - magma_free_pinned(ru_data); - magma_free_pinned(rs_data); - magma_free_pinned(a_data); -#else - THError(NoMagma(gesvd2)); -#endif -} - -void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a) -{ -#ifdef USE_MAGMA - THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); - - int info; - int n = a->size[0]; - int lwork = n * magma_get_sgetri_nb(n); - - THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a); - float *input_data = THCudaTensor_data(state, input); - - int *ipiv = th_magma_imalloc_pinned(n); - - THCudaTensor *work = THCudaTensor_newWithSize1d(state, lwork); - float *work_data = THCudaTensor_data(state, work); - - // Run LU - magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info); - if (info > 0) - THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("MAGMA getrf : Argument %d : illegal value", -info); - - // Inverse - magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); - if (info > 0) - THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("MAGMA getri : Argument %d : illegal value", -info); - - THCudaTensor_free(state, work); - magma_free_pinned(ipiv); - THCudaTensor_freeCopyTo(state, input, ra_); -#else - THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); - - int n = a->size[0]; - - // input - THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a); - // output - THCudaTensor *output = THCudaTensor_newColumnMajor(state, ra_, a); - - size_t matrices_size = sizeof(float*); - - float **matrices1 = (float **)THAlloc(matrices_size); - const float **matrices1_const = (const float **)THAlloc(matrices_size); - float **matrices2 = (float **)THAlloc(matrices_size); - matrices1[0] = THCudaTensor_data(state, input); - matrices1_const[0] = THCudaTensor_data(state, input); - matrices2[0] = THCudaTensor_data(state, output); - - // Copy pointers to device. - float **d_matrices1, **d_matrices2; - const float **d_matrices1_const; - THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size)); - THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1_const, matrices_size)); - THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size)); - - THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size, - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - THCudaCheck(cudaMemcpyAsync(d_matrices1_const, matrices1_const, matrices_size, - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size, - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - int info; - int *info_gpu; - THCudaCheck(THCudaMalloc(state, (void**)&info_gpu, sizeof(int))); - - int *ipiv_gpu; - THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int))); - - // Run LU - THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); - - THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); - - if (info > 0) - THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("CUBLAS getrf : Argument %d : illegal value", -info); - - // Inverse - THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); - if (info > 0) - THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("CUBLAS getri : Argument %d : illegal value", -info); - - THCudaCheck(THCudaFree(state, ipiv_gpu)); - THCudaCheck(THCudaFree(state, info_gpu)); - THCudaTensor_freeCopyTo(state, output, input); -#endif - -} - - -__global__ void THCudaTensor_copyUpperSymmetric(float *input, int n, int len) -{ - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { - const int r = idx % n; - const int c = idx / n; - if (r > c) { - input[idx] = input[r*n + c]; - } - } -} - -void THCudaTensor_potri(THCState *state, THCudaTensor *ra_, THCudaTensor *a) -{ -#ifdef USE_MAGMA - THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); - - int n = a->size[0]; - - THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a); - float *input_data = THCudaTensor_data(state, input); - - int info; - magma_spotrf_gpu(MagmaUpper, n, input_data, n, &info); - if (info > 0) - THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info); - else if (info < 0) - THError("MAGMA potrf : Argument %d : illegal value", -info); - - magma_spotri_gpu(MagmaUpper, n, input_data, n, &info); - if (info > 0) - THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info); - else if (info < 0) - THError("MAGMA potri : Argument %d : illegal value", -info); - - cudaStream_t stream = THCState_getCurrentStream(state); - const int len = n*n; - dim3 blocks(std::min(DIVUP(len, 128), 65535)); - dim3 threads(128); - THCudaTensor_copyUpperSymmetric<<<blocks, threads, 0, stream>>>(input_data, n, len); - - THCudaTensor_freeCopyTo(state, input, ra_); -#else - THError(NoMagma(potri)); -#endif -} - -void THCudaTensor_potrf(THCState *state, THCudaTensor *ra_, THCudaTensor *a) -{ -#ifdef USE_MAGMA - THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); - - int n = a->size[0]; - - THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a); - float *input_data = THCudaTensor_data(state, input); - - int info; - magma_spotrf_gpu(MagmaUpper, n, input_data, n, &info); - - // check error value - if (info > 0) - THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info); - else if (info < 0) - THError("MAGMA potrf : Argument %d : illegal value", -info); - - THCudaTensor_triu(state, ra_, input, 0); - THCudaTensor_free(state, input); -#else - THError(NoMagma(potrf)); -#endif -} - -void THCudaTensor_potrs(THCState *state, THCudaTensor *rb_, THCudaTensor *b, THCudaTensor *a) -{ -#ifdef USE_MAGMA - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); - - int n = a->size[0]; - int nrhs = b->size[1]; - - THCudaTensor *b_ = THCudaTensor_newColumnMajor(state, rb_, b); - float *b_data = THCudaTensor_data(state, b_); - THCudaTensor *a_ = THCudaTensor_newColumnMajor(state, a, a); - float *a_data = THCudaTensor_data(state, a_); - - int info; - magma_spotrs_gpu(MagmaUpper, n, nrhs, a_data, n, b_data, n, &info); - - // check error value - if (info < 0) - THError("MAGMA potrs : Argument %d : illegal value", -info); - - THCudaTensor_freeCopyTo(state, b_, rb_); - THCudaTensor_free(state, a_); -#else - THError(NoMagma(potrs)); -#endif -} - -void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *rr_, THCudaTensor *a_) -{ -#ifdef USE_MAGMA - THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional"); - - THCudaTensor *a = THCudaTensor_newColumnMajor(state, rr_, a_); - int m = a->size[0]; - int n = a->size[1]; - int k = (m < n ? m : n); - -#ifdef MAGMA_V2 - int nb = magma_get_sgeqrf_nb(m, n); -#else - int nb = magma_get_sgeqrf_nb(m); -#endif - - float *a_data = THCudaTensor_data(state, a); - float *tau_data = th_magma_smalloc_pinned(n*n); - - THCudaTensor *work = THCudaTensor_newWithSize1d(state, (2*k + ((n+31)/32)*32)*nb); - float *work_data = THCudaTensor_data(state, work); - - int info; - magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); - - if (info != 0) - THError("MAGMA geqrf : Argument %d : illegal value.", -info); - - THCudaTensor *q = THCudaTensor_newColumnMajor(state, rq_, a); - float *q_data = THCudaTensor_data(state, q); - - THCudaTensor_narrow(state, a, a, 0, 0, k); - THCudaTensor_triu(state, rr_, a, 0); - THCudaTensor_free(state, a); - - magma_sorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info); - - if (info != 0) - THError("MAGMA orgqr : Argument %d : illegal value.", -info); - - THCudaTensor_free(state, work); - magma_free_pinned(tau_data); - - THCudaTensor_narrow(state, q, q, 1, 0, k); - THCudaTensor_freeCopyTo(state, q, rq_); -#else - THError(NoMagma(qr)); -#endif -} +#include "generic/THCTensorMathMagma.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathMagma.cuh b/lib/THC/THCTensorMathMagma.cuh new file mode 100644 index 0000000..6495049 --- /dev/null +++ b/lib/THC/THCTensorMathMagma.cuh @@ -0,0 +1,22 @@ +#ifndef THC_TENSOR_MATH_MAGMA_CUH +#define THC_TENSOR_MATH_MAGMA_CUH + +#ifdef USE_MAGMA +#include <magma.h> +#else +#include "THCBlas.h" +#endif + +#ifdef USE_MAGMA +template <typename T> +static inline T* th_magma_malloc_pinned(size_t n) +{ + void* ptr; + if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T))) + THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456); + return reinterpret_cast<T*>(ptr); +} + +#endif + +#endif // THC_TENSOR_MATH_MAGMA_CUH diff --git a/lib/THC/generic/THCTensorMathMagma.cu b/lib/THC/generic/THCTensorMathMagma.cu new file mode 100644 index 0000000..d302b13 --- /dev/null +++ b/lib/THC/generic/THCTensorMathMagma.cu @@ -0,0 +1,650 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathMagma.cu" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +#ifdef USE_MAGMA + +static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int k) +{ + long size[1] = { k }; + long stride[1] = { 1 }; + THCTensor_(rawResize)(state, self, 1, size, stride); + size_t len = k * sizeof(real); + THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice)); +} + +static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n) +{ + long size[2] = { m, n }; + long stride[2] = { 1, m }; + THCTensor_(rawResize)(state, self, 2, size, stride); + size_t len = m * n * sizeof(real); + THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice)); +} + +static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) +{ + THAssert(self->nDimension == 2); + size_t len = THCTensor_(nElement)(state, self)*sizeof(real); + THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); + THCTensor *selfc = THCTensor_(newContiguous)(state, temp); + THCudaCheck(cudaMemcpy(dst, selfc->storage->data + selfc->storageOffset, len, cudaMemcpyDeviceToHost)); + THCTensor_(free)(state, temp); + THCTensor_(free)(state, selfc); +} + +#endif // USE_MAGMA + +static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) +{ + THAssert(src->nDimension == 2); + if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0]) + { + THCTensor_(retain)(state, self); + return self; + } + + if (self == src) + self = THCTensor_(new)(state); + else + THCTensor_(retain)(state, self); + + long size[2] = { src->size[0], src->size[1] }; + long stride[2] = { 1, src->size[0] }; + + THCTensor_(rawResize)(state, self, 2, size, stride); + THCTensor_(copy)(state, self, src); + return self; +} + + +THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional"); + THArgCheck(b_->nDimension == 2, 2, "b should be 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); + THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + + int n = a_->size[0]; + int nrhs = b_->size[1]; + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + + int *ipiv = th_magma_malloc_pinned<int>(n); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); +#else + magma_dgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info); +#endif + + if (info < 0) + THError("MAGMA gesv : Argument %d : illegal value", -info); + else if (info > 0) + THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info); + + magma_free_pinned(ipiv); + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(gesv)); +#endif +} + +THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional"); + THArgCheck(b_->nDimension == 2, 1, "b should be 2 dimensional"); + THArgCheck(a_->size[0] == b_->size[0], 2, "size incompatible A,b"); + THArgCheck(a_->size[0] >= a_->size[1], 2, "A should have m >= n"); + + THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); + THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); + real *a_data = THCTensor_(data)(state, a); + real *b_data = THCTensor_(data)(state, b); + + int m = a->size[0]; + int n = a->size[1]; + int nrhs = b->size[1]; + real wkopt; + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); +#else + magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info); +#endif + + real *hwork = th_magma_malloc_pinned<real>((size_t)wkopt); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); +#else + magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info); +#endif + + magma_free_pinned(hwork); + + if (info != 0) + THError("MAGMA gels : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, a, ra_); + THCTensor_(freeCopyTo)(state, b, rb_); +#else + THError(NoMagma(gels)); +#endif +} + +THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +{ +#ifdef USE_MAGMA + int n = a->size[0]; + int lda = n; + + magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower; + magma_vec_t jobz = jobzs[0] == 'N' ? MagmaNoVec : MagmaVec; + + THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a); + real *input_data = THCTensor_(data)(state, input); + + // eigen values and workspace + real *w = th_magma_malloc_pinned<real>(n); + real *wA = th_magma_malloc_pinned<real>(lda); + + // compute optimal size of work array + int info; + real lwork; + int liwork; + +#if defined(THC_REAL_IS_FLOAT) + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); +#else + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); +#endif + + real *work = th_magma_malloc_pinned<real>((size_t)lwork); + int *iwork = th_magma_malloc_pinned<int>(liwork); + + // compute eigenvalues and, optionally, eigenvectors +#if defined(THC_REAL_IS_FLOAT) + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); +#else + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); +#endif + + // copy eigen values from w to re_ + if (info == 0) + THCTensor_(copyArray1d)(state, re_, w, n); + + magma_free_pinned(iwork); + magma_free_pinned(work); + magma_free_pinned(wA); + magma_free_pinned(w); + + // check error value + if (info > 0) + THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA syev : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, input, rv_); +#else + THError(NoMagma(syev)); +#endif +} + +THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +{ +#ifdef USE_MAGMA + THArgCheck(a_->nDimension == 2, 3, "A should be 2 dimensional"); + THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square"); + + magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; + int n = a_->size[0]; + + real *a_data = th_magma_malloc_pinned<real>(n * n); + THCTensor_(copyTensor2d)(state, a_data, a_); + + real *wr = th_magma_malloc_pinned<real>(n); + real *wi = th_magma_malloc_pinned<real>(n); + + real *vr_data = NULL; + int ldvr = 1; + if (jobvr == MagmaVec) + { + vr_data = th_magma_malloc_pinned<real>(n * n); + ldvr = n; + } + + real wkopt; + int info; + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); +#else + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); +#endif + + int lwork = (int) wkopt; + real *work_data = th_magma_malloc_pinned<real>(lwork); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); +#else + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA geev : Argument %d : illegal value", -info); + + { + THCTensor_(resize2d)(state, re_, 2, n); + THCTensor *re = THCTensor_(newContiguous)(state, re_); + THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + THCTensor_(freeCopyTo)(state, re, re_); + THCTensor_(transpose)(state, re_, NULL, 0, 1); + } + + if (jobvr == MagmaVec) + THCTensor_(copyArray2d)(state, rv_, vr_data, n, n); + + magma_free_pinned(work_data); + magma_free_pinned(vr_data); + magma_free_pinned(wi); + magma_free_pinned(wr); + magma_free_pinned(a_data); + +#else + THError(NoMagma(geev)); +#endif +} + +THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) +{ +#ifdef USE_MAGMA + THCTensor *ra_ = THCTensor_(new)(state); + THCTensor_(gesvd2)(state, ru_, rs_, rv_, ra_, a, jobu); + THCTensor_(free)(state, ra_); +#else + THError(NoMagma(gesvd)); +#endif +} + +THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +{ +#ifdef USE_MAGMA + THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); + + magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec; + magma_vec_t jobvt = jobu; + + int m = a->size[0]; + int n = a->size[1]; + int k = m < n ? m : n; + int j = (jobu == MagmaAllVec) ? m : k; + + real *a_data = th_magma_malloc_pinned<real>(m * n); + THCTensor_(copyTensor2d)(state, a_data, a); + + real *rs_data = th_magma_malloc_pinned<real>(k); + real *ru_data = th_magma_malloc_pinned<real>(m * j); + real *rv_data = th_magma_malloc_pinned<real>(n * n); + + real wkopt; + int info; + +#if defined(THC_REAL_IS_FLOAT) + magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info); +#else + magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info); +#endif + + int lwork = (int) wkopt; + real *work_data = th_magma_malloc_pinned<real>(lwork); + +#if defined(THC_REAL_IS_FLOAT) + magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info); +#else + magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA gesvd : %d superdiagonals failed to converge", info); + else if (info < 0) + THError("MAGMA gesvd : Argument %d : illegal value", -info); + + THCTensor_(copyArray2d)(state, rv_, rv_data, n, n); + THCTensor_(transpose)(state, rv_, NULL, 0, 1); + THCTensor_(copyArray2d)(state, ru_, ru_data, m, j); + THCTensor_(copyArray1d)(state, rs_, rs_data, k); + THCTensor_(copyArray2d)(state, ra_, a_data, m, n); + + magma_free_pinned(work_data); + magma_free_pinned(rv_data); + magma_free_pinned(ru_data); + magma_free_pinned(rs_data); + magma_free_pinned(a_data); +#else + THError(NoMagma(gesvd2)); +#endif +} + +THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +{ +#ifdef USE_MAGMA + THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int info; + int n = a->size[0]; + int lwork = n * magma_get_sgetri_nb(n); + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int *ipiv = th_magma_malloc_pinned<int>(n); + + THCTensor *work = THCTensor_(newWithSize1d)(state, lwork); + real *work_data = THCTensor_(data)(state, work); + + // Run LU +#if defined(THC_REAL_IS_FLOAT) + magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info); +#else + magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info); +#endif + + if (info > 0) + THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("MAGMA getrf : Argument %d : illegal value", -info); + + // Inverse +#if defined(THC_REAL_IS_FLOAT) + magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); +#else + magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); +#endif + + if (info > 0) + THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("MAGMA getri : Argument %d : illegal value", -info); + + THCTensor_(free)(state, work); + magma_free_pinned(ipiv); + THCTensor_(freeCopyTo)(state, input, ra_); +#else + THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int n = a->size[0]; + + // input + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + // output + THCTensor *output = THCTensor_(newColumnMajor)(state, ra_, a); + + size_t matrices_size = sizeof(real*); + + real **matrices1 = (real **)THAlloc(matrices_size); + const real **matrices1_const = (const real **)THAlloc(matrices_size); + real **matrices2 = (real **)THAlloc(matrices_size); + matrices1[0] = THCTensor_(data)(state, input); + matrices1_const[0] = THCTensor_(data)(state, input); + matrices2[0] = THCTensor_(data)(state, output); + + // Copy pointers to device. + real **d_matrices1, **d_matrices2; + const real **d_matrices1_const; + THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size)); + THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1_const, matrices_size)); + THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size)); + + THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size, + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(d_matrices1_const, matrices1_const, matrices_size, + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size, + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + int info; + int *info_gpu; + THCudaCheck(THCudaMalloc(state, (void**)&info_gpu, sizeof(int))); + + int *ipiv_gpu; + THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int))); + + // Run LU +#if defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); +#else + THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); +#endif + + THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); + + if (info > 0) + THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("CUBLAS getrf : Argument %d : illegal value", -info); + + // Inverse +#if defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); +#else + THCudaBlas_Dgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); +#endif + + if (info > 0) + THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); + else if (info < 0) + THError("CUBLAS getri : Argument %d : illegal value", -info); + + THCudaCheck(THCudaFree(state, ipiv_gpu)); + THCudaCheck(THCudaFree(state, info_gpu)); + THCTensor_(freeCopyTo)(state, output, input); +#endif +} + +__global__ void THCTensor_(copyUpperSymmetric)(real *input, int n, int len) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { + const int r = idx % n; + const int c = idx / n; + if (r > c) { + input[idx] = input[r*n + c]; + } + } +} + +__global__ void THCTensor_(copyLowerSymmetric)(real *input, int n, int len) +{ + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { + const int r = idx % n; + const int c = idx / n; + if (r < c) { + input[idx] = input[r*n + c]; + } + } +} + +THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int n = a->size[0]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotri_gpu(ul, n, input_data, n, &info); +#else + magma_dpotri_gpu(ul, n, input_data, n, &info); +#endif + + if (info > 0) + THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info); + else if (info < 0) + THError("MAGMA potri : Argument %d : illegal value", -info); + + cudaStream_t stream = THCState_getCurrentStream(state); + const int len = n*n; + dim3 blocks(std::min(DIVUP(len, 128), 65535)); + dim3 threads(128); + if (uplo[0] == 'U') { + THCTensor_(copyUpperSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len); + } else { + THCTensor_(copyLowerSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len); + } + + THCTensor_(freeCopyTo)(state, input, ra_); +#else + THError(NoMagma(potri)); +#endif +} + +THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional"); + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int n = a->size[0]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); + real *input_data = THCTensor_(data)(state, input); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotrf_gpu(ul, n, input_data, n, &info); +#else + magma_dpotrf_gpu(ul, n, input_data, n, &info); +#endif + + // check error value + if (info > 0) + THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info); + else if (info < 0) + THError("MAGMA potrf : Argument %d : illegal value", -info); + + if (uplo[0] == 'U') { + THCTensor_(triu)(state, ra_, input, 0); + } else { + THCTensor_(tril)(state, ra_, input, 0); + } + THCTensor_(free)(state, input); +#else + THError(NoMagma(potrf)); +#endif +} + +THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +{ +#ifdef USE_MAGMA + THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + + int n = a->size[0]; + int nrhs = b->size[1]; + magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; + + THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b); + real *b_data = THCTensor_(data)(state, b_); + THCTensor *a_ = THCTensor_(newColumnMajor)(state, a, a); + real *a_data = THCTensor_(data)(state, a_); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_spotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info); +#else + magma_dpotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info); +#endif + + // check error value + if (info < 0) + THError("MAGMA potrs : Argument %d : illegal value", -info); + + THCTensor_(freeCopyTo)(state, b_, rb_); + THCTensor_(free)(state, a_); +#else + THError(NoMagma(potrs)); +#endif +} + +THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +{ +#ifdef USE_MAGMA + THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional"); + + THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_); + int m = a->size[0]; + int n = a->size[1]; + int k = (m < n ? m : n); + +#ifdef MAGMA_V2 + int nb = magma_get_sgeqrf_nb(m, n); +#else + int nb = magma_get_sgeqrf_nb(m); +#endif + + real *a_data = THCTensor_(data)(state, a); + real *tau_data = th_magma_malloc_pinned<real>(n*n); + + THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + ((n+31)/32)*32)*nb); + real *work_data = THCTensor_(data)(state, work); + + int info; +#if defined(THC_REAL_IS_FLOAT) + magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); +#else + magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info); +#endif + + if (info != 0) + THError("MAGMA geqrf : Argument %d : illegal value.", -info); + + THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a); + real *q_data = THCTensor_(data)(state, q); + + THCTensor_(narrow)(state, a, a, 0, 0, k); + THCTensor_(triu)(state, rr_, a, 0); + THCTensor_(free)(state, a); + +#if defined(THC_REAL_IS_FLOAT) + magma_sorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info); +#else + magma_dorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info); +#endif + + if (info != 0) + THError("MAGMA orgqr : Argument %d : illegal value.", -info); + + THCTensor_(free)(state, work); + magma_free_pinned(tau_data); + + THCTensor_(narrow)(state, q, q, 1, 0, k); + THCTensor_(freeCopyTo)(state, q, rq_); +#else + THError(NoMagma(qr)); +#endif +} + +#endif + +#endif diff --git a/lib/THC/generic/THCTensorMathMagma.h b/lib/THC/generic/THCTensorMathMagma.h new file mode 100644 index 0000000..938daea --- /dev/null +++ b/lib/THC/generic/THCTensorMathMagma.h @@ -0,0 +1,23 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathMagma.h" +#else + +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +// MAGMA (i.e. CUDA implementation of LAPACK functions) +THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); +THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); +THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobz, const char *uplo); +THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvr); +THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a); +THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); +THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); +THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *a, THCTensor *b, const char *uplo); +THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a); + + +#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + +#endif diff --git a/test/test.lua b/test/test.lua index 33147b1..c508d5d 100644 --- a/test/test.lua +++ b/test/test.lua @@ -2325,19 +2325,26 @@ end function test.inverse() local a = torch.eye(5):add(torch.Tensor(5, 5):uniform(-0.1, 0.1)) - local i1 = torch.inverse(a) - local i2 = torch.inverse(a:cuda()) - tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong inverse answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = a:type(typename) + local i1 = torch.inverse(at) + local i2 = torch.inverse(a:cuda()) + tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong inverse answer") + end end if cutorch.magma then function test.gesv() local a = torch.Tensor(5, 5):uniform(-1, 1) local b = torch.Tensor(5, 3):uniform(-1, 1) - local rb1, ra1 = torch.gesv(b, a) - local rb2, ra2 = torch.gesv(b:cuda(), a:cuda()) - tester:assertle((rb2 - rb1:cuda()):abs():max(), 1e-5, "wrong gesv answer") - tester:assertle((ra2 - ra1:cuda()):abs():max(), 1e-5, "wrong gesv answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = a:type(typename) + local bt = b:type(typename) + local rb1, ra1 = torch.gesv(bt, at) + local rb2, ra2 = torch.gesv(bt:cuda(), at:cuda()) + tester:assertle((rb2 - rb1:cuda()):abs():max(), 1e-5, "wrong gesv answer") + tester:assertle((ra2 - ra1:cuda()):abs():max(), 1e-5, "wrong gesv answer") + end end function test.gels() @@ -2355,10 +2362,14 @@ if cutorch.magma then { 0.5360, 0.2048, 0.2745}, { 0.8535,-0.3938,-0.2140}, } - local rb1, ra1 = torch.gels(b, a) - local rb2, ra2 = torch.gels(b:cuda(), a:cuda()) - tester:assertle((rb2 - rb1:cuda()):abs():max(), 5e-4, "wrong gels answer") - tester:assertle((ra2 - ra1:cuda()):abs():max(), 5e-4, "wrong gels answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = a:type(typename) + local bt = b:type(typename) + local rb1, ra1 = torch.gels(bt, at) + local rb2, ra2 = torch.gels(bt:cuda(), at:cuda()) + tester:assertle((rb2 - rb1:cuda()):abs():max(), 5e-4, "wrong gels answer") + tester:assertle((ra2 - ra1:cuda()):abs():max(), 5e-4, "wrong gels answer") + end end function test.symeig() @@ -2367,10 +2378,13 @@ if cutorch.magma then {-0.47, -6.39, 4.17, 0.00, 0.00}, {-7.20, 1.50, -1.51, 5.70, 0.00}, {-0.65, -6.34, 2.67, 1.80, -7.10}}):t() - local e1,v1 = torch.symeig(a, 'V') - local e2,v2 = torch.symeig(a:cuda(), 'V') - tester:assertle((e2 - e1:cuda()):abs():max(), 1e-5, "wrong symeig answer") - tester:assertle((v2 - v1:cuda()):abs():max(), 1e-5, "wrong symeig answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = a:type(typename) + local e1,v1 = torch.symeig(at, 'V') + local e2,v2 = torch.symeig(at:cuda(), 'V') + tester:assertle((e2 - e1:cuda()):abs():max(), 1e-5, "wrong symeig answer") + tester:assertle((v2 - v1:cuda()):abs():max(), 1e-5, "wrong symeig answer") + end end function test.eig() @@ -2381,10 +2395,13 @@ if cutorch.magma then { 0.5766, -0.6743, 0.6903, 0.3646, -0.4571}, {-0.8956, -0.4074, -0.7583, 0.1838, -0.0091}, } - local e1,v1 = torch.eig(a, 'V') - local e2,v2 = torch.eig(a:cuda(), 'V') - tester:assertle((e2 - e1:cuda()):abs():max(), 1e-6, "wrong eig answer") - tester:assertle((v2:abs() - v1:abs():cuda()):abs():max(), 1e-6, "wrong eig answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = a:type(typename) + local e1,v1 = torch.eig(at, 'V') + local e2,v2 = torch.eig(at:cuda(), 'V') + tester:assertle((e2 - e1:cuda()):abs():max(), 1e-6, "wrong eig answer") + tester:assertle((v2:abs() - v1:abs():cuda()):abs():max(), 1e-6, "wrong eig answer") + end end function test.svd() @@ -2395,17 +2412,20 @@ if cutorch.magma then {5.45, -0.27, 4.85, 0.74, 10.00, -6.02}, {3.16, 7.98, 3.01, 5.80, 4.27, -5.31}} - local u,s,v = torch.svd(a, 'A') + for _, typename in ipairs({'torch.CudaDoubleTensor', 'torch.CudaTensor'}) do + local at = a:type(typename) + local u,s,v = torch.svd(a, 'A') - local temp = torch.Tensor(a:size(2)):zero() - temp:narrow(1, 1, a:size(1)):copy(s) - local sigma = torch.diag(temp):resize(a:size(1), a:size(2)):cuda() + local temp = torch.Tensor(a:size(2)):zero() + temp:narrow(1, 1, a:size(1)):copy(s) + local sigma = torch.diag(temp):resize(a:size(1), a:size(2)):cuda() - local m = u * sigma * v:t() + local m = u * sigma * v:t() - tester:assertle((m - a):abs():max(), 1e-5, "svd: a != u * s * vT") - tester:assertle((u*u:t() - torch.eye(a:size(1)):cuda()):abs():max(), 1e-6, "svd: u should be unitary") - tester:assertle((v*v:t() - torch.eye(a:size(2)):cuda()):abs():max(), 1e-6, "svd: v should be unitary") + tester:assertle((m - a):abs():max(), 1e-5, "svd: a != u * s * vT") + tester:assertle((u*u:t() - torch.eye(a:size(1)):cuda()):abs():max(), 1e-6, "svd: u should be unitary") + tester:assertle((v*v:t() - torch.eye(a:size(2)):cuda()):abs():max(), 1e-6, "svd: v should be unitary") + end end @@ -2419,11 +2439,18 @@ if cutorch.magma then } A = A * A:t() - local i1 = torch.potri(A) - local i2 = torch.potri(A:cuda()) - local M = A:cuda() * i2 - tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potri answer") - tester:assertle((M - torch.eye(A:size(1)):cuda()):abs():max(), 1e-5, "potri not an inverse") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = A:type(typename) + for _, triarg in ipairs({'U', 'L'}) do + local chol = torch.potrf(at, triarg) + + local i1 = torch.potri(chol, triarg) + local i2 = torch.potri(chol:cuda(), triarg) + local M = at:cuda() * i2 + tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potri answer") + tester:assertle((M - torch.eye(at:size(1)):cuda()):abs():max(), 1e-5, "potri not an inverse") + end + end end function test.potrf() @@ -2434,9 +2461,14 @@ if cutorch.magma then {-0.6738, 0.4734,-1.1123, 2.4071,-1.2756}, {-3.3883, 0.2807, 0.8161,-1.2756, 4.3415}, } - local i1 = torch.potrf(A) - local i2 = torch.potrf(A:cuda()) - tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potrf answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = A:type(typename) + for _, triarg in ipairs({'U', 'L'}) do + local i1 = torch.potrf(at, triarg) + local i2 = torch.potrf(at:cuda(), triarg) + tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potrf answer") + end + end end function test.potrs() @@ -2452,10 +2484,16 @@ if cutorch.magma then {0.2334, 0.8594, 0.4103}, {0.7556, 0.1966, 0.9637}, {0.1420, 0.7185, 0.7476}}) - local chol = torch.potrf(A) - local solve1 = torch.potrs(B, chol) - local solve2 = torch.potrs(B:cuda(), chol:cuda()) - tester:assertle((solve2 - solve1:cuda()):abs():max(), 1e-4, "wrong potrs answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = A:type(typename) + local bt = B:type(typename) + for _, triarg in ipairs({'U', 'L'}) do + local chol = torch.potrf(at, triarg) + local solve1 = torch.potrs(bt, chol, triarg) + local solve2 = torch.potrs(bt:cuda(), chol:cuda(), triarg) + tester:assertle((solve2 - solve1:cuda()):abs():max(), 1e-4, "wrong potrs answer") + end + end end function test.qr() @@ -2466,10 +2504,13 @@ if cutorch.magma then {-0.2987, 1.9035, -1.4192, -0.9738, 1.4384}, {-0.5315, 0.4958, 0.4449, -0.4676, -0.4878}, } - local q1,r1 = torch.qr(A) - local q2,r2 = torch.qr(A:cuda()) - tester:assertle((q2 - q1:cuda()):abs():max(), 1e-5, "wrong qr answer") - tester:assertle((r2 - r1:cuda()):abs():max(), 1e-5, "wrong qr answer") + for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do + local at = A:type(typename) + local q1,r1 = torch.qr(at) + local q2,r2 = torch.qr(at:cuda()) + tester:assertle((q2 - q1:cuda()):abs():max(), 1e-5, "wrong qr answer") + tester:assertle((r2 - r1:cuda()):abs():max(), 1e-5, "wrong qr answer") + end end end |