github.com/torch/cutorch.git
author     Soumith Chintala <soumith@gmail.com>   2016-11-17 00:55:01 +0300
committer  GitHub <noreply@github.com>            2016-11-17 00:55:01 +0300
commit     6d6fe2061c79ada02ffd15e5314472d65640f498 (patch)
tree       dda7e84b1f32f9f2818573e66164028535731950
parent     7ed4888c337db9051bed4458e558733db5a8d48c (diff)
parent     7914f6e3738c68d5f47d6b9d67c09685c9c543b0 (diff)
Merge pull request #602 from killeent/magma
Magma functions to generic
-rw-r--r--  TensorMath.lua                          126
-rw-r--r--  lib/THC/CMakeLists.txt                    3
-rw-r--r--  lib/THC/THCTensorMath.h                  17
-rw-r--r--  lib/THC/THCTensorMathMagma.cu           564
-rw-r--r--  lib/THC/THCTensorMathMagma.cuh           22
-rw-r--r--  lib/THC/generic/THCTensorMathMagma.cu   650
-rw-r--r--  lib/THC/generic/THCTensorMathMagma.h     23
-rw-r--r--  test/test.lua                           127
8 files changed, 908 insertions, 624 deletions
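At the Lua level, the merge makes the MAGMA-backed routines type-generic (torch.CudaTensor and torch.CudaDoubleTensor) and threads an optional 'U'/'L' uplo argument through potrf, potri, and potrs. A minimal usage sketch, assuming a cutorch build with MAGMA available (cutorch.magma == true); the matrix values are illustrative only:

    require 'cutorch'

    local A = torch.randn(4, 4)
    A = A * A:t() + torch.eye(4) * 4        -- symmetric positive definite input

    -- the same calls now dispatch for single- and double-precision CUDA tensors
    for _, T in ipairs({'torch.CudaTensor', 'torch.CudaDoubleTensor'}) do
      local a    = A:type(T)
      local chol = torch.potrf(a, 'U')      -- new optional uplo argument, default 'U'
      local ainv = torch.potri(chol, 'U')   -- inverse recovered from the Cholesky factor
      print(T, (a * ainv - torch.eye(4):type(T)):abs():max())
    end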
diff --git a/TensorMath.lua b/TensorMath.lua
index f94154d..61cd4e9 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -1173,6 +1173,114 @@ for k, Tensor_ in pairs(handledTypenames) do
end
end
+ if real == 'float' or real == 'double' then
+
+ for _,name in ipairs({"gesv", "gels"}) do
+ wrap(name,
+ cname(name),
+ {{name=Tensor, returned=true},
+ {name=Tensor, returned=true},
+ {name=Tensor},
+ {name=Tensor}},
+ cname(name),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name=Tensor}})
+ end
+
+ wrap("symeig",
+ cname("syev"),
+ {{name=Tensor, returned=true},
+ {name=Tensor, returned=true},
+ {name=Tensor},
+ {name='charoption', values={'N', 'V'}, default='N'},
+ {name='charoption', values={'U', 'L'}, default='U'}},
+ cname("syev"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name='charoption', values={'N', 'V'}, default='N'},
+ {name='charoption', values={'U', 'L'}, default='U'}})
+
+ wrap("eig",
+ cname("geev"),
+ {{name=Tensor, returned=true},
+ {name=Tensor, returned=true},
+ {name=Tensor},
+ {name='charoption', values={'N', 'V'}, default='N'}},
+ cname("geev"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name='charoption', values={'N', 'V'}, default='N'}})
+
+ wrap("svd",
+ cname("gesvd"),
+ {{name=Tensor, returned=true},
+ {name=Tensor, returned=true},
+ {name=Tensor, returned=true},
+ {name=Tensor},
+ {name='charoption', values={'A', 'S'}, default='S'}},
+ cname("gesvd"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name='charoption', values={'A', 'S'}, default='S'}})
+
+ wrap("inverse",
+ cname("getri"),
+ {{name=Tensor, returned=true},
+ {name=Tensor}},
+ cname("getri"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor}})
+
+ wrap("potri",
+ cname("potri"),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}},
+ cname("potri"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}})
+
+ wrap("potrf",
+ cname("potrf"),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}},
+ cname("potrf"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}})
+
+ wrap("potrs",
+ cname("potrs"),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}},
+ cname("potrs"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}})
+
+ wrap("qr",
+ cname("qr"),
+ {{name=Tensor, returned=true},
+ {name=Tensor, returned=true},
+ {name=Tensor}},
+ cname("qr"),
+ {{name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor, default=true, returned=true, invisible=true},
+ {name=Tensor}})
+
+ end
+
wrap("dot",
cname("dot"),
{{name=Tensor},
@@ -1818,28 +1926,34 @@ wrap("inverse",
wrap("potri",
cname("potri"),
{{name=Tensor, returned=true},
- {name=Tensor}},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}},
cname("potri"),
{{name=Tensor, default=true, returned=true, invisible=true},
- {name=Tensor}})
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}})
wrap("potrf",
cname("potrf"),
{{name=Tensor, returned=true},
- {name=Tensor}},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}},
cname("potrf"),
{{name=Tensor, default=true, returned=true, invisible=true},
- {name=Tensor}})
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}})
wrap("potrs",
cname("potrs"),
{{name=Tensor, returned=true},
{name=Tensor},
- {name=Tensor}},
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}},
cname("potrs"),
{{name=Tensor, default=true, returned=true, invisible=true},
{name=Tensor},
- {name=Tensor}})
+ {name=Tensor},
+ {name='charoption', values={'U', 'L'}, default='U'}})
wrap("qr",
cname("qr"),
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index 51f568a..c8916d3 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -239,6 +239,7 @@ INSTALL(FILES
THCTensorInfo.cuh
THCTensorTypeUtils.cuh
THCTensorRandom.cuh
+ THCTensorMathMagma.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
INSTALL(FILES
@@ -264,6 +265,8 @@ INSTALL(FILES
generic/THCTensorMathCompare.cu
generic/THCTensorMathCompareT.h
generic/THCTensorMathCompareT.cu
+ generic/THCTensorMathMagma.h
+ generic/THCTensorMathMagma.cu
generic/THCTensorMathPairwise.h
generic/THCTensorMathPairwise.cu
generic/THCTensorMathPointwise.h
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index 7cbef32..19ae679 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -10,6 +10,9 @@
#include "generic/THCTensorMathBlas.h"
#include "THCGenerateAllTypes.h"
+#include "generic/THCTensorMathMagma.h"
+#include "THCGenerateAllTypes.h"
+
#include "generic/THCTensorMathPairwise.h"
#include "THCGenerateAllTypes.h"
@@ -40,20 +43,6 @@
#include "generic/THCTensorSort.h"
#include "THCGenerateAllTypes.h"
-// MAGMA (i.e. CUDA implementation of LAPACK functions)
-THC_API void THCudaTensor_gesv(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_);
-THC_API void THCudaTensor_gels(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_);
-THC_API void THCudaTensor_syev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobz, const char *uplo);
-THC_API void THCudaTensor_geev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobvr);
-THC_API void THCudaTensor_gesvd(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *a, const char *jobu);
-THC_API void THCudaTensor_gesvd2(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *ra_, THCudaTensor *a, const char *jobu);
-THC_API void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a);
-THC_API void THCudaTensor_potri(THCState *state, THCudaTensor *ra_, THCudaTensor *a);
-THC_API void THCudaTensor_potrf(THCState *state, THCudaTensor *ra_, THCudaTensor *a);
-THC_API void THCudaTensor_potrs(THCState *state, THCudaTensor *rb_, THCudaTensor *a, THCudaTensor *b);
-THC_API void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *rr_, THCudaTensor *a);
-
-
THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self);
THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self);
diff --git a/lib/THC/THCTensorMathMagma.cu b/lib/THC/THCTensorMathMagma.cu
index d599722..cac5d73 100644
--- a/lib/THC/THCTensorMathMagma.cu
+++ b/lib/THC/THCTensorMathMagma.cu
@@ -1,6 +1,7 @@
#include "THCGeneral.h"
#include "THCTensorMath.h"
#include "THCTensorCopy.h"
+#include "THCTensorMathMagma.cuh"
#include <algorithm>
#ifdef USE_MAGMA
@@ -22,564 +23,5 @@ void THCMagma_init(THCState *state)
#endif
}
-#ifdef USE_MAGMA
-static inline float* th_magma_smalloc_pinned(size_t n)
-{
- float* ptr;
- if (MAGMA_SUCCESS != magma_smalloc_pinned(&ptr, n))
- THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
- return ptr;
-}
-
-static inline int* th_magma_imalloc_pinned(size_t n)
-{
- int* ptr;
- if (MAGMA_SUCCESS != magma_imalloc_pinned(&ptr, n))
- THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
- return ptr;
-}
-
-static void THCudaTensor_copyArray1d(THCState *state, THCudaTensor *self, float *src, int k)
-{
- long size[1] = { k };
- long stride[1] = { 1 };
- THCudaTensor_rawResize(state, self, 1, size, stride);
- size_t len = k * sizeof(float);
- THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
-}
-
-static void THCudaTensor_copyArray2d(THCState *state, THCudaTensor *self, float *src, int m, int n)
-{
- long size[2] = { m, n };
- long stride[2] = { 1, m };
- THCudaTensor_rawResize(state, self, 2, size, stride);
- size_t len = m * n * sizeof(float);
- THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
-}
-
-static void THCudaTensor_copyTensor2d(THCState *state, float *dst, THCudaTensor *self)
-{
- THAssert(self->nDimension == 2);
- size_t len = THCudaTensor_nElement(state, self)*sizeof(float);
- THCudaTensor *temp = THCudaTensor_newTranspose(state, self, 0, 1);
- THCudaTensor *selfc = THCudaTensor_newContiguous(state, temp);
- THCudaCheck(cudaMemcpy(dst, selfc->storage->data + selfc->storageOffset, len, cudaMemcpyDeviceToHost));
- THCudaTensor_free(state, temp);
- THCudaTensor_free(state, selfc);
-}
-
-#endif
-
-static THCudaTensor* THCudaTensor_newColumnMajor(THCState *state, THCudaTensor *self, THCudaTensor *src)
-{
- THAssert(src->nDimension == 2);
- if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0])
- {
- THCudaTensor_retain(state, self);
- return self;
- }
-
- if (self == src)
- self = THCudaTensor_new(state);
- else
- THCudaTensor_retain(state, self);
-
- long size[2] = { src->size[0], src->size[1] };
- long stride[2] = { 1, src->size[0] };
-
- THCudaTensor_rawResize(state, self, 2, size, stride);
- THCudaTensor_copy(state, self, src);
- return self;
-}
-
-
-void THCudaTensor_gesv(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_)
-{
-#ifdef USE_MAGMA
- THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
- THArgCheck(b_->nDimension == 2, 2, "b should be 2 dimensional");
- THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square");
- THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible");
-
- int n = a_->size[0];
- int nrhs = b_->size[1];
-
- THCudaTensor *a = THCudaTensor_newColumnMajor(state, ra_, a_);
- THCudaTensor *b = THCudaTensor_newColumnMajor(state, rb_, b_);
- float *a_data = THCudaTensor_data(state, a);
- float *b_data = THCudaTensor_data(state, b);
-
- int *ipiv = th_magma_imalloc_pinned(n);
-
- int info;
- magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info);
-
- if (info < 0)
- THError("MAGMA gesv : Argument %d : illegal value", -info);
- else if (info > 0)
- THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info);
-
- magma_free_pinned(ipiv);
- THCudaTensor_freeCopyTo(state, a, ra_);
- THCudaTensor_freeCopyTo(state, b, rb_);
-#else
- THError(NoMagma(gesv));
-#endif
-}
-
-void THCudaTensor_gels(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_)
-{
-#ifdef USE_MAGMA
- THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
- THArgCheck(b_->nDimension == 2, 1, "b should be 2 dimensional");
- THArgCheck(a_->size[0] == b_->size[0], 2, "size incompatible A,b");
- THArgCheck(a_->size[0] >= a_->size[1], 2, "A should have m >= n");
-
- THCudaTensor *a = THCudaTensor_newColumnMajor(state, ra_, a_);
- THCudaTensor *b = THCudaTensor_newColumnMajor(state, rb_, b_);
- float *a_data = THCudaTensor_data(state, a);
- float *b_data = THCudaTensor_data(state, b);
-
- int m = a->size[0];
- int n = a->size[1];
- int nrhs = b->size[1];
- float wkopt;
-
- int info;
- magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
-
- float *hwork = th_magma_smalloc_pinned((size_t)wkopt);
- magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
- magma_free_pinned(hwork);
-
- if (info != 0)
- THError("MAGMA gels : Argument %d : illegal value", -info);
-
- THCudaTensor_freeCopyTo(state, a, ra_);
- THCudaTensor_freeCopyTo(state, b, rb_);
-#else
- THError(NoMagma(gels));
-#endif
-}
-
-void THCudaTensor_syev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a, const char *jobzs, const char *uplos)
-{
-#ifdef USE_MAGMA
- int n = a->size[0];
- int lda = n;
-
- magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower;
- magma_vec_t jobz = jobzs[0] == 'N' ? MagmaNoVec : MagmaVec;
-
- THCudaTensor *input = THCudaTensor_newColumnMajor(state, rv_, a);
- float *input_data = THCudaTensor_data(state, input);
-
- // eigen values and workspace
- float *w = th_magma_smalloc_pinned(n);
- float *wA = th_magma_smalloc_pinned(lda);
-
- // compute optimal size of work array
- int info;
- float lwork;
- int liwork;
- magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info);
-
- float *work = th_magma_smalloc_pinned((size_t)lwork);
- int *iwork = th_magma_imalloc_pinned(liwork);
-
- // compute eigenvalues and, optionally, eigenvectors
- magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info);
-
- // copy eigen values from w to re_
- if (info == 0)
- THCudaTensor_copyArray1d(state, re_, w, n);
-
- magma_free_pinned(iwork);
- magma_free_pinned(work);
- magma_free_pinned(wA);
- magma_free_pinned(w);
-
- // check error value
- if (info > 0)
- THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
- else if (info < 0)
- THError("MAGMA syev : Argument %d : illegal value", -info);
-
- THCudaTensor_freeCopyTo(state, input, rv_);
-#else
- THError(NoMagma(syev));
-#endif
-}
-
-void THCudaTensor_geev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobvrs)
-{
-#ifdef USE_MAGMA
- THArgCheck(a_->nDimension == 2, 3, "A should be 2 dimensional");
- THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square");
-
- magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec;
- int n = a_->size[0];
-
- float *a_data = th_magma_smalloc_pinned(n * n);
- THCudaTensor_copyTensor2d(state, a_data, a_);
-
- float *wr = th_magma_smalloc_pinned(n);
- float *wi = th_magma_smalloc_pinned(n);
-
- float *vr_data = NULL;
- int ldvr = 1;
- if (jobvr == MagmaVec)
- {
- vr_data = th_magma_smalloc_pinned(n * n);
- ldvr = n;
- }
-
- float wkopt;
- int info;
-
- magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info);
-
- int lwork = (int) wkopt;
- float *work_data = th_magma_smalloc_pinned(lwork);
-
- magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info);
-
- if (info > 0)
- THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
- else if (info < 0)
- THError("MAGMA geev : Argument %d : illegal value", -info);
-
- {
- THCudaTensor_resize2d(state, re_, 2, n);
- THCudaTensor *re = THCudaTensor_newContiguous(state, re_);
- THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset, wr, n*sizeof(float), cudaMemcpyHostToDevice));
- THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset + n, wi, n*sizeof(float), cudaMemcpyHostToDevice));
- THCudaTensor_freeCopyTo(state, re, re_);
- THCudaTensor_transpose(state, re_, NULL, 0, 1);
- }
-
- if (jobvr == MagmaVec)
- THCudaTensor_copyArray2d(state, rv_, vr_data, n, n);
-
- magma_free_pinned(work_data);
- magma_free_pinned(vr_data);
- magma_free_pinned(wi);
- magma_free_pinned(wr);
- magma_free_pinned(a_data);
-
-#else
- THError(NoMagma(geev));
-#endif
-}
-
-void THCudaTensor_gesvd(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *a, const char *jobu)
-{
-#ifdef USE_MAGMA
- THCudaTensor *ra_ = THCudaTensor_new(state);
- THCudaTensor_gesvd2(state, ru_, rs_, rv_, ra_, a, jobu);
- THCudaTensor_free(state, ra_);
-#else
- THError(NoMagma(gesvd));
-#endif
-}
-
-void THCudaTensor_gesvd2(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *ra_, THCudaTensor *a, const char *jobus)
-{
-#ifdef USE_MAGMA
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
-
- magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
- magma_vec_t jobvt = jobu;
-
- int m = a->size[0];
- int n = a->size[1];
- int k = m < n ? m : n;
- int j = (jobu == MagmaAllVec) ? m : k;
-
- float *a_data = th_magma_smalloc_pinned(m * n);
- THCudaTensor_copyTensor2d(state, a_data, a);
-
- float *rs_data = th_magma_smalloc_pinned(k);
- float *ru_data = th_magma_smalloc_pinned(m * j);
- float *rv_data = th_magma_smalloc_pinned(n * n);
-
- float wkopt;
- int info;
- magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
-
- int lwork = (int) wkopt;
- float *work_data = th_magma_smalloc_pinned(lwork);
-
- magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
-
- if (info > 0)
- THError("MAGMA gesvd : %d superdiagonals failed to converge", info);
- else if (info < 0)
- THError("MAGMA gesvd : Argument %d : illegal value", -info);
-
- THCudaTensor_copyArray2d(state, rv_, rv_data, n, n);
- THCudaTensor_transpose(state, rv_, NULL, 0, 1);
- THCudaTensor_copyArray2d(state, ru_, ru_data, m, j);
- THCudaTensor_copyArray1d(state, rs_, rs_data, k);
- THCudaTensor_copyArray2d(state, ra_, a_data, m, n);
-
- magma_free_pinned(work_data);
- magma_free_pinned(rv_data);
- magma_free_pinned(ru_data);
- magma_free_pinned(rs_data);
- magma_free_pinned(a_data);
-#else
- THError(NoMagma(gesvd2));
-#endif
-}
-
-void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
-{
-#ifdef USE_MAGMA
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
-
- int info;
- int n = a->size[0];
- int lwork = n * magma_get_sgetri_nb(n);
-
- THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
- float *input_data = THCudaTensor_data(state, input);
-
- int *ipiv = th_magma_imalloc_pinned(n);
-
- THCudaTensor *work = THCudaTensor_newWithSize1d(state, lwork);
- float *work_data = THCudaTensor_data(state, work);
-
- // Run LU
- magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info);
- if (info > 0)
- THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info);
- else if (info < 0)
- THError("MAGMA getrf : Argument %d : illegal value", -info);
-
- // Inverse
- magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info);
- if (info > 0)
- THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info);
- else if (info < 0)
- THError("MAGMA getri : Argument %d : illegal value", -info);
-
- THCudaTensor_free(state, work);
- magma_free_pinned(ipiv);
- THCudaTensor_freeCopyTo(state, input, ra_);
-#else
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
-
- int n = a->size[0];
-
- // input
- THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
- // output
- THCudaTensor *output = THCudaTensor_newColumnMajor(state, ra_, a);
-
- size_t matrices_size = sizeof(float*);
-
- float **matrices1 = (float **)THAlloc(matrices_size);
- const float **matrices1_const = (const float **)THAlloc(matrices_size);
- float **matrices2 = (float **)THAlloc(matrices_size);
- matrices1[0] = THCudaTensor_data(state, input);
- matrices1_const[0] = THCudaTensor_data(state, input);
- matrices2[0] = THCudaTensor_data(state, output);
-
- // Copy pointers to device.
- float **d_matrices1, **d_matrices2;
- const float **d_matrices1_const;
- THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
- THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1_const, matrices_size));
- THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
-
- THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
- cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
- THCudaCheck(cudaMemcpyAsync(d_matrices1_const, matrices1_const, matrices_size,
- cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
- THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
- cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
- int info;
- int *info_gpu;
- THCudaCheck(THCudaMalloc(state, (void**)&info_gpu, sizeof(int)));
-
- int *ipiv_gpu;
- THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int)));
-
- // Run LU
- THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
-
- THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost));
-
- if (info > 0)
- THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info);
- else if (info < 0)
- THError("CUBLAS getrf : Argument %d : illegal value", -info);
-
- // Inverse
- THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
- if (info > 0)
- THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info);
- else if (info < 0)
- THError("CUBLAS getri : Argument %d : illegal value", -info);
-
- THCudaCheck(THCudaFree(state, ipiv_gpu));
- THCudaCheck(THCudaFree(state, info_gpu));
- THCudaTensor_freeCopyTo(state, output, input);
-#endif
-
-}
-
-
-__global__ void THCudaTensor_copyUpperSymmetric(float *input, int n, int len)
-{
- for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) {
- const int r = idx % n;
- const int c = idx / n;
- if (r > c) {
- input[idx] = input[r*n + c];
- }
- }
-}
-
-void THCudaTensor_potri(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
-{
-#ifdef USE_MAGMA
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
-
- int n = a->size[0];
-
- THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
- float *input_data = THCudaTensor_data(state, input);
-
- int info;
- magma_spotrf_gpu(MagmaUpper, n, input_data, n, &info);
- if (info > 0)
- THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info);
- else if (info < 0)
- THError("MAGMA potrf : Argument %d : illegal value", -info);
-
- magma_spotri_gpu(MagmaUpper, n, input_data, n, &info);
- if (info > 0)
- THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info);
- else if (info < 0)
- THError("MAGMA potri : Argument %d : illegal value", -info);
-
- cudaStream_t stream = THCState_getCurrentStream(state);
- const int len = n*n;
- dim3 blocks(std::min(DIVUP(len, 128), 65535));
- dim3 threads(128);
- THCudaTensor_copyUpperSymmetric<<<blocks, threads, 0, stream>>>(input_data, n, len);
-
- THCudaTensor_freeCopyTo(state, input, ra_);
-#else
- THError(NoMagma(potri));
-#endif
-}
-
-void THCudaTensor_potrf(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
-{
-#ifdef USE_MAGMA
- THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
-
- int n = a->size[0];
-
- THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
- float *input_data = THCudaTensor_data(state, input);
-
- int info;
- magma_spotrf_gpu(MagmaUpper, n, input_data, n, &info);
-
- // check error value
- if (info > 0)
- THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info);
- else if (info < 0)
- THError("MAGMA potrf : Argument %d : illegal value", -info);
-
- THCudaTensor_triu(state, ra_, input, 0);
- THCudaTensor_free(state, input);
-#else
- THError(NoMagma(potrf));
-#endif
-}
-
-void THCudaTensor_potrs(THCState *state, THCudaTensor *rb_, THCudaTensor *b, THCudaTensor *a)
-{
-#ifdef USE_MAGMA
- THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
-
- int n = a->size[0];
- int nrhs = b->size[1];
-
- THCudaTensor *b_ = THCudaTensor_newColumnMajor(state, rb_, b);
- float *b_data = THCudaTensor_data(state, b_);
- THCudaTensor *a_ = THCudaTensor_newColumnMajor(state, a, a);
- float *a_data = THCudaTensor_data(state, a_);
-
- int info;
- magma_spotrs_gpu(MagmaUpper, n, nrhs, a_data, n, b_data, n, &info);
-
- // check error value
- if (info < 0)
- THError("MAGMA potrs : Argument %d : illegal value", -info);
-
- THCudaTensor_freeCopyTo(state, b_, rb_);
- THCudaTensor_free(state, a_);
-#else
- THError(NoMagma(potrs));
-#endif
-}
-
-void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *rr_, THCudaTensor *a_)
-{
-#ifdef USE_MAGMA
- THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
-
- THCudaTensor *a = THCudaTensor_newColumnMajor(state, rr_, a_);
- int m = a->size[0];
- int n = a->size[1];
- int k = (m < n ? m : n);
-
-#ifdef MAGMA_V2
- int nb = magma_get_sgeqrf_nb(m, n);
-#else
- int nb = magma_get_sgeqrf_nb(m);
-#endif
-
- float *a_data = THCudaTensor_data(state, a);
- float *tau_data = th_magma_smalloc_pinned(n*n);
-
- THCudaTensor *work = THCudaTensor_newWithSize1d(state, (2*k + ((n+31)/32)*32)*nb);
- float *work_data = THCudaTensor_data(state, work);
-
- int info;
- magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
-
- if (info != 0)
- THError("MAGMA geqrf : Argument %d : illegal value.", -info);
-
- THCudaTensor *q = THCudaTensor_newColumnMajor(state, rq_, a);
- float *q_data = THCudaTensor_data(state, q);
-
- THCudaTensor_narrow(state, a, a, 0, 0, k);
- THCudaTensor_triu(state, rr_, a, 0);
- THCudaTensor_free(state, a);
-
- magma_sorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info);
-
- if (info != 0)
- THError("MAGMA orgqr : Argument %d : illegal value.", -info);
-
- THCudaTensor_free(state, work);
- magma_free_pinned(tau_data);
-
- THCudaTensor_narrow(state, q, q, 1, 0, k);
- THCudaTensor_freeCopyTo(state, q, rq_);
-#else
- THError(NoMagma(qr));
-#endif
-}
+#include "generic/THCTensorMathMagma.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathMagma.cuh b/lib/THC/THCTensorMathMagma.cuh
new file mode 100644
index 0000000..6495049
--- /dev/null
+++ b/lib/THC/THCTensorMathMagma.cuh
@@ -0,0 +1,22 @@
+#ifndef THC_TENSOR_MATH_MAGMA_CUH
+#define THC_TENSOR_MATH_MAGMA_CUH
+
+#ifdef USE_MAGMA
+#include <magma.h>
+#else
+#include "THCBlas.h"
+#endif
+
+#ifdef USE_MAGMA
+template <typename T>
+static inline T* th_magma_malloc_pinned(size_t n)
+{
+ void* ptr;
+ if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T)))
+ THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
+ return reinterpret_cast<T*>(ptr);
+}
+
+#endif
+
+#endif // THC_TENSOR_MATH_MAGMA_CUH
diff --git a/lib/THC/generic/THCTensorMathMagma.cu b/lib/THC/generic/THCTensorMathMagma.cu
new file mode 100644
index 0000000..d302b13
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathMagma.cu
@@ -0,0 +1,650 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathMagma.cu"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+#ifdef USE_MAGMA
+
+static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int k)
+{
+ long size[1] = { k };
+ long stride[1] = { 1 };
+ THCTensor_(rawResize)(state, self, 1, size, stride);
+ size_t len = k * sizeof(real);
+ THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
+}
+
+static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n)
+{
+ long size[2] = { m, n };
+ long stride[2] = { 1, m };
+ THCTensor_(rawResize)(state, self, 2, size, stride);
+ size_t len = m * n * sizeof(real);
+ THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
+}
+
+static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self)
+{
+ THAssert(self->nDimension == 2);
+ size_t len = THCTensor_(nElement)(state, self)*sizeof(real);
+ THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1);
+ THCTensor *selfc = THCTensor_(newContiguous)(state, temp);
+ THCudaCheck(cudaMemcpy(dst, selfc->storage->data + selfc->storageOffset, len, cudaMemcpyDeviceToHost));
+ THCTensor_(free)(state, temp);
+ THCTensor_(free)(state, selfc);
+}
+
+#endif // USE_MAGMA
+
+static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src)
+{
+ THAssert(src->nDimension == 2);
+ if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0])
+ {
+ THCTensor_(retain)(state, self);
+ return self;
+ }
+
+ if (self == src)
+ self = THCTensor_(new)(state);
+ else
+ THCTensor_(retain)(state, self);
+
+ long size[2] = { src->size[0], src->size[1] };
+ long stride[2] = { 1, src->size[0] };
+
+ THCTensor_(rawResize)(state, self, 2, size, stride);
+ THCTensor_(copy)(state, self, src);
+ return self;
+}
+
+
+THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
+ THArgCheck(b_->nDimension == 2, 2, "b should be 2 dimensional");
+ THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square");
+ THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible");
+
+ int n = a_->size[0];
+ int nrhs = b_->size[1];
+
+ THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_);
+ THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_);
+ real *a_data = THCTensor_(data)(state, a);
+ real *b_data = THCTensor_(data)(state, b);
+
+ int *ipiv = th_magma_malloc_pinned<int>(n);
+
+ int info;
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info);
+#else
+ magma_dgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info);
+#endif
+
+ if (info < 0)
+ THError("MAGMA gesv : Argument %d : illegal value", -info);
+ else if (info > 0)
+ THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info);
+
+ magma_free_pinned(ipiv);
+ THCTensor_(freeCopyTo)(state, a, ra_);
+ THCTensor_(freeCopyTo)(state, b, rb_);
+#else
+ THError(NoMagma(gesv));
+#endif
+}
+
+THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
+ THArgCheck(b_->nDimension == 2, 1, "b should be 2 dimensional");
+ THArgCheck(a_->size[0] == b_->size[0], 2, "size incompatible A,b");
+ THArgCheck(a_->size[0] >= a_->size[1], 2, "A should have m >= n");
+
+ THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_);
+ THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_);
+ real *a_data = THCTensor_(data)(state, a);
+ real *b_data = THCTensor_(data)(state, b);
+
+ int m = a->size[0];
+ int n = a->size[1];
+ int nrhs = b->size[1];
+ real wkopt;
+
+ int info;
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
+#else
+ magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
+#endif
+
+ real *hwork = th_magma_malloc_pinned<real>((size_t)wkopt);
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
+#else
+ magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
+#endif
+
+ magma_free_pinned(hwork);
+
+ if (info != 0)
+ THError("MAGMA gels : Argument %d : illegal value", -info);
+
+ THCTensor_(freeCopyTo)(state, a, ra_);
+ THCTensor_(freeCopyTo)(state, b, rb_);
+#else
+ THError(NoMagma(gels));
+#endif
+}
+
+THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos)
+{
+#ifdef USE_MAGMA
+ int n = a->size[0];
+ int lda = n;
+
+ magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower;
+ magma_vec_t jobz = jobzs[0] == 'N' ? MagmaNoVec : MagmaVec;
+
+ THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a);
+ real *input_data = THCTensor_(data)(state, input);
+
+ // eigen values and workspace
+ real *w = th_magma_malloc_pinned<real>(n);
+ real *wA = th_magma_malloc_pinned<real>(lda);
+
+ // compute optimal size of work array
+ int info;
+ real lwork;
+ int liwork;
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info);
+#else
+ magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info);
+#endif
+
+ real *work = th_magma_malloc_pinned<real>((size_t)lwork);
+ int *iwork = th_magma_malloc_pinned<int>(liwork);
+
+ // compute eigenvalues and, optionally, eigenvectors
+#if defined(THC_REAL_IS_FLOAT)
+ magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info);
+#else
+ magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info);
+#endif
+
+ // copy eigen values from w to re_
+ if (info == 0)
+ THCTensor_(copyArray1d)(state, re_, w, n);
+
+ magma_free_pinned(iwork);
+ magma_free_pinned(work);
+ magma_free_pinned(wA);
+ magma_free_pinned(w);
+
+ // check error value
+ if (info > 0)
+ THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
+ else if (info < 0)
+ THError("MAGMA syev : Argument %d : illegal value", -info);
+
+ THCTensor_(freeCopyTo)(state, input, rv_);
+#else
+ THError(NoMagma(syev));
+#endif
+}
+
+THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a_->nDimension == 2, 3, "A should be 2 dimensional");
+ THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square");
+
+ magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec;
+ int n = a_->size[0];
+
+ real *a_data = th_magma_malloc_pinned<real>(n * n);
+ THCTensor_(copyTensor2d)(state, a_data, a_);
+
+ real *wr = th_magma_malloc_pinned<real>(n);
+ real *wi = th_magma_malloc_pinned<real>(n);
+
+ real *vr_data = NULL;
+ int ldvr = 1;
+ if (jobvr == MagmaVec)
+ {
+ vr_data = th_magma_malloc_pinned<real>(n * n);
+ ldvr = n;
+ }
+
+ real wkopt;
+ int info;
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info);
+#else
+ magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info);
+#endif
+
+ int lwork = (int) wkopt;
+ real *work_data = th_magma_malloc_pinned<real>(lwork);
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info);
+#else
+ magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info);
+#endif
+
+ if (info > 0)
+ THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
+ else if (info < 0)
+ THError("MAGMA geev : Argument %d : illegal value", -info);
+
+ {
+ THCTensor_(resize2d)(state, re_, 2, n);
+ THCTensor *re = THCTensor_(newContiguous)(state, re_);
+ THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice));
+ THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice));
+ THCTensor_(freeCopyTo)(state, re, re_);
+ THCTensor_(transpose)(state, re_, NULL, 0, 1);
+ }
+
+ if (jobvr == MagmaVec)
+ THCTensor_(copyArray2d)(state, rv_, vr_data, n, n);
+
+ magma_free_pinned(work_data);
+ magma_free_pinned(vr_data);
+ magma_free_pinned(wi);
+ magma_free_pinned(wr);
+ magma_free_pinned(a_data);
+
+#else
+ THError(NoMagma(geev));
+#endif
+}
+
+THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu)
+{
+#ifdef USE_MAGMA
+ THCTensor *ra_ = THCTensor_(new)(state);
+ THCTensor_(gesvd2)(state, ru_, rs_, rv_, ra_, a, jobu);
+ THCTensor_(free)(state, ra_);
+#else
+ THError(NoMagma(gesvd));
+#endif
+}
+
+THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+
+ magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
+ magma_vec_t jobvt = jobu;
+
+ int m = a->size[0];
+ int n = a->size[1];
+ int k = m < n ? m : n;
+ int j = (jobu == MagmaAllVec) ? m : k;
+
+ real *a_data = th_magma_malloc_pinned<real>(m * n);
+ THCTensor_(copyTensor2d)(state, a_data, a);
+
+ real *rs_data = th_magma_malloc_pinned<real>(k);
+ real *ru_data = th_magma_malloc_pinned<real>(m * j);
+ real *rv_data = th_magma_malloc_pinned<real>(n * n);
+
+ real wkopt;
+ int info;
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+#else
+ magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
+#endif
+
+ int lwork = (int) wkopt;
+ real *work_data = th_magma_malloc_pinned<real>(lwork);
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+#else
+ magma_dgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
+#endif
+
+ if (info > 0)
+ THError("MAGMA gesvd : %d superdiagonals failed to converge", info);
+ else if (info < 0)
+ THError("MAGMA gesvd : Argument %d : illegal value", -info);
+
+ THCTensor_(copyArray2d)(state, rv_, rv_data, n, n);
+ THCTensor_(transpose)(state, rv_, NULL, 0, 1);
+ THCTensor_(copyArray2d)(state, ru_, ru_data, m, j);
+ THCTensor_(copyArray1d)(state, rs_, rs_data, k);
+ THCTensor_(copyArray2d)(state, ra_, a_data, m, n);
+
+ magma_free_pinned(work_data);
+ magma_free_pinned(rv_data);
+ magma_free_pinned(ru_data);
+ magma_free_pinned(rs_data);
+ magma_free_pinned(a_data);
+#else
+ THError(NoMagma(gesvd2));
+#endif
+}
+
+THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+ int info;
+ int n = a->size[0];
+ int lwork = n * magma_get_sgetri_nb(n);
+
+ THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+ real *input_data = THCTensor_(data)(state, input);
+
+ int *ipiv = th_magma_malloc_pinned<int>(n);
+
+ THCTensor *work = THCTensor_(newWithSize1d)(state, lwork);
+ real *work_data = THCTensor_(data)(state, work);
+
+ // Run LU
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info);
+#else
+ magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info);
+#endif
+
+ if (info > 0)
+ THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info);
+ else if (info < 0)
+ THError("MAGMA getrf : Argument %d : illegal value", -info);
+
+ // Inverse
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info);
+#else
+ magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info);
+#endif
+
+ if (info > 0)
+ THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info);
+ else if (info < 0)
+ THError("MAGMA getri : Argument %d : illegal value", -info);
+
+ THCTensor_(free)(state, work);
+ magma_free_pinned(ipiv);
+ THCTensor_(freeCopyTo)(state, input, ra_);
+#else
+ THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+ int n = a->size[0];
+
+ // input
+ THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+ // output
+ THCTensor *output = THCTensor_(newColumnMajor)(state, ra_, a);
+
+ size_t matrices_size = sizeof(real*);
+
+ real **matrices1 = (real **)THAlloc(matrices_size);
+ const real **matrices1_const = (const real **)THAlloc(matrices_size);
+ real **matrices2 = (real **)THAlloc(matrices_size);
+ matrices1[0] = THCTensor_(data)(state, input);
+ matrices1_const[0] = THCTensor_(data)(state, input);
+ matrices2[0] = THCTensor_(data)(state, output);
+
+ // Copy pointers to device.
+ real **d_matrices1, **d_matrices2;
+ const real **d_matrices1_const;
+ THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
+ THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1_const, matrices_size));
+ THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
+
+ THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
+ cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+ THCudaCheck(cudaMemcpyAsync(d_matrices1_const, matrices1_const, matrices_size,
+ cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+ THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
+ cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+ int info;
+ int *info_gpu;
+ THCudaCheck(THCudaMalloc(state, (void**)&info_gpu, sizeof(int)));
+
+ int *ipiv_gpu;
+ THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int)));
+
+ // Run LU
+#if defined(THC_REAL_IS_FLOAT)
+ THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
+#else
+ THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
+#endif
+
+ THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost));
+
+ if (info > 0)
+ THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info);
+ else if (info < 0)
+ THError("CUBLAS getrf : Argument %d : illegal value", -info);
+
+ // Inverse
+#if defined(THC_REAL_IS_FLOAT)
+ THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
+#else
+ THCudaBlas_Dgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
+#endif
+
+ if (info > 0)
+ THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info);
+ else if (info < 0)
+ THError("CUBLAS getri : Argument %d : illegal value", -info);
+
+ THCudaCheck(THCudaFree(state, ipiv_gpu));
+ THCudaCheck(THCudaFree(state, info_gpu));
+ THCTensor_(freeCopyTo)(state, output, input);
+#endif
+}
+
+__global__ void THCTensor_(copyUpperSymmetric)(real *input, int n, int len)
+{
+ for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) {
+ const int r = idx % n;
+ const int c = idx / n;
+ if (r > c) {
+ input[idx] = input[r*n + c];
+ }
+ }
+}
+
+__global__ void THCTensor_(copyLowerSymmetric)(real *input, int n, int len)
+{
+ for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) {
+ const int r = idx % n;
+ const int c = idx / n;
+ if (r < c) {
+ input[idx] = input[r*n + c];
+ }
+ }
+}
+
+THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+ int n = a->size[0];
+ magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower;
+
+ THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+ real *input_data = THCTensor_(data)(state, input);
+
+ int info;
+#if defined(THC_REAL_IS_FLOAT)
+ magma_spotri_gpu(ul, n, input_data, n, &info);
+#else
+ magma_dpotri_gpu(ul, n, input_data, n, &info);
+#endif
+
+ if (info > 0)
+ THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info);
+ else if (info < 0)
+ THError("MAGMA potri : Argument %d : illegal value", -info);
+
+ cudaStream_t stream = THCState_getCurrentStream(state);
+ const int len = n*n;
+ dim3 blocks(std::min(DIVUP(len, 128), 65535));
+ dim3 threads(128);
+ if (uplo[0] == 'U') {
+ THCTensor_(copyUpperSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len);
+ } else {
+ THCTensor_(copyLowerSymmetric)<<<blocks, threads, 0, stream>>>(input_data, n, len);
+ }
+
+ THCTensor_(freeCopyTo)(state, input, ra_);
+#else
+ THError(NoMagma(potri));
+#endif
+}
+
+THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+ int n = a->size[0];
+ magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower;
+
+ THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a);
+ real *input_data = THCTensor_(data)(state, input);
+
+ int info;
+#if defined(THC_REAL_IS_FLOAT)
+ magma_spotrf_gpu(ul, n, input_data, n, &info);
+#else
+ magma_dpotrf_gpu(ul, n, input_data, n, &info);
+#endif
+
+ // check error value
+ if (info > 0)
+ THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info);
+ else if (info < 0)
+ THError("MAGMA potrf : Argument %d : illegal value", -info);
+
+ if (uplo[0] == 'U') {
+ THCTensor_(triu)(state, ra_, input, 0);
+ } else {
+ THCTensor_(tril)(state, ra_, input, 0);
+ }
+ THCTensor_(free)(state, input);
+#else
+ THError(NoMagma(potrf));
+#endif
+}
+
+THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+
+ int n = a->size[0];
+ int nrhs = b->size[1];
+ magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower;
+
+ THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b);
+ real *b_data = THCTensor_(data)(state, b_);
+ THCTensor *a_ = THCTensor_(newColumnMajor)(state, a, a);
+ real *a_data = THCTensor_(data)(state, a_);
+
+ int info;
+#if defined(THC_REAL_IS_FLOAT)
+ magma_spotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info);
+#else
+ magma_dpotrs_gpu(ul, n, nrhs, a_data, n, b_data, n, &info);
+#endif
+
+ // check error value
+ if (info < 0)
+ THError("MAGMA potrs : Argument %d : illegal value", -info);
+
+ THCTensor_(freeCopyTo)(state, b_, rb_);
+ THCTensor_(free)(state, a_);
+#else
+ THError(NoMagma(potrs));
+#endif
+}
+
+THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_)
+{
+#ifdef USE_MAGMA
+ THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
+
+ THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_);
+ int m = a->size[0];
+ int n = a->size[1];
+ int k = (m < n ? m : n);
+
+#ifdef MAGMA_V2
+ int nb = magma_get_sgeqrf_nb(m, n);
+#else
+ int nb = magma_get_sgeqrf_nb(m);
+#endif
+
+ real *a_data = THCTensor_(data)(state, a);
+ real *tau_data = th_magma_malloc_pinned<real>(n*n);
+
+ THCTensor *work = THCTensor_(newWithSize1d)(state, (2*k + ((n+31)/32)*32)*nb);
+ real *work_data = THCTensor_(data)(state, work);
+
+ int info;
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
+#else
+ magma_dgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
+#endif
+
+ if (info != 0)
+ THError("MAGMA geqrf : Argument %d : illegal value.", -info);
+
+ THCTensor *q = THCTensor_(newColumnMajor)(state, rq_, a);
+ real *q_data = THCTensor_(data)(state, q);
+
+ THCTensor_(narrow)(state, a, a, 0, 0, k);
+ THCTensor_(triu)(state, rr_, a, 0);
+ THCTensor_(free)(state, a);
+
+#if defined(THC_REAL_IS_FLOAT)
+ magma_sorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info);
+#else
+ magma_dorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info);
+#endif
+
+ if (info != 0)
+ THError("MAGMA orgqr : Argument %d : illegal value.", -info);
+
+ THCTensor_(free)(state, work);
+ magma_free_pinned(tau_data);
+
+ THCTensor_(narrow)(state, q, q, 1, 0, k);
+ THCTensor_(freeCopyTo)(state, q, rq_);
+#else
+ THError(NoMagma(qr));
+#endif
+}
+
+#endif
+
+#endif
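With this file compiled once per real type through THCGenerateAllTypes.h, the magma_d* and THCudaBlas_D* branches above become reachable from Lua on torch.CudaDoubleTensor. A small sketch exercising two of the wrapped routines, assuming a MAGMA-enabled build:

    local A = torch.randn(6, 4):type('torch.CudaDoubleTensor')  -- m >= n, as gels requires
    local B = torch.randn(6, 2):type('torch.CudaDoubleTensor')

    local xb, xa = torch.gels(B, A)      -- least-squares solve in double precision on the GPU
    local q, r   = torch.qr(A)           -- QR factorization, same backend

    print((q * r - A):abs():max())       -- reconstruction error should be tiny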
diff --git a/lib/THC/generic/THCTensorMathMagma.h b/lib/THC/generic/THCTensorMathMagma.h
new file mode 100644
index 0000000..938daea
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathMagma.h
@@ -0,0 +1,23 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathMagma.h"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+// MAGMA (i.e. CUDA implementation of LAPACK functions)
+THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_);
+THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_);
+THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobz, const char *uplo);
+THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvr);
+THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu);
+THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobu);
+THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a);
+THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo);
+THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo);
+THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *a, THCTensor *b, const char *uplo);
+THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a);
+
+
+#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+#endif
diff --git a/test/test.lua b/test/test.lua
index 33147b1..c508d5d 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -2325,19 +2325,26 @@ end
function test.inverse()
local a = torch.eye(5):add(torch.Tensor(5, 5):uniform(-0.1, 0.1))
- local i1 = torch.inverse(a)
- local i2 = torch.inverse(a:cuda())
- tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong inverse answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = a:type(typename)
+ local i1 = torch.inverse(at)
+ local i2 = torch.inverse(a:cuda())
+ tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong inverse answer")
+ end
end
if cutorch.magma then
function test.gesv()
local a = torch.Tensor(5, 5):uniform(-1, 1)
local b = torch.Tensor(5, 3):uniform(-1, 1)
- local rb1, ra1 = torch.gesv(b, a)
- local rb2, ra2 = torch.gesv(b:cuda(), a:cuda())
- tester:assertle((rb2 - rb1:cuda()):abs():max(), 1e-5, "wrong gesv answer")
- tester:assertle((ra2 - ra1:cuda()):abs():max(), 1e-5, "wrong gesv answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = a:type(typename)
+ local bt = b:type(typename)
+ local rb1, ra1 = torch.gesv(bt, at)
+ local rb2, ra2 = torch.gesv(bt:cuda(), at:cuda())
+ tester:assertle((rb2 - rb1:cuda()):abs():max(), 1e-5, "wrong gesv answer")
+ tester:assertle((ra2 - ra1:cuda()):abs():max(), 1e-5, "wrong gesv answer")
+ end
end
function test.gels()
@@ -2355,10 +2362,14 @@ if cutorch.magma then
{ 0.5360, 0.2048, 0.2745},
{ 0.8535,-0.3938,-0.2140},
}
- local rb1, ra1 = torch.gels(b, a)
- local rb2, ra2 = torch.gels(b:cuda(), a:cuda())
- tester:assertle((rb2 - rb1:cuda()):abs():max(), 5e-4, "wrong gels answer")
- tester:assertle((ra2 - ra1:cuda()):abs():max(), 5e-4, "wrong gels answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = a:type(typename)
+ local bt = b:type(typename)
+ local rb1, ra1 = torch.gels(bt, at)
+ local rb2, ra2 = torch.gels(bt:cuda(), at:cuda())
+ tester:assertle((rb2 - rb1:cuda()):abs():max(), 5e-4, "wrong gels answer")
+ tester:assertle((ra2 - ra1:cuda()):abs():max(), 5e-4, "wrong gels answer")
+ end
end
function test.symeig()
@@ -2367,10 +2378,13 @@ if cutorch.magma then
{-0.47, -6.39, 4.17, 0.00, 0.00},
{-7.20, 1.50, -1.51, 5.70, 0.00},
{-0.65, -6.34, 2.67, 1.80, -7.10}}):t()
- local e1,v1 = torch.symeig(a, 'V')
- local e2,v2 = torch.symeig(a:cuda(), 'V')
- tester:assertle((e2 - e1:cuda()):abs():max(), 1e-5, "wrong symeig answer")
- tester:assertle((v2 - v1:cuda()):abs():max(), 1e-5, "wrong symeig answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = a:type(typename)
+ local e1,v1 = torch.symeig(at, 'V')
+ local e2,v2 = torch.symeig(at:cuda(), 'V')
+ tester:assertle((e2 - e1:cuda()):abs():max(), 1e-5, "wrong symeig answer")
+ tester:assertle((v2 - v1:cuda()):abs():max(), 1e-5, "wrong symeig answer")
+ end
end
function test.eig()
@@ -2381,10 +2395,13 @@ if cutorch.magma then
{ 0.5766, -0.6743, 0.6903, 0.3646, -0.4571},
{-0.8956, -0.4074, -0.7583, 0.1838, -0.0091},
}
- local e1,v1 = torch.eig(a, 'V')
- local e2,v2 = torch.eig(a:cuda(), 'V')
- tester:assertle((e2 - e1:cuda()):abs():max(), 1e-6, "wrong eig answer")
- tester:assertle((v2:abs() - v1:abs():cuda()):abs():max(), 1e-6, "wrong eig answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = a:type(typename)
+ local e1,v1 = torch.eig(at, 'V')
+ local e2,v2 = torch.eig(at:cuda(), 'V')
+ tester:assertle((e2 - e1:cuda()):abs():max(), 1e-6, "wrong eig answer")
+ tester:assertle((v2:abs() - v1:abs():cuda()):abs():max(), 1e-6, "wrong eig answer")
+ end
end
function test.svd()
@@ -2395,17 +2412,20 @@ if cutorch.magma then
{5.45, -0.27, 4.85, 0.74, 10.00, -6.02},
{3.16, 7.98, 3.01, 5.80, 4.27, -5.31}}
- local u,s,v = torch.svd(a, 'A')
+ for _, typename in ipairs({'torch.CudaDoubleTensor', 'torch.CudaTensor'}) do
+ local at = a:type(typename)
+ local u,s,v = torch.svd(a, 'A')
- local temp = torch.Tensor(a:size(2)):zero()
- temp:narrow(1, 1, a:size(1)):copy(s)
- local sigma = torch.diag(temp):resize(a:size(1), a:size(2)):cuda()
+ local temp = torch.Tensor(a:size(2)):zero()
+ temp:narrow(1, 1, a:size(1)):copy(s)
+ local sigma = torch.diag(temp):resize(a:size(1), a:size(2)):cuda()
- local m = u * sigma * v:t()
+ local m = u * sigma * v:t()
- tester:assertle((m - a):abs():max(), 1e-5, "svd: a != u * s * vT")
- tester:assertle((u*u:t() - torch.eye(a:size(1)):cuda()):abs():max(), 1e-6, "svd: u should be unitary")
- tester:assertle((v*v:t() - torch.eye(a:size(2)):cuda()):abs():max(), 1e-6, "svd: v should be unitary")
+ tester:assertle((m - a):abs():max(), 1e-5, "svd: a != u * s * vT")
+ tester:assertle((u*u:t() - torch.eye(a:size(1)):cuda()):abs():max(), 1e-6, "svd: u should be unitary")
+ tester:assertle((v*v:t() - torch.eye(a:size(2)):cuda()):abs():max(), 1e-6, "svd: v should be unitary")
+ end
end
@@ -2419,11 +2439,18 @@ if cutorch.magma then
}
A = A * A:t()
- local i1 = torch.potri(A)
- local i2 = torch.potri(A:cuda())
- local M = A:cuda() * i2
- tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potri answer")
- tester:assertle((M - torch.eye(A:size(1)):cuda()):abs():max(), 1e-5, "potri not an inverse")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = A:type(typename)
+ for _, triarg in ipairs({'U', 'L'}) do
+ local chol = torch.potrf(at, triarg)
+
+ local i1 = torch.potri(chol, triarg)
+ local i2 = torch.potri(chol:cuda(), triarg)
+ local M = at:cuda() * i2
+ tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potri answer")
+ tester:assertle((M - torch.eye(at:size(1)):cuda()):abs():max(), 1e-5, "potri not an inverse")
+ end
+ end
end
function test.potrf()
@@ -2434,9 +2461,14 @@ if cutorch.magma then
{-0.6738, 0.4734,-1.1123, 2.4071,-1.2756},
{-3.3883, 0.2807, 0.8161,-1.2756, 4.3415},
}
- local i1 = torch.potrf(A)
- local i2 = torch.potrf(A:cuda())
- tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potrf answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = A:type(typename)
+ for _, triarg in ipairs({'U', 'L'}) do
+ local i1 = torch.potrf(at, triarg)
+ local i2 = torch.potrf(at:cuda(), triarg)
+ tester:assertle((i2 - i1:cuda()):abs():max(), 1e-5, "wrong potrf answer")
+ end
+ end
end
function test.potrs()
@@ -2452,10 +2484,16 @@ if cutorch.magma then
{0.2334, 0.8594, 0.4103},
{0.7556, 0.1966, 0.9637},
{0.1420, 0.7185, 0.7476}})
- local chol = torch.potrf(A)
- local solve1 = torch.potrs(B, chol)
- local solve2 = torch.potrs(B:cuda(), chol:cuda())
- tester:assertle((solve2 - solve1:cuda()):abs():max(), 1e-4, "wrong potrs answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = A:type(typename)
+ local bt = B:type(typename)
+ for _, triarg in ipairs({'U', 'L'}) do
+ local chol = torch.potrf(at, triarg)
+ local solve1 = torch.potrs(bt, chol, triarg)
+ local solve2 = torch.potrs(bt:cuda(), chol:cuda(), triarg)
+ tester:assertle((solve2 - solve1:cuda()):abs():max(), 1e-4, "wrong potrs answer")
+ end
+ end
end
function test.qr()
@@ -2466,10 +2504,13 @@ if cutorch.magma then
{-0.2987, 1.9035, -1.4192, -0.9738, 1.4384},
{-0.5315, 0.4958, 0.4449, -0.4676, -0.4878},
}
- local q1,r1 = torch.qr(A)
- local q2,r2 = torch.qr(A:cuda())
- tester:assertle((q2 - q1:cuda()):abs():max(), 1e-5, "wrong qr answer")
- tester:assertle((r2 - r1:cuda()):abs():max(), 1e-5, "wrong qr answer")
+ for _, typename in ipairs({'torch.DoubleTensor', 'torch.FloatTensor'}) do
+ local at = A:type(typename)
+ local q1,r1 = torch.qr(at)
+ local q2,r2 = torch.qr(at:cuda())
+ tester:assertle((q2 - q1:cuda()):abs():max(), 1e-5, "wrong qr answer")
+ tester:assertle((r2 - r1:cuda()):abs():max(), 1e-5, "wrong qr answer")
+ end
end
end
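The updated tests compare CPU float/double results against the CUDA results, looping over both uplo settings where the new argument applies; the potrs case, for example, follows roughly this pattern (a standalone sketch, not the test file itself):

    local A = torch.randn(4, 4); A = A * A:t() + torch.eye(4)   -- SPD system matrix
    local B = torch.randn(4, 2)

    for _, uplo in ipairs({'U', 'L'}) do
      local chol   = torch.potrf(A, uplo)
      local solve1 = torch.potrs(B, chol, uplo)                  -- CPU reference
      local solve2 = torch.potrs(B:cuda(), chol:cuda(), uplo)    -- MAGMA-backed CUDA path
      assert((solve2 - solve1:cuda()):abs():max() < 1e-4)
    end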