I am using the cusolverDnCgesvdjBatched function to compute the singular value decomposition (SVD) of multiple matrices. When I run the program under cuda-memcheck to check for memory issues, it reports the following error inside cusolverDnCgesvdjBatched:
========= Invalid __global__ write of size 4
========= at 0x000062f8 in void batched_svd_parallel_jacobi_32x16<float2, float>(int, int, int, int, float2*, __int64, int, float*, float2*, __int64, int, float2*, __int64, int, float, int, int*, float, int, int*, int, float)
========= by thread (0,0,0) in block (4,0,0)
========= Address 0x701019010 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x904ce) [0x2ae05e]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x92e73) [0x2b0a03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x84cb7]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x86e03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ca]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuModuleUnload + 0x1d6) [0x1d5d36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::module::unload + 0x115) [0x9535]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextState::unloadAllModules + 0x196) [0x9b36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0xa188]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::globalState::~globalState + 0x3d) [0x24dd]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x74c6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x156) [0x142d6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x7b) [0x141fb]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x34) [0x141b4]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x142) [0x20522]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0xcb) [0x204ab]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x6e) [0x2044e]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (gpuErrchk + 0x4c) [0xf0dc]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (main + 0x3ef) [0xebaf]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (__scrt_common_main_seh + 0x10c) [0xf5c4]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.dll (BaseThreadInitThunk + 0x14) [0x17034]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x904ce) [0x2ae05e]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x92e73) [0x2b0a03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x84cb7]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x86e03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ca]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuModuleUnload + 0x1d6) [0x1d5d36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::module::unload + 0x115) [0x9535]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextState::unloadAllModules + 0x196) [0x9b36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0xa188]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::globalState::~globalState + 0x3d) [0x24dd]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x74c6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x156) [0x142d6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x7b) [0x141fb]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x34) [0x141b4]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x142) [0x20522]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0xcb) [0x204ab]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x6e) [0x2044e]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (gpuErrchk + 0x4c) [0xf0dc]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (main + 0x3ef) [0xebaf]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (__scrt_common_main_seh + 0x10c) [0xf5c4]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.dll (BaseThreadInitThunk + 0x14) [0x17034]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
=========
========= ERROR SUMMARY: 8 errors
I am attaching the whole code I am using.
kernel.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define FULLSVD
#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 10;
const int N = 5;
const int lda = M;
//const int numMatrices = 3;
const int numMatrices = 256;
TimingGPU timerGPU;
// --- Setting the host matrix
cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++)
{
for (unsigned int j = 0; j < N; j++)
{
h_A[k * M * N + j * M + i] = make_float2((1. / (k + 1)) * (i + j * j) * (i + j), (1. / (k + 1)) * (i + j * j) * (i + j));
//printf("[%d, %d] %f\n", i, j, h_A[j*M + i]);
//printf("%f %f", h_A[j*M + i].x, h_A[j * M + i].y);
}
//printf("\n");
}
// --- Setting the device matrix and moving the host matrix to the device
cuComplex *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
cuComplex *h_U = NULL;
cuComplex *h_V = NULL;
#ifdef FULLSVD
h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
cuComplex *d_U = NULL;
cuComplex *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif
cuComplex *d_work = NULL; /* devie workspace for gesvdj */
int devInfo_h = 0; /* host copy of error devInfo_h */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnCgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N