NOTE: The ability to call cublas functions from device code has been removed from CUDA as of CUDA 10.0. The description in this answer only pertains to CUDA 9.x usage and prior. See here.
The warnings about sm_50 are benign. That's my way of saying "they can be safely ignored in this case".
Regarding the code you currently have posted, the problem relates to what is described in the dynamic parallelism documentation around the use of thread-local memory here.
In a nutshell, local memory of the parent thread is "out of scope" in a child kernel launch. Although it's not entirely obvious, the cublas calls from device code launch (or at least attempt to launch) child kernels. This means that declarations like this:
int p[3], info[1],
will be problematic if those pointers (e.g. `p`, `info`) are passed to a child kernel. The numerical values of the pointers themselves will not be corrupted, but they will not point to anything "meaningful" in the memory space of the child kernel.
There are multiple ways to solve this, but one possible solution is to replace any stack/local allocations of this type with allocations from the "device heap", which can be made via in-kernel `malloc`.
Here is a fully worked code/example that seems to work correctly for me. The output seems to be correct for the inversion of the given sample matrix:
$ cat t605.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
/*
 * PERR(call): evaluate a CUDA runtime API call exactly once; if it does not
 * return cudaSuccess, print the file, line, error string and the text of the
 * failing call to stderr and terminate the process with exit(1).
 *
 * - The status returned by `call` itself is reported, so the message always
 *   describes this call rather than whatever the runtime's "last error"
 *   slot happens to hold.
 * - The call text is passed as a "%s" argument (not pasted into the format
 *   string) so a '%' inside the expression cannot be misread as a conversion.
 * - Wrapped in do { } while (0) so `PERR(x);` is a single statement and is
 *   safe inside an unbraced if/else.
 */
#define PERR(call)                                                        \
  do {                                                                    \
    cudaError_t perr_status_ = (call);                                    \
    if (perr_status_ != cudaSuccess) {                                    \
      fprintf(stderr, "%s:%d Error [%s] on %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(perr_status_), #call);                   \
      exit(1);                                                            \
    }                                                                     \
  } while (0)
/*
 * ERRCHECK: query the CUDA runtime's last recorded error; if one is pending,
 * print the file, line and error string to stderr and terminate the process
 * with exit(1). Intended to be placed right after a kernel launch, which has
 * no return value of its own.
 *
 * cudaGetLastError() is called once: it both reads and clears the error, so
 * the value tested is the same value that gets reported.
 * Wrapped in do { } while (0) so `ERRCHECK;` is a single statement and is
 * safe inside an unbraced if/else.
 */
#define ERRCHECK                                                          \
  do {                                                                    \
    cudaError_t errcheck_status_ = cudaGetLastError();                    \
    if (errcheck_status_ != cudaSuccess) {                                \
      fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,           \
              cudaGetErrorString(errcheck_status_));                      \
      exit(1);                                                            \
    }                                                                     \
  } while (0)
/*
 * inv_kernel: invert one n x n single-precision matrix from device code using
 * the device-callable cuBLAS API (LU factorization via cublasSgetrfBatched,
 * then inversion via cublasSgetriBatched, batch size 1).
 *
 *   a_i  device pointer to the n*n input matrix (leading dimension n);
 *        it is handed to getrf as the matrix to factor.
 *   c_o  device pointer to n*n floats that receive the inverse.
 *   n    matrix dimension.
 *
 * Launch layout: the kernel does no thread indexing, so every launched thread
 * would repeat the whole computation on the same buffers; launch it <<<1, 1>>>.
 *
 * Requirements: dynamic parallelism and the device cuBLAS library (built with
 * -rdc=true -lcublas_device -lcudadevrt, CUDA 9.x or earlier).
 *
 * See
 *   http://docs.nvidia.com/cuda/pdf/CUDA_Dynamic_Parallelism_Programming_Guide.pdf
 *   http://stackoverflow.com/questions/27094612/cublas-matrix-inversion-from-device
 */
__global__ void
inv_kernel(float *a_i, float *c_o, int n)
{
  const int batch = 1;
  cublasHandle_t hdl;
  cublasStatus_t status;
  cudaError_t sync;

  /*
   * Everything whose address is passed to cuBLAS must live on the device
   * heap: the cuBLAS calls launch child kernels, and the parent thread's
   * local memory is not visible to a child kernel.
   * The pivot array needs n entries per matrix (n * batch in total).
   */
  int *p = (int *)malloc((size_t)n * batch * sizeof(int));
  int *info = (int *)malloc(sizeof(int));
  float **a = (float **)malloc(sizeof(float *));
  float **c = (float **)malloc(sizeof(float *));

  if (p == NULL || info == NULL || a == NULL || c == NULL) {
    /* Device heap exhausted; in-kernel free() ignores NULL pointers. */
    printf("inv_kernel: device heap allocation failed\n");
    free(p);
    free(info);
    free(a);
    free(c);
    return;
  }

  info[0] = 0;
  *a = a_i;
  *c = c_o;

  status = cublasCreate_v2(&hdl);
  printf("handle %d n = %d\n", status, n);

  if (status == CUBLAS_STATUS_SUCCESS) {
    status = cublasSgetrfBatched(hdl, n, a, n, p, info, batch);
    /*
     * Wait for the child kernels before reading info[0]; __syncthreads()
     * only synchronizes threads of this block, not child grids.
     */
    sync = cudaDeviceSynchronize();
    printf("rf %d info %d\n", status, info[0]);

    /* Only invert when the factorization actually succeeded. */
    if (status == CUBLAS_STATUS_SUCCESS && sync == cudaSuccess &&
        info[0] == 0) {
      status = cublasSgetriBatched(hdl, n, (const float **)a, n, p,
                                   c, n, info, batch);
      sync = cudaDeviceSynchronize();
      printf("ri %d info %d\n", status, info[0]);
    }

    cublasDestroy_v2(hdl);
  }

  /*
   * All child work has been synchronized above, so nothing still references
   * these buffers and they can be returned to the device heap.
   */
  free(c);
  free(a);
  free(info);
  free(p);

  printf("done\n");
}
/*
 * run_inv: host-side driver for inv_kernel.
 *
 *   in   host pointer to n*n floats holding the matrix to invert.
 *   out  host pointer to n*n floats that receive the result.
 *   n    matrix dimension.
 *
 * Copies `in` to the device, launches inv_kernel with a single thread, waits
 * for it to finish, copies the result back into `out`, and releases the
 * device buffers. Any CUDA error terminates the process through PERR/ERRCHECK.
 */
static void
run_inv(float *in, float *out, int n)
{
  /* Widen before multiplying so n*n is not evaluated (and overflowed) in int. */
  const size_t bytes = (size_t)n * (size_t)n * sizeof(float);
  float *a_d = NULL;
  float *c_d = NULL;

  PERR(cudaMalloc(&a_d, bytes));
  PERR(cudaMalloc(&c_d, bytes));
  PERR(cudaMemcpy(a_d, in, bytes, cudaMemcpyHostToDevice));

  inv_kernel<<<1, 1>>>(a_d, c_d, n);
  ERRCHECK;                      /* launch-time errors (bad configuration, ...) */
  PERR(cudaDeviceSynchronize()); /* errors raised while the kernel was running  */

  PERR(cudaMemcpy(out, c_d, bytes, cudaMemcpyDeviceToHost));
  PERR(cudaFree(a_d));
  PERR(cudaFree(c_d));
}
/*
 * main: invert a fixed 3x3 sample matrix with run_inv and print the result,
 * one matrix row per output line, each value formatted as "%f, ".
 *
 * Note on layout: cuBLAS is column-major, so the row-major array below is
 * seen by cuBLAS as the transpose of the matrix as written. Because the
 * inverse of a transpose is the transpose of the inverse, reading the result
 * back in row-major order yields the inverse of the matrix as written.
 */
int
main(int argc, char **argv)
{
  enum { N = 3 };
  float inv[N * N];
  float a[N * N] = {
    1, 2, 3,
    0, 4, 5,
    1, 0, 6 };

  (void)argc;
  (void)argv;

  run_inv(a, inv, N);

  for (int row = 0; row < N; row++) {
    for (int col = 0; col < N; col++)
      printf("%f, ", inv[(N * row) + col]);
    printf("\n");
  }
  return 0;
}
$ nvcc -arch=sm_35 -rdc=true -o t605 t605.cu -lcublas_device -lcudadevrt
nvlink warning : SM Arch ('sm_35') not found in '/shared/apps/cuda/CUDA-v6.5.14/bin/..//lib64/libcublas_device.a:maxwell_sgemm.asm.o'
nvlink warning : SM Arch ('sm_35') not found in '/shared/apps/cuda/CUDA-v6.5.14/bin/..//lib64/libcublas_device.a:maxwell_sm50_sgemm.o'
$ ./t605
handle 0 n = 3
rf 0 info 0
ri 0 info 0
done
1.090909, -0.545455, -0.090909,
0.227273, 0.136364, -0.227273,
-0.181818, 0.090909, 0.181818,
$
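For CUDA 10.0 and newer users, I would suggest using the ordinary batched cublas functions from host code. In particular for matrix inverse, one option is to use matinvBatched (note that it only supports matrices up to 32x32; for larger matrices, call getrfBatched followed by getriBatched from host code).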