The CUDA_ERROR_INVALID_IMAGE
error should only be returned by cuModuleLoad
when the module file is invalid. If the file is missing or was built for an incompatible architecture, you should instead see a CUDA_ERROR_FILE_NOT_FOUND
or CUDA_ERROR_INVALID_SOURCE
error, respectively. You haven't given us enough details or code to say for certain what is happening, but in principle at least, the API code you have should work.
To show how this should work, consider the following working example on Linux with CUDA 5.5:
First your kernel:
#include <cmath>
using namespace std;
/**
 * Clamp a channel intensity to the range [0, 255].
 *
 * The argument is taken as float so that the clamp is applied BEFORE any
 * narrowing conversion. With an unsigned char parameter the caller's float
 * expression was converted to 8 bits on the way in, so out-of-range values
 * were already mangled and the clamp below could never take effect.
 * Passing an unsigned char still works: it converts to float exactly.
 *
 * @param value  Channel intensity to limit.
 * @return value limited to [0.0f, 255.0f].
 */
__device__ __inline__ float trim(float value)
{
    return fminf(255.0f, fmaxf(value, 0.0f));
}
// Constant-memory scalar, initialised to 1, that kernel() reads as the
// variable of its per-channel quadratic polynomials.
__constant__ char z = 1;
/**
 * Write one 4-byte pixel per thread into img.
 *
 * Each of the first three bytes is a quadratic in the constant z,
 * a[3c]*z*z + a[3c+1]*z + a[3c+2] for channel c = 0..2, scaled by 255 and
 * passed through trim(). The fourth byte is set to 255 (presumably an
 * opaque alpha channel -- confirm against the consumer of img).
 *
 * Launch layout: assumes a 1-D grid and 1-D block where blockIdx.x is the
 * pixel column and threadIdx.x is the pixel row, i.e. gridDim.x = width and
 * blockDim.x = height -- confirm against the host launch code.
 *
 * Preconditions: there is no bounds check, so img must hold at least
 * 4 * gridDim.x * blockDim.x bytes and a must hold at least 9 floats.
 *
 * @param img  Output pixel buffer, 4 bytes per thread.
 * @param a    Nine polynomial coefficients, three per colour channel.
 */
__global__ void kernel(unsigned char* img, const float* a)
{
    int ix = blockIdx.x;   // column index, in [0, gridDim.x)
    int iy = threadIdx.x;  // row index, in [0, blockDim.x)

    // The row stride is the number of columns (gridDim.x). Using blockDim.x
    // here made pixel indices collide or run past the buffer whenever the
    // launch was not square.
    int tid = iy * gridDim.x + ix;

    // Normalised coordinates in [0, 1): each index is divided by its own
    // extent. Currently unused; kept for the elided code below.
    float x = (float)ix / gridDim.x;
    float y = (float)iy / blockDim.x;
    //placeholder

    img[tid*4+0] = trim((a[0]*z*z+a[1]*z+a[2]) * 255.0f);
    img[tid*4+1] = trim((a[3]*z*z+a[4]*z+a[5]) * 255.0f);
    img[tid*4+2] = trim((a[6]*z*z+a[7]*z+a[8]) * 255.0f);
    img[tid*4+3] = 255;
}
Then a simple program to load the cubin into a context at runtime:
#include <cuda.h>
#include <cstdlib>
#include <iostream>
#include <string>
// Check the CUresult of a driver API call, tagging the report with the call
// site. Wrapped in do/while(0) so the macro behaves as a single statement,
// e.g. inside an unbraced if/else.
#define Errchk(ans) do { DrvAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Report the outcome of a CUDA driver API call on stdout.
 *
 * On failure prints "Error: <numeric code> <file>@<line>" and terminates the
 * process with the CUresult value as its exit status; on success prints
 * "Success: <file>@<line>" and returns.
 *
 * @param code  Result returned by the driver API call.
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 */
inline void DrvAssert( CUresult code, const char *file, int line)
{
    if (code != CUDA_SUCCESS) {
        std::cout << "Error: " << code << " " << file << "@" << line << std::endl;
        std::exit(code);
    }
    std::cout << "Success: " << file << "@" << line << std::endl;
}
int main(void)
{
Errchk( cuInit(0) );
CUdevice device;
Errchk( cuDeviceGet(&device, 0) );
CUcontext ctx;
Errchk( cuCtxCreate(&ctx, 0, device) );
CUmodule module;
std::string path = "qkernel.cubin";
Errchk( cuModuleLoad(&module, path.c_str()) );
cuCtxDetach(ctx);
return 0;
}
Build the cubin for the architecture of the device present in the host (a GTX670 in this case):
$ nvcc -arch=sm_30 -Xptxas="-v" --cubin qkernel.cu
ptxas info : 11 bytes gmem, 1 bytes cmem[3]
ptxas info : Compiling entry function '_Z6kernelPhPKf' for 'sm_30'
ptxas info : Function properties for _Z6kernelPhPKf
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 10 registers, 336 bytes cmem[0]
and the host program:
$ nvcc -o qexe qmain.cc -lcuda
then run:
$ ./qexe
Success: qmain.cc@18
Success: qmain.cc@20
Success: qmain.cc@22
Success: qmain.cc@26
The module code loads. If I delete the cubin and run again, I see this (error 301 is CUDA_ERROR_FILE_NOT_FOUND):
$ rm qkernel.cubin
$ ./qexe
Success: qmain.cc@18
Success: qmain.cc@20
Success: qmain.cc@22
Error: 301 qmain.cc@26
If I compile for an incompatible architecture, I see this (error 300 is CUDA_ERROR_INVALID_SOURCE):
$ nvcc -arch=sm_10 -Xptxas="-v" --cubin qkernel.cu
ptxas info : 0 bytes gmem, 1 bytes cmem[0]
ptxas info : Compiling entry function '_Z6kernelPhPKf' for 'sm_10'
ptxas info : Used 5 registers, 32 bytes smem, 4 bytes cmem[1]
$ ./qexe
Success: qmain.cc@18
Success: qmain.cc@20
Success: qmain.cc@22
Error: 300 qmain.cc@26
If I compile to an object file, not a cubin, but still name the output qkernel.cubin, I see this (error 200 is CUDA_ERROR_INVALID_IMAGE):
$ nvcc -arch=sm_30 -Xptxas="-v" -c -o qkernel.cubin qkernel.cu
ptxas info : 11 bytes gmem, 1 bytes cmem[3]
ptxas info : Compiling entry function '_Z6kernelPhPKf' for 'sm_30'
ptxas info : Function properties for _Z6kernelPhPKf
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 10 registers, 336 bytes cmem[0]
$ ./qexe
Success: qmain.cc@18
Success: qmain.cc@20
Success: qmain.cc@22
Error: 200 qmain.cc@26
This is the only way I can get the code to emit a CUDA_ERROR_INVALID_IMAGE
error. All I can suggest is to try my code and recipe and see if you can get it to work.