This answer has nothing to do with CUDA itself, but is a general C implementation.
I can't quite follow what you are doing (especially with `carry`), but you could try this snippet based on my own big num functions. I defined `dtype` to make it easier to test with smaller fields. Note that I don't specifically use a `carry` variable, but carry forward the partial product.
// little-endian
#include <stdio.h>
#include <stdint.h>
#include <limits.h>
// Limb type used for every element of the big numbers below. uint8_t keeps
// the values small enough to verify the output by hand.
#define dtype uint8_t // for testing
// Alternative limb type for the real version; swap it with the line above.
//#define dtype uint32_t // for proper ver
// Number of bits in one limb. Built from sizeof, so this is a size_t expression.
#define SHIFTS (sizeof(dtype)*CHAR_BIT)
// Number of hex digits needed to print one limb (4 bits per digit).
// Also a size_t expression.
#define NIBBLES (SHIFTS/4)
// Number of limbs in the operand; the product type holds one limb more.
#define ARRLEN 8
/*
 * Multi-limb unsigned operand: ARRLEN limbs of type dtype, stored
 * little-endian (index 0 is the least significant limb).
 * With ARRLEN == 8 and dtype == uint32_t this is 256 bits wide.
 */
typedef struct UN_256fe {
dtype uint[ARRLEN]; /* limbs, least significant first */
} UN_256fe;
/*
 * Result of multiplying a UN_256fe by a single limb: one limb wider than the
 * operand so the final carry always fits. Stored little-endian.
 * With ARRLEN == 8 and dtype == uint32_t this is 288 bits wide.
 */
typedef struct UN_288bite {
dtype uint[ARRLEN+1]; /* limbs, least significant first; [ARRLEN] holds the top carry */
} UN_288bite;
/*
 * Multiply a multi-limb little-endian number by a single limb.
 *
 * product    - receives the ARRLEN+1 limb result; every limb is overwritten.
 * operand    - the ARRLEN limb multiplicand; only read.
 * multiplier - the single-limb factor.
 *
 * A 64-bit accumulator holds "limb product + carry from the previous limb".
 * Its low SHIFTS bits become the output limb and the remaining high bits are
 * carried into the next position, so no separate carry variable is needed.
 * dtype must be narrower than 64 bits for the shift to be meaningful.
 */
void multiply(UN_288bite *product, UN_256fe *operand, dtype multiplier)
{
    const uint64_t factor = multiplier;   /* widen once so each product is 64-bit */
    uint64_t acc = 0;
    int limb = 0;

    while (limb < ARRLEN) {
        acc += factor * operand->uint[limb];
        product->uint[limb] = (dtype)acc; /* keep the low limb */
        acc >>= SHIFTS;                   /* what is left is the carry */
        limb++;
    }

    /* The carry out of the top limb is the extra, most significant limb. */
    product->uint[ARRLEN] = (dtype)acc;
}
/*
 * Demo driver: multiplies a fixed little-endian operand by a single-limb
 * multiplier and prints
 *
 *     <operand>
 *     * <multiplier> =
 *     <product>
 *
 * in upper-case hex, most significant limb first, each limb zero-padded to
 * NIBBLES digits. Always returns 0.
 */
int main(void)
{
    int i;
    /* printf's '*' field width must be an int, but NIBBLES is a size_t
       expression (it is derived from sizeof), so convert it once here. */
    const int width = (int)NIBBLES;
    dtype multiplier = 0xAA;
    UN_256fe operand = { { 1, 2, 3, 4, 5, 6, 7, 8 } };
    UN_288bite product;

    multiply(&product, &operand, multiplier);

    /* Limbs are stored least significant first, so print from the top down.
       %X expects an unsigned int, hence the explicit casts. */
    for (i = ARRLEN - 1; i >= 0; i--) {
        printf("%0*X", width, (unsigned)operand.uint[i]);
    }
    printf("\n* %0*X =\n", width, (unsigned)multiplier);

    /* The product has one more limb than the operand. */
    for (i = ARRLEN; i >= 0; i--) {
        printf("%0*X", width, (unsigned)product.uint[i]);
    }
    printf("\n");
    return 0;
}
Program output for uint8_t
0807060504030201
* AA =
0554A9FF54A9FF54AA
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…