This is known as Morton number, which is a specific case of parallel expand which is in turn the reverse of compress right in the following questions
One general solution might be
uint64_t bit_expand(uint64_t x) // 00000000ABCDEFGH
{
x = ((x & 0xFFFF0000) << 32) | ((x & 0x0000FFFF) << 16); // ABCD0000EFGH0000
x = (x & 0xFF000000FF000000) | ((x & 0x00FF000000FF0000) >> 8); // AB00CD00EF00GH00
x = (x & 0xF000F000F000F000) | ((x & 0x0F000F000F000F00) >> 4); // A0B0C0D0E0F0G0H0
x = (x & 0xC0C0C0C0C0C0C0C0) | ((x & 0x3030303030303030) >> 2);
x = (x & 0xA0A0A0A0A0A0A0A0) | ((x & 0x5050505050505050) >> 1);
return x;
}
However the constant generation might be inefficient on RISC architectures because the 64-bit immediate can't be stored in a single instruction like on x86. Even on x86 the output assembly is quite long. Here is another possible implementation as described on Bit Twiddling Hacks
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};
unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
// x and y must initially be less than 65536.
x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];
y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];
z = x | (y << 1);
A lookup table can also be used
#define EXPAND4(a) ((((a) & 0x8) << 4) | (((a) & 0x4) << 2)
| (((a) & 0x2) << 1) | (((a) & 0x1)))
const uint8_t LUT[16] = {
EXPAND4( 0), EXPAND4( 1), EXPAND4( 2), EXPAND4( 3),
EXPAND4( 4), EXPAND4( 5), EXPAND4( 6), EXPAND4( 7),
EXPAND4( 8), EXPAND4( 9), EXPAND4(10), EXPAND4(11),
EXPAND4(12), EXPAND4(13), EXPAND4(14), EXPAND4(15)
};
output = ((uint64_t)LUT[(x >> 28) & 0xF] << 56) | ((uint64_t)LUT[(x >> 24) & 0xF] << 48)
| ((uint64_t)LUT[(x >> 20) & 0xF] << 40) | ((uint64_t)LUT[(x >> 16) & 0xF] << 32)
| ((uint64_t)LUT[(x >> 12) & 0xF] << 24) | ((uint64_t)LUT[(x >> 8) & 0xF] << 16)
| ((uint64_t)LUT[(x >> 4) & 0xF] << 8) | ((uint64_t)LUT[(x >> 0) & 0xF] << 0);
The size of the lookup table may be increased if necessary
On x86 with BMI2 there's hardware support with PDEP instruction which can be accessed via the following intrinsic
output = _pdep_u64(x, 0xaaaaaaaaaaaaaaaaULL);
Another solution on architectures without bit deposit/expand instruction but with fast multipliers
uint64_t spaceOut8bits(uint8_t b)
{
uint64_t MAGIC = 0x8040201008040201;
uint64_t MASK = 0x8080808080808080;
uint64_t expand8bits = htobe64(((MAGIC*b) & MASK) >> 7);
uint64_t spacedOutBits = expand8bits*0x41041 & 0xAA000000AA000000;
return (spacedOutBits | (spacedOutBits << 24)) & 0xFFFF000000000000;
}
uint64_t spaceOut64bits(uint64_t x)
{
return (spaceOut8bits(x >> 24) >> 0)
| (spaceOut8bits(x >> 16) >> 16)
| (spaceOut8bits(x >> 8) >> 32)
| (spaceOut8bits(x >> 0) >> 48);
}
The way it works is like this
- The first step expands the input bits from
abcdefgh
to a0000000 b0000000 c0000000 d0000000 e0000000 f0000000 g0000000 h0000000 and store in expand8bits
- Then we move those spaced out bits close together by multiplying and masking in the next step. After that
spacedOutBits
will contain a0b0c0d0 00000000 00000000 00000000 e0f0g0h0 00000000 00000000 00000000. We'll merge the two bytes in the result together
The magic number for bringing the bits closer is calculated like this
a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
× 1000001000001000001
────────────────────────────────────────────────────────────────
a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
00b0000000c0000000d0000000e0000000f0000000g0000000h0000000
+ 0000c0000000d0000000e0000000f0000000g0000000h0000000
000000d0000000e0000000f0000000g0000000h0000000
────────────────────────────────────────────────────────────────
a0b0c0d0b0c0d0e0c0d0e0f0d0e0f0g0e0f0g0h0f0g0h000g0h00000h0000000
& 1010101000000000000000000000000010101010000000000000000000000000
────────────────────────────────────────────────────────────────
a0b0c0d0000000000000000000000000e0f0g0h0000000000000000000000000
The output assembly can be seen here. You can change the compiler to see how it's done on various architectures
There's also an alternative way on the Bit Twiddling Hacks page
z = ((x * 0x0101010101010101ULL & 0x8040201008040201ULL) *
0x0102040810204081ULL >> 49) & 0x5555 |
((y * 0x0101010101010101ULL & 0x8040201008040201ULL) *
0x0102040810204081ULL >> 48) & 0xAAAA;
More solutions can be found in <a