Which classes of algorithms could be using `punpcklbw`?
In particular, what is `punpcklbw xmm0, xmm0` doing?
And also, what is `maskedPow2_Value` useful for?
maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val; // Val is int
maskedPow2_Value = 0x101010101010101i64 * maskedValue;
(or, in the disassembly, `mov r9, 101010101010101h; imul rdx, r9`, twice)
A complete example (the function is named CompressPacket, but that name may be misleading), as the result of decompilation by IDA:
// Hex-Rays pseudo-C for a fill routine: it appears to store the low byte of
// Val over the Size bytes starting at Dst, and it returns Dst.
// NOTE(review): the disassembly lists `j_memset` as a code cross-reference,
// which suggests this is a memset implementation despite the CompressPacket
// name -- confirm.
// NOTE(review): on the default (Size >= 16) path the disassembly keeps Dst in
// rcx, so `bufferOut` there presumably denotes the running write pointer that
// starts at Dst; the `Dst + Size` assignment below looks like it belongs to
// the small-size path only (decompiler variable merging) -- confirm.
void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
{
__int64 maskedPow2_Value; // rdx
unsigned int v5; // ecx
__int64 *bufferOut; // rcx
size_t size_; // r9
size_t i; // r9
size_t size__; // r9
size_t counter; // r8
size_t j; // r9
void *result; // rax
__m128i v13; // xmm0
__int64 lsb4; // rax
size_t counter1; // r9
size_t k; // r9
size_t lsb4_; // r8
__int64 maskedValue; // rdx
// Keep only the low byte of Val, zero-extended (movzx edx, dl in the disassembly).
*(_QWORD *)&Val = (unsigned __int8)Val;
// Multiplying a byte by 0x0101010101010101 replicates it into all eight
// bytes of the 64-bit result.
maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val;
// One past the last byte to fill; the small-size cases store relative to it.
bufferOut = (__int64 *)((char *)Dst + Size);
result = Dst;
// Sizes 0..15 are handled by a fixed sequence of qword/dword/word/byte stores
// addressed backwards from the end of the region. A label LBL_x_y is shared
// by the sizes x and y (y = x + 8): size y first stores one qword at Dst and
// then falls into the tail that size x uses.
switch ( Size )
{
case 0ui64:
return result;
case 1ui64:
goto LBL_1_F;
case 2ui64:
goto LBL_2_E;
case 3ui64:
goto LBL_3_F;
case 4ui64:
goto LBL_4_C;
case 5ui64:
goto LBL_5_D;
case 6ui64:
goto LBL_6_E;
case 7ui64:
goto LBL_7_F;
case 8ui64:
*(bufferOut - 1) = maskedValue;
return result;
case 9ui64:
*(__int64 *)((char *)bufferOut - 9) = maskedValue;
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
case 0xAui64:
*(__int64 *)((char *)bufferOut - 10) = maskedValue;
*((_WORD *)bufferOut - 1) = maskedValue;
return result;
case 0xBui64:
*(__int64 *)((char *)bufferOut - 11) = maskedValue;
goto LBL_3_F;
case 0xCui64:
*(__int64 *)((char *)bufferOut - 12) = maskedValue;
LBL_4_C:
*((_DWORD *)bufferOut - 1) = maskedValue;
return result;
case 0xDui64:
*(__int64 *)((char *)bufferOut - 13) = maskedValue;
LBL_5_D:
*(_DWORD *)((char *)bufferOut - 5) = maskedValue;
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
case 0xEui64:
*(__int64 *)((char *)bufferOut - 14) = maskedValue;
LBL_6_E:
*(_DWORD *)((char *)bufferOut - 6) = maskedValue;
LBL_2_E:
*((_WORD *)bufferOut - 1) = maskedValue;
return result;
case 0xFui64:
*(__int64 *)((char *)bufferOut - 15) = maskedValue;
LBL_7_F:
*(_DWORD *)((char *)bufferOut - 7) = maskedValue;
LBL_3_F:
*(_WORD *)((char *)bufferOut - 3) = maskedValue;
LBL_1_F:
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
default:
// Size >= 16. Bit 1 of the global flag word selects the `rep stosb` path of
// the disassembly, which the decompiler renders as a memset call.
// NOTE(review): the disassembly loads rdi from rcx, which still holds Dst
// here, so the `bufferOut` argument is presumably a decompiler artifact.
// NOTE(review): the flag word is presumably a CPU-feature mask -- confirm.
if ( _bittest(dword_7FFFF4B237D8, 1u) )
{
memset(bufferOut, maskedValue, Size);
return Dst;
}
// NOTE(review): in the disassembly this path (mset05) performs a single imul
// on the zero-extended byte, so rdx presumably just holds the byte replicated
// eight times, not a second multiplication of an already replicated value;
// the multiply at the top likely belongs to the small-size path only -- confirm.
maskedPow2_Value = 0x101010101010101i64 * maskedValue;
// Bit 2 clear: fill with 64-bit general-purpose stores.
// Bit 2 set: fill with 128-bit XMM stores (msetxmm10 in the disassembly).
if ( !_bittest(dword_7FFFF4B237D8, 2u) )
{
if ( Size >= 0x40 )
{
// Distance to the next 8-byte boundary (neg rcx; and ecx, 7 in the
// disassembly, where rcx still holds Dst).
v5 = -(int)bufferOut & 7;
if ( v5 )
{
// Unaligned head: one qword store at Dst covers the v5 leading bytes;
// the aligned stores that follow overlap the rest of it.
Size -= v5;
*(_QWORD *)Dst = maskedPow2_Value;
}
bufferOut = (__int64 *)((char *)Dst + v5);
size_ = Size;
Size &= 0x3Fu;
// Main loop: each iteration stores eight qwords (64 bytes). The eighth
// store is the for-statement's increment expression.
for ( i = size_ >> 6; i; *(bufferOut - 1) = maskedPow2_Value )
{
*bufferOut = maskedPow2_Value;
bufferOut[1] = maskedPow2_Value;
bufferOut[2] = maskedPow2_Value;
bufferOut += 8;
*(bufferOut - 5) = maskedPow2_Value;
*(bufferOut - 4) = maskedPow2_Value;
--i;
*(bufferOut - 3) = maskedPow2_Value;
*(bufferOut - 2) = maskedPow2_Value;
}
}
// Remainder: whole qwords first, then the last (Size & 7) bytes one by one.
size__ = Size;
counter = Size & 7;
for ( j = size__ >> 3; j; --j )
*bufferOut++ = maskedPow2_Value;
for ( ; counter; --counter )
{
*(_BYTE *)bufferOut = maskedPow2_Value;
bufferOut = (__int64 *)((char *)bufferOut + 1);
}
return Dst;
}
// punpcklbw xmm0, xmm0: interleaves the low eight bytes of the register with
// themselves, so every low byte appears twice in the 16-byte result. If the
// low qword already holds the fill byte eight times, this gives 16 copies.
v13 = _mm_unpacklo_epi8((__m128i)(unsigned __int64)maskedPow2_Value, (__m128i)(unsigned __int64)maskedPow2_Value);
if ( ((unsigned __int8)bufferOut & 0xF) != 0 )
{
// Unaligned head: store 16 bytes at the current pointer, advance to the
// next 16-byte boundary, and reduce Size by the bytes skipped (16 - lsb4).
*(__m128i *)bufferOut = v13;
lsb4 = (unsigned __int8)bufferOut & 0xF;
bufferOut = (__int64 *)((char *)bufferOut - lsb4 + 16);
Size = lsb4 + Size - 16;
}
// Main loop: each iteration stores eight 16-byte vectors (128 bytes).
// bufferOut is an __int64 pointer, so += 16 advances by 128 bytes.
counter1 = Size >> 7;
if ( Size >> 7 )
{
do
{
*(__m128i *)bufferOut = v13;
*((__m128i *)bufferOut + 1) = v13;
bufferOut += 16;
*((__m128i *)bufferOut - 6) = v13;
*((__m128i *)bufferOut - 5) = v13;
--counter1;
*((__m128i *)bufferOut - 4) = v13;
*((__m128i *)bufferOut - 3) = v13;
*((__m128i *)bufferOut - 2) = v13;
*((__m128i *)bufferOut - 1) = v13;
}
while ( counter1 );
Size &= 0x7Fu;
}
// Remaining whole 16-byte chunks (+= 2 on an __int64 pointer is 16 bytes).
for ( k = Size >> 4; k; --k )
{
*(__m128i *)bufferOut = v13;
bufferOut += 2;
}
// Tail of 1..15 bytes: one 16-byte store positioned to end exactly at the
// last byte to fill, re-writing some bytes that were already stored.
lsb4_ = Size & 0xF;
if ( lsb4_ )
*(__m128i *)((char *)bufferOut + lsb4_ - 16) = v13;
return Dst;
}
}
And the disassembly, also produced by IDA:
.text:00007FFFF4AF6440 ; void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
.text:00007FFFF4AF6440 CompressPacket proc near ; CODE XREF: j_memset↑j
.text:00007FFFF4AF6440 ; Concurrency::details::ResourceManager::CreateAllocatedNodeData(void)+49↑p ...
.text:00007FFFF4AF6440 mov r11, rcx
.text:00007FFFF4AF6443 movzx edx, dl ; Move with Zero-Extend
.text:00007FFFF4AF6446 cmp r8, 10h ; switch 16 cases
.text:00007FFFF4AF644A jb SetBytes15 ; Jump if Below (CF=1)
.text:00007FFFF4AF6450
.text:00007FFFF4AF6450 def_7FFFF4AF65D2: ; jumptable 00007FFFF4AF65D2 default case
.text:00007FFFF4AF6450 bt cs:dword_7FFFF4B237D8, 1
.text:00007FFFF4AF6458 jnb short mset05 ; Jump if Not Below (CF=0)
.text:00007FFFF4AF645A push rdi
.text:00007FFFF4AF645B mov rdi, rcx
.text:00007FFFF4AF645E mov eax, edx
.text:00007FFFF4AF6460 mov rcx, r8
.text:00007FFFF4AF6463 rep stosb ; Store String
.text:00007FFFF4AF6465 pop rdi
.text:00007FFFF4AF6466 jmp short mset60 ; Jump
.text:00007FFFF4AF6468 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6468
.text:00007FFFF4AF6468 mset05: ; CODE XREF: CompressPacket+18↑j
.text:00007FFFF4AF6468 mov r9, 101010101010101h
.text:00007FFFF4AF6472 imul rdx, r9 ; Signed Multiply
.text:00007FFFF4AF6476 bt cs:dword_7FFFF4B237D8, 2 ; Bit Test
.text:00007FFFF4AF647E jb msetxmm10 ; Jump if Below (CF=1)
.text:00007FFFF4AF6484 cmp r8, 40h ; '@' ; Compare Two Operands
.text:00007FFFF4AF6488 jb short mset20 ; Jump if Below (CF=1)
.text:00007FFFF4AF648A neg rcx ; Two's Complement Negation
.text:00007FFFF4AF648D and ecx, 7 ; Logical AND
.text:00007FFFF4AF6490 jz short mset10 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6492 sub r8, rcx ; Integer Subtraction
.text:00007FFFF4AF6495 mov [r11], rdx
.text:00007FFFF4AF6498
.text:00007FFFF4AF6498 mset10: ; CODE XREF: CompressPacket+50↑j
.text:00007FFFF4AF6498 add rcx, r11 ; Add
.text:00007FFFF4AF649B mov r9, r8
.text:00007FFFF4AF649E and r8, 3Fh ; Logical AND
.text:00007FFFF4AF64A2 shr r9, 6 ; Shift Logical Right
.text:00007FFFF4AF64A6 jnz short mset80 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64A8
.text:00007FFFF4AF64A8 mset20: ; CODE XREF: CompressPacket+48↑j
.text:00007FFFF4AF64A8 ; CompressPacket+CF↓j
.text:00007FFFF4AF64A8 mov r9, r8
.text:00007FFFF4AF64AB and r8, 7 ; Logical AND
.text:00007FFFF4AF64AF shr r9, 3 ; Shift Logical Right
.text:00007FFFF4AF64B3 jz short mset40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64B5 db 66h, 66h
.text:00007FFFF4AF64B5 xchg ax, ax ; Exchange Register/Memory with Register
.text:00007FFFF4AF64B9 nop ; No Operation
.text:00007FFFF4AF64BA
.text:00007FFFF4AF64BA mset30: ; CODE XREF: CompressPacket+84↓j
.text:00007FFFF4AF64BA mov [rcx], rdx
.text:00007FFFF4AF64BD add rcx, 8 ; Add
.text:00007FFFF4AF64C1 dec r9 ; Decrement by 1
.text:00007FFFF4AF64C4 jnz short mset30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64C6
.text:00007FFFF4AF64C6 mset40: ; CODE XREF: CompressPacket+73↑j
.text:00007FFFF4AF64C6 test r8, r8 ; Logical Compare
.text:00007FFFF4AF64C9 jz short mset60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64CB
.text:00007FFFF4AF64CB mset50: ; CODE XREF: CompressPacket+93↓j
.text:00007FFFF4AF64CB mov [rcx], dl
.text:00007FFFF4AF64CD inc rcx ; Increment by 1
.text:00007FFFF4AF64D0 dec r8 ; Decrement by 1
.text:00007FFFF4AF64D3 jnz short mset50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64D5
.text:00007FFFF4AF64D5 mset60: ; CODE XREF: CompressPacket+26↑j
.text:00007FFFF4AF64D5 ; CompressPacket+89↑j
.text:00007FFFF4AF64D5 mov rax, r11
.text:00007FFFF4AF64D8 retn ; Return Near from Procedure
.text:00007FFFF4AF64D8 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64D9 db 0Fh, 1Fh, 80h, 4 dup(0)
.text:00007FFFF4AF64E0 db 3 dup(66h), 90h
.text:00007FFFF4AF64E4 db 2 dup(66h), 90h
.text:00007FFFF4AF64E7 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64E7
.text:00007FFFF4AF64E7 mset80: ; CODE XREF: CompressPacket+66↑j
.text:00007FFFF4AF64E7 ; CompressPacket+CD↓j
.text:00007FFFF4AF64E7 mov [rcx], rdx
.text:00007FFFF4AF64EA mov [rcx+8], rdx
.text:00007FFFF4AF64EE mov [rcx+10h], rdx
.text:00007FFFF4AF64F2 add rcx, 40h ; '@' ; Add
.text:00007FFFF4AF64F6 mov [rcx-28h], rdx
.text:00007FFFF4AF64FA mov [rcx-20h], rdx
.text:00007FFFF4AF64FE