assembly - Which are the use case of punpcklbw (interleave in MMX/SSE/AVX)?

Question

Welcome To Ask or Share your Answers For Others

assembly - Which are the use case of punpcklbw (interleave in MMX/SSE/AVX)?

posted Oct 7, 2021 in Technique[技术] by 深蓝 (71.8m points)

assembly - Which are the use case of punpcklbw (interleave in MMX/SSE/AVX)?

Which classes of algorithms could be using punpcklbw ?
In particular, what is punpcklbw xmm0, xmm0 doing?
Also, what is maskedPow2_Value useful for?

maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val; // Val is int
maskedPow2_Value = 0x101010101010101i64 * maskedValue;

(or mov r9, 101010101010101h; imul rdx, r9; twice)

A complete example (the function is named CompressPacket, but the name may be misleading), as a result of decompilation by IDA:

/*
 * IDA (Hex-Rays) pseudo-C for a byte-fill routine: it stores the low byte of
 * Val into the Size bytes of the buffer at Dst and returns Dst.
 *
 * NOTE(review): the name CompressPacket looks wrong; the accompanying
 * disassembly carries a "j_memset" cross-reference, so this is presumably the
 * C runtime memset -- confirm against the binary.
 *
 * NOTE(review): this is decompiler output, not original source.  The
 * decompiler reuses the single variable bufferOut for a register (rcx) that
 * holds different values on different paths, so some expressions below do not
 * read correctly when taken literally; see the notes at the affected lines.
 *
 * Dst   destination buffer.
 * Val   fill value; only its low 8 bits are used.
 * Size  number of bytes to fill.
 * Returns Dst.
 */
void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
{
  // The trailing comments on the locals are the registers IDA reports for them.
  __int64 maskedPow2_Value; // rdx
  unsigned int v5; // ecx
  __int64 *bufferOut; // rcx
  size_t size_; // r9
  size_t i; // r9
  size_t size__; // r9
  size_t counter; // r8
  size_t j; // r9
  void *result; // rax
  __m128i v13; // xmm0
  __int64 lsb4; // rax
  size_t counter1; // r9
  size_t k; // r9
  size_t lsb4_; // r8
  __int64 maskedValue; // rdx

  // Keep only the low byte of Val, zero-extended (movzx edx, dl in the listing).
  *(_QWORD *)&Val = (unsigned __int8)Val;
  // Multiplying a single byte b by 0x0101010101010101 yields b repeated in all
  // 8 bytes of the 64-bit result, so one integer store can write 8 fill bytes.
  maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val;
  // One-past-the-end pointer: the small-size cases below address their stores
  // backwards from the end of the buffer.
  bufferOut = (__int64 *)((char *)Dst + Size);
  result = Dst;
  // Sizes 0..15 are fully handled by the switch using at most four stores of
  // width 8/4/2/1.  Cases 9..15 first store 8 bytes at the start of the buffer
  // (end - Size) and then share the code that fills the remaining Size - 8
  // bytes with the corresponding case 1..7.
  switch ( Size )
  {
    case 0ui64:
      return result;
    case 1ui64:
      goto LBL_1_F;
    case 2ui64:
      goto LBL_2_E;
    case 3ui64:
      goto LBL_3_F;
    case 4ui64:
      goto LBL_4_C;
    case 5ui64:
      goto LBL_5_D;
    case 6ui64:
      goto LBL_6_E;
    case 7ui64:
      goto LBL_7_F;
    case 8ui64:
      *(bufferOut - 1) = maskedValue;
      return result;
    case 9ui64:
      *(__int64 *)((char *)bufferOut - 9) = maskedValue;
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    case 0xAui64:
      *(__int64 *)((char *)bufferOut - 10) = maskedValue;
      *((_WORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xBui64:
      *(__int64 *)((char *)bufferOut - 11) = maskedValue;
      goto LBL_3_F;
    case 0xCui64:
      *(__int64 *)((char *)bufferOut - 12) = maskedValue;
LBL_4_C:
      // Last 4 bytes of the buffer.
      *((_DWORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xDui64:
      *(__int64 *)((char *)bufferOut - 13) = maskedValue;
LBL_5_D:
      // Last 5 bytes: 4 bytes at end-5, then the final byte.
      *(_DWORD *)((char *)bufferOut - 5) = maskedValue;
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    case 0xEui64:
      *(__int64 *)((char *)bufferOut - 14) = maskedValue;
LBL_6_E:
      // Last 6 bytes: 4 bytes at end-6, then falls into the 2-byte store.
      *(_DWORD *)((char *)bufferOut - 6) = maskedValue;
LBL_2_E:
      // Last 2 bytes of the buffer.
      *((_WORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xFui64:
      *(__int64 *)((char *)bufferOut - 15) = maskedValue;
LBL_7_F:
      // Last 7 bytes: 4 at end-7, then falls into 2 at end-3 and 1 at end-1.
      *(_DWORD *)((char *)bufferOut - 7) = maskedValue;
LBL_3_F:
      *(_WORD *)((char *)bufferOut - 3) = maskedValue;
LBL_1_F:
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    default:
      // Size >= 16.  Bits of the global dword_7FFFF4B237D8 select the fill
      // strategy.  NOTE(review): presumably a CPU-feature bitmask set at
      // startup; its definition is not visible here.
      //
      // Bit 1 set: in the disassembly this path is "rep stosb" with
      // rdi = rcx, and rcx still holds Dst there.  NOTE(review): the
      // bufferOut argument shown below therefore looks like a decompiler
      // variable-merging artifact rather than a fill starting at Dst + Size.
      if ( _bittest(dword_7FFFF4B237D8, 1u) )
      {
        memset(bufferOut, maskedValue, Size);
        return Dst;
      }
      // NOTE(review): the disassembly shows a single "imul rdx, r9" on this
      // path (label mset05), applied to the zero-extended byte.  Taken
      // literally, multiplying an already-broadcast value by
      // 0x0101010101010101 again would NOT give a byte broadcast
      // (0x0101010101010101 squared is 0x0807060504030201 mod 2^64), so the
      // "second multiply" is most likely the decompiler merging the imul of
      // the <16 path with this one.  Treat maskedPow2_Value as the byte
      // repeated 8 times.
      maskedPow2_Value = 0x101010101010101i64 * maskedValue;
      // Bit 2 clear: fill with 64-bit integer stores.  Bit 2 set: fall through
      // to the 128-bit XMM path further down ("jb msetxmm10" in the listing).
      if ( !_bittest(dword_7FFFF4B237D8, 2u) )
      {
        if ( Size >= 0x40 )
        {
          // Number of bytes up to the next 8-byte boundary.  In the listing
          // this is "neg rcx / and ecx, 7" with rcx holding Dst.
          v5 = -(int)bufferOut & 7;
          if ( v5 )
          {
            // Cover the misaligned head with one 8-byte store at Dst; the
            // aligned stores that follow overlap part of it.
            Size -= v5;
            *(_QWORD *)Dst = maskedPow2_Value;
          }
          bufferOut = (__int64 *)((char *)Dst + v5);
          size_ = Size;
          Size &= 0x3Fu;
          // Main loop: 64 bytes (eight 8-byte stores) per iteration.  The
          // eighth store appears in the for-increment clause only because of
          // how the decompiler shaped the loop.
          for ( i = size_ >> 6; i; *(bufferOut - 1) = maskedPow2_Value )
          {
            *bufferOut = maskedPow2_Value;
            bufferOut[1] = maskedPow2_Value;
            bufferOut[2] = maskedPow2_Value;
            bufferOut += 8;
            *(bufferOut - 5) = maskedPow2_Value;
            *(bufferOut - 4) = maskedPow2_Value;
            --i;
            *(bufferOut - 3) = maskedPow2_Value;
            *(bufferOut - 2) = maskedPow2_Value;
          }
        }
        // Remainder (or the whole buffer when Size < 0x40): 8-byte stores
        // first, then single bytes for the last Size & 7 bytes.
        size__ = Size;
        counter = Size & 7;
        for ( j = size__ >> 3; j; --j )
          *bufferOut++ = maskedPow2_Value;
        for ( ; counter; --counter )
        {
          *(_BYTE *)bufferOut = maskedPow2_Value;
          bufferOut = (__int64 *)((char *)bufferOut + 1);
        }
        return Dst;
      }
      // XMM path.  The 64-bit broadcast sits in the low 8 bytes of the vector;
      // interleaving those 8 bytes with themselves (punpcklbw xmm0, xmm0)
      // produces 16 copies of the fill byte.
      v13 = _mm_unpacklo_epi8((__m128i)(unsigned __int64)maskedPow2_Value, (__m128i)(unsigned __int64)maskedPow2_Value);
      if ( ((unsigned __int8)bufferOut & 0xF) != 0 )
      {
        // Pointer is not 16-byte aligned: write one 16-byte vector at the
        // current position, then advance to the next 16-byte boundary and
        // shrink Size by the number of bytes skipped (16 - lsb4).
        *(__m128i *)bufferOut = v13;
        lsb4 = (unsigned __int8)bufferOut & 0xF;
        bufferOut = (__int64 *)((char *)bufferOut - lsb4 + 16);
        Size = lsb4 + Size - 16;
      }
      counter1 = Size >> 7;
      if ( Size >> 7 )
      {
        // Main loop: 128 bytes (eight 16-byte stores) per iteration.
        // bufferOut is an __int64 pointer, so "+= 16" advances 128 bytes.
        do
        {
          *(__m128i *)bufferOut = v13;
          *((__m128i *)bufferOut + 1) = v13;
          bufferOut += 16;
          *((__m128i *)bufferOut - 6) = v13;
          *((__m128i *)bufferOut - 5) = v13;
          --counter1;
          *((__m128i *)bufferOut - 4) = v13;
          *((__m128i *)bufferOut - 3) = v13;
          *((__m128i *)bufferOut - 2) = v13;
          *((__m128i *)bufferOut - 1) = v13;
        }
        while ( counter1 );
        Size &= 0x7Fu;
      }
      // Remaining whole 16-byte chunks ("+= 2" on an __int64 pointer is 16 bytes).
      for ( k = Size >> 4; k; --k )
      {
        *(__m128i *)bufferOut = v13;
        bufferOut += 2;
      }
      // Tail of 1..15 bytes: one 16-byte store positioned to end exactly at the
      // end of the buffer, overlapping bytes that were already written.
      lsb4_ = Size & 0xF;
      if ( lsb4_ )
        *(__m128i *)((char *)bufferOut + lsb4_ - 16) = v13;
      return Dst;
  }
}

and the disassembly, by IDA too:

.text:00007FFFF4AF6440 ; void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
.text:00007FFFF4AF6440 CompressPacket  proc near               ; CODE XREF: j_memset↑j
.text:00007FFFF4AF6440                                         ; Concurrency::details::ResourceManager::CreateAllocatedNodeData(void)+49↑p ...
.text:00007FFFF4AF6440                 mov     r11, rcx
.text:00007FFFF4AF6443                 movzx   edx, dl         ; Move with Zero-Extend
.text:00007FFFF4AF6446                 cmp     r8, 10h         ; switch 16 cases
.text:00007FFFF4AF644A                 jb      SetBytes15      ; Jump if Below (CF=1)
.text:00007FFFF4AF6450
.text:00007FFFF4AF6450 def_7FFFF4AF65D2:                       ; jumptable 00007FFFF4AF65D2 default case
.text:00007FFFF4AF6450                 bt      cs:dword_7FFFF4B237D8, 1
.text:00007FFFF4AF6458                 jnb     short mset05    ; Jump if Not Below (CF=0)
.text:00007FFFF4AF645A                 push    rdi
.text:00007FFFF4AF645B                 mov     rdi, rcx
.text:00007FFFF4AF645E                 mov     eax, edx
.text:00007FFFF4AF6460                 mov     rcx, r8
.text:00007FFFF4AF6463                 rep stosb               ; Store String
.text:00007FFFF4AF6465                 pop     rdi
.text:00007FFFF4AF6466                 jmp     short mset60    ; Jump
.text:00007FFFF4AF6468 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6468
.text:00007FFFF4AF6468 mset05:                                 ; CODE XREF: CompressPacket+18↑j
.text:00007FFFF4AF6468                 mov     r9, 101010101010101h
.text:00007FFFF4AF6472                 imul    rdx, r9         ; Signed Multiply
.text:00007FFFF4AF6476                 bt      cs:dword_7FFFF4B237D8, 2 ; Bit Test
.text:00007FFFF4AF647E                 jb      msetxmm10       ; Jump if Below (CF=1)
.text:00007FFFF4AF6484                 cmp     r8, 40h ; '@'   ; Compare Two Operands
.text:00007FFFF4AF6488                 jb      short mset20    ; Jump if Below (CF=1)
.text:00007FFFF4AF648A                 neg     rcx             ; Two's Complement Negation
.text:00007FFFF4AF648D                 and     ecx, 7          ; Logical AND
.text:00007FFFF4AF6490                 jz      short mset10    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6492                 sub     r8, rcx         ; Integer Subtraction
.text:00007FFFF4AF6495                 mov     [r11], rdx
.text:00007FFFF4AF6498
.text:00007FFFF4AF6498 mset10:                                 ; CODE XREF: CompressPacket+50↑j
.text:00007FFFF4AF6498                 add     rcx, r11        ; Add
.text:00007FFFF4AF649B                 mov     r9, r8
.text:00007FFFF4AF649E                 and     r8, 3Fh         ; Logical AND
.text:00007FFFF4AF64A2                 shr     r9, 6           ; Shift Logical Right
.text:00007FFFF4AF64A6                 jnz     short mset80    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64A8
.text:00007FFFF4AF64A8 mset20:                                 ; CODE XREF: CompressPacket+48↑j
.text:00007FFFF4AF64A8                                         ; CompressPacket+CF↓j
.text:00007FFFF4AF64A8                 mov     r9, r8
.text:00007FFFF4AF64AB                 and     r8, 7           ; Logical AND
.text:00007FFFF4AF64AF                 shr     r9, 3           ; Shift Logical Right
.text:00007FFFF4AF64B3                 jz      short mset40    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64B5                 db      66h, 66h
.text:00007FFFF4AF64B5                 xchg    ax, ax          ; Exchange Register/Memory with Register
.text:00007FFFF4AF64B9                 nop                     ; No Operation
.text:00007FFFF4AF64BA
.text:00007FFFF4AF64BA mset30:                                 ; CODE XREF: CompressPacket+84↓j
.text:00007FFFF4AF64BA                 mov     [rcx], rdx
.text:00007FFFF4AF64BD                 add     rcx, 8          ; Add
.text:00007FFFF4AF64C1                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF64C4                 jnz     short mset30    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64C6
.text:00007FFFF4AF64C6 mset40:                                 ; CODE XREF: CompressPacket+73↑j
.text:00007FFFF4AF64C6                 test    r8, r8          ; Logical Compare
.text:00007FFFF4AF64C9                 jz      short mset60    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64CB
.text:00007FFFF4AF64CB mset50:                                 ; CODE XREF: CompressPacket+93↓j
.text:00007FFFF4AF64CB                 mov     [rcx], dl
.text:00007FFFF4AF64CD                 inc     rcx             ; Increment by 1
.text:00007FFFF4AF64D0                 dec     r8              ; Decrement by 1
.text:00007FFFF4AF64D3                 jnz     short mset50    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64D5
.text:00007FFFF4AF64D5 mset60:                                 ; CODE XREF: CompressPacket+26↑j
.text:00007FFFF4AF64D5                                         ; CompressPacket+89↑j
.text:00007FFFF4AF64D5                 mov     rax, r11
.text:00007FFFF4AF64D8                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF64D8 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64D9                 db 0Fh, 1Fh, 80h, 4 dup(0)
.text:00007FFFF4AF64E0                 db 3 dup(66h), 90h
.text:00007FFFF4AF64E4                 db 2 dup(66h), 90h
.text:00007FFFF4AF64E7 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64E7
.text:00007FFFF4AF64E7 mset80:                                 ; CODE XREF: CompressPacket+66↑j
.text:00007FFFF4AF64E7                                         ; CompressPacket+CD↓j
.text:00007FFFF4AF64E7                 mov     [rcx], rdx
.text:00007FFFF4AF64EA                 mov     [rcx+8], rdx
.text:00007FFFF4AF64EE                 mov     [rcx+10h], rdx
.text:00007FFFF4AF64F2                 add     rcx, 40h ; '@'  ; Add
.text:00007FFFF4AF64F6                 mov     [rcx-28h], rdx
.text:00007FFFF4AF64FA                 mov     [rcx-20h], rdx
.text:00007FFFF4AF64FE

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

1 Reply

深蓝 · Answer 1 · 2021-10-06T19:21:30+0000

A common use case is unpacking with zeros to widen 8-bit numbers to 16-bit (with zero-extension), like SSE4.1 pmovzxbw. Or especially to unpack both low and high halves of a 16-byte register to get two vectors of 8x 16-bit elements each. That's kind of the only use case where the "unpack" name makes sense, and packuswb is its inverse, combining 2 registers down to 1. (Or packsswb for signed saturation.)

The "unpack" name is otherwise very strange; it's just a shuffle that interleaves elements from two registers. ARM NEON has a similar shuffle whose mnemonic is "zip".

In your case, it's part of broadcasting a byte into an XMM register, as part of memset, i.e. it's part of what _mm_set1_epi8(x) does.

Multiply with 0x0101010101010101 repeats a byte 8 times in a 64-bit integer. This lets you use scalar-integer stores for an odd 8 bytes (not a multiple of 16), like the mov [r11], rdx store.

Given this 8-byte broadcast as an input (via movq), only one SIMD shuffle is needed. Duplicating the low 8 bytes with punpcklqdq would have been my choice, because 8-byte granularity shuffles are more efficient on really old CPUs like Core 2. But interleaving the bytes with each other is equivalent because they're all the same anyway, resulting in an XMM register that holds 16 copies of the same byte.

In fact, SSE2 can broadcast a dword with one instruction: pshufd xmm0, xmm0, 0, so if not for wanting an 8-byte scalar, it could have just used imul edx, edx, 0x01010101.

Implementing memset with 8-byte mov and 16-byte movups stores of course needs this as an input, if it's using that strategy instead of the rep stosb strategy.

With SSSE3 you can broadcast a single byte directly with one pshufb with an all-zero vector (without needing a multiply first) selecting the 0th element of the source for every element of the destination. Or with AVX2 vpbroadcastb. Skipping the integer multiply step would be fine; you can use movq [mem], xmm0 8-byte stores from xmm0 instead of from RDX.

With a byte at the bottom of an xmm register and garbage in the other elements (i.e. if you didn't use imul), 2x punpcklbw + pshufd can broadcast with just SSE2. Or of course punpcklbw xmm0,xmm0 / punpcklwd xmm0,xmm0 as the first 2 shuffles. Or punpcklbw xmm0,xmm0 / pshuflw xmm0,xmm0, 0 / punpcklqdq xmm0,xmm0.

Categories

assembly - Which are the use case of punpcklbw (interleave in MMX/SSE/AVX)?

assembly - Which are the use case of punpcklbw (interleave in MMX/SSE/AVX)?

Please log in or register to add a comment.

Please log in or register to reply to this article.

1 Reply

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags