If you need signed 32x32 bit integer multiplication then the following example at software.intel.com looks like it should do what you want:
static inline __m128i muly(const __m128i &a, const __m128i &b)
{
__m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
__m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(a,4), _mm_srli_si128(b,4)); /* mul 3,1 */
return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
}
You might want to have two builds - one for old CPUs and one for recent CPUs, in which case you could do the following:
static inline __m128i muly(const __m128i &a, const __m128i &b)
{
#ifdef __SSE4_1__ // modern CPU - use SSE 4.1
return _mm_mullo_epi32(a, b);
#else // old CPU - use SSE 2
__m128i tmp1 = _mm_mul_epu32(a,b); /* mul 2,0*/
__m128i tmp2 = _mm_mul_epu32( _mm_srli_si128(a,4), _mm_srli_si128(b,4)); /* mul 3,1 */
return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
#endif
}
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…