[COLOR="Red"]Example 5-15 Replacing the Streaming SIMD Extensions Code with the MMX
Technology Code[/COLOR]
; Loop-body excerpt (Intel syntax, 32-bit x86, SSE + MMX).
; Tracks a running per-lane maximum in xmm0 and, via a compare mask converted
; to 16-bit words, a per-lane index word in mm2.
; The loopMax label and all register setup are outside this excerpt.
; Presumably xmm1 holds the four floats at [esi+ecx] on entry (the body
; reloads xmm1 for the next pass) -- TODO confirm against the loop prologue.
; mm3 / mm4 are described by the original comments as the current indices
; and the index increment; their initial values are not visible here.
cmpnleps xmm1, xmm0 ;xmm1 = per-lane mask: all ones where xmm1 is
;not <= xmm0 (candidate beats the running max), else zero
maxps xmm0, [esi+ecx];xmm0 = per-lane max of xmm0 and the four floats in memory
movaps [esi+ecx], xmm1;spill the 128-bit mask so MMX can read it back.
;NOTE(review): this writes over the four floats maxps just read from the
;same address -- confirm the input buffer may be clobbered.
movq mm1, [esi+ecx];mm1 = low 64 bits of the mask (lanes 0 and 1)
add ecx, 16 ;advance to the next four floats; the ZF set here is what
;the final jnz tests (the SIMD ops in between leave EFLAGS alone)
movaps xmm1, [esi+ecx];load next four floats (movaps needs 16-byte alignment)
packssdw mm1, [esi+ecx-8];[esi+ecx-8] is now the high half of the spilled
;mask; pack the four dword masks into four words (0 / -1 survive saturation)
pand mm1, mm3 ;keep the index words only in lanes whose mask is set
paddw mm3, mm4 ;step the index words by mm4 for the next pass
pmaxsw mm2, mm1 ;mm2 = signed word max of mm2 and the masked indices
jnz loopMax ;repeat until the add above brought ecx to zero

Example 5-16 Typical Dot Product Implementation


// Loop-body excerpt of a single-precision dot product (AT&T syntax,
// 32-bit x86, SSE), unrolled four times.
// Each pass multiplies four 16-byte vectors at (%eax,%ecx,4) by the matching
// vectors at (%ebx,%ecx,4) and adds every product vector into %xmm7.
// The inner_loop label, the initial %ecx and %xmm7 values, and any later
// reduction of the four lanes of %xmm7 are outside this excerpt.
// All loads use movaps, so every address must be 16-byte aligned.
// All four addps write %xmm7, so each one depends on the previous one.
movaps (%eax,%ecx,4), %xmm0 // 1st vector: 4 floats at eax + ecx*4
movaps (%ebx,%ecx,4), %xmm1 // matching 4 floats at ebx + ecx*4
mulps %xmm1, %xmm0 // xmm0 = lane-wise products
addps %xmm0, %xmm7 // accumulate into xmm7
movaps 16(%eax,%ecx,4), %xmm2 // 2nd vector: next 16 bytes

movaps 16(%ebx,%ecx,4), %xmm3
mulps %xmm3, %xmm2
addps %xmm2, %xmm7
movaps 32(%eax,%ecx,4), %xmm4 // 3rd vector: byte offset 32
movaps 32(%ebx,%ecx,4), %xmm5
mulps %xmm5, %xmm4
addps %xmm4, %xmm7
movaps 48(%eax,%ecx,4), %xmm6 // 4th vector: byte offset 48
movaps 48(%ebx,%ecx,4), %xmm0 // xmm0 is reused here for the ebx-side operand
mulps %xmm6, %xmm0 // product lands in xmm0 instead of xmm6; same result
addps %xmm0, %xmm7
subl $16, %ecx // step the index by 16 elements (16 * 4 = 64 bytes, the four vectors above)
jnz inner_loop // loop until the subl leaves ecx at zero