/*
 This is a function that was optimized a little bit
 using asmbench. I only include it in case you
 need some SSE code to play with.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

/* ASM_BENCH */
/*
 * sse_func - small SSE instruction sequence kept as a benchmarking sample.
 *
 * Loads three local 4-float vectors, forms the differences
 * (src_1 - src_2) and (src_2 - src_3), adds each difference to a
 * lane-rotated copy of itself, masks the sums with xmm7, stores them back
 * into src_1 / src_2 and folds them into xmm6 with maxps.
 *
 * Takes no arguments and returns nothing; the results only live in the
 * local arrays and in the xmm registers.
 *
 * NOTE(review): xmm4, xmm5, xmm6 and xmm7 are read before this function
 * ever writes them (xmm7 is used as an AND mask, xmm6 as a running
 * maximum). Presumably the code this was extracted from set them up
 * beforehand - TODO confirm. They are deliberately left untouched here so
 * the benchmarked instruction sequence stays the same.
 *
 * Declared `static inline`: a plain C99/C11 `inline` definition provides no
 * external definition, so any call that is not inlined would fail to link.
 */
static inline
void sse_func(void)
{
  float src_1[4] = {1,2,3,4};
  float src_2[4] = {1,2,3,4};
  float src_3[4] = {1,2,3,4};

  __asm__ __volatile__
  (
    /* Unaligned loads of the three source vectors. The pointers are taken
       from compiler-chosen registers (%0..%2) so the addresses have the
       correct width on both 32-bit and 64-bit x86. */
    "movups (%0), %%xmm0\n"
    "movups (%1), %%xmm1\n"
    "movups (%2), %%xmm2\n"
    /* xmm0 = src_1 - src_2, xmm1 = src_2 - src_3 */
    "subps %%xmm1,  %%xmm0\n"
    "subps %%xmm2,  %%xmm1\n"

    /* Working copies of both differences. */
    "movaps %%xmm0, %%xmm2\n"
    "movaps %%xmm1, %%xmm3\n"
    /* Low two lanes come from the previous xmm4/xmm5 contents, high two
       lanes are lane 3 of xmm0/xmm1. */
    "shufps $0b11110000, %%xmm0, %%xmm4\n"
    "shufps $0b11110000, %%xmm1, %%xmm5\n"
    /* Rotate the copies by one lane: {x3, x0, x1, x2}. */
    "shufps $0b10010011, %%xmm2, %%xmm2\n"
    "shufps $0b10010011, %%xmm3, %%xmm3\n"
    /* Replace lane 0 of each rotated copy with lane 0 of xmm4/xmm5. */
    "movss  %%xmm4, %%xmm2\n"
    "movss  %%xmm5, %%xmm3\n"
    /* Add each difference to its rotated copy; the xmm4/xmm5 reshuffles
       ({x2, x2, x0, x0}) are interleaved with the adds. */
    "addps  %%xmm0, %%xmm2\n"
    "shufps $0b00001010, %%xmm4, %%xmm4\n"
    "addps  %%xmm1, %%xmm3\n"
    "shufps $0b00001010, %%xmm5, %%xmm5\n"
    /* Bitwise AND with the mask held in xmm7 (not set in this function). */
    "andps  %%xmm7, %%xmm2\n"
    "andps  %%xmm7, %%xmm3\n"
    /* Store the masked sums back over src_1 and src_2. */
    "movups %%xmm2, (%0)\n"
    "movups %%xmm3, (%1)\n"
    /* xmm3 = max(xmm3, xmm2); xmm6 = max(xmm6, xmm3) */
    "maxps  %%xmm2, %%xmm3\n"
    "maxps  %%xmm3, %%xmm6\n"
  :
  : "r"(src_1), "r"(src_2), "r"(src_3)
    /* Every xmm register the sequence writes, plus "memory" because the
       arrays are read and written through the pointer operands. */
  : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "memory"
  );
}
