#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>

//ICC Linux
long long readTSC (void);

/*
 * Reads the processor time-stamp counter with the RDTSC instruction and
 * returns it as a 64-bit tick count.
 *
 * RDTSC delivers the counter split across two registers: the low 32 bits
 * in EAX and the high 32 bits in EDX.  The halves are captured separately
 * and recombined here.  A single "=A" output operand only denotes the
 * EDX:EAX pair on 32-bit x86; on x86-64 it selects just one register, which
 * would silently drop the upper half of the counter.
 *
 * __asm__/__volatile__ are used instead of asm/volatile so the code also
 * builds in strict ISO modes (e.g. -std=c11), where "asm" is not a keyword.
 */
long long readTSC (void)
{
  unsigned int lo, hi;

  __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));

  return (long long) (((unsigned long long) hi << 32) | lo);
}
/*
 * Returns the current readTSC() value converted to double, so that two
 * readings can be subtracted and scaled with floating-point arithmetic.
 * The unit is whatever readTSC() counts, not seconds.
 */
double dtime()
{
  const long long ticks = readTSC();

  return (double) ticks;
}


/* Number of 4-float groups handled by each kernel (one __m128 per group). */
#define N 1000
/* Number of timed repetitions of each kernel. */
#define MAXITER 10

/*
 * Operands and result of the scalar kernel: 4*N floats each.
 * The alignment attribute is written on every array separately: when it
 * trails a comma-separated declarator list it only applies to the last
 * identifier, which would leave the other arrays with default alignment.
 */
float A[4*N] __attribute__((aligned(16)));
float B[4*N] __attribute__((aligned(16)));
float C[4*N] __attribute__((aligned(16)));

/* Operands and result of the SSE kernel: N vectors of four floats each. */
__m128 TA[N] __attribute__((aligned(16)));
__m128 TB[N] __attribute__((aligned(16)));
__m128 TC[N] __attribute__((aligned(16)));

/* Start timestamp of the measurement currently in progress. */
double t1;

/*
 * Scalar reference benchmark.
 *
 * Runs C[i] = A[i]*A[i] + B[i]*B[i] over all 4*N floats, MAXITER times,
 * bracketing each pass with dtime() readings (the start reading is kept in
 * the global t1).  Once every pass is done, prints one line per pass with
 * the elapsed count divided by N.  Note that the divisor is N while the
 * loop covers 4*N floats, so the printed figure is per group of four
 * floats even though the message says "par element".
 */
void noSSE () {
  double elapsed[MAXITER];
  int run = 0;

  while (run < MAXITER) {
    int idx;

    t1 = dtime();
    for (idx = 0; idx != 4*N; ++idx) {
      C[idx] = A[idx]*A[idx] + B[idx]*B[idx];
    }
    elapsed[run] = dtime() - t1;
    ++run;
  }

  /* Reporting is deferred so that printf does not disturb the timed passes. */
  for (run = 0; run != MAXITER; ++run) {
    printf ("Sans SSE \t Temps calcul par element %E  pour N = %d \n", elapsed[run]/N, N);
  }
}

/*
 * SSE benchmark, counterpart of noSSE().
 *
 * Computes TC[i] = TA[i]*TA[i] + TB[i]*TB[i] over N four-float vectors,
 * MAXITER times, bracketing each pass with dtime() readings (the start
 * reading is kept in the global t1).  Once every pass is done, prints one
 * line per pass with the elapsed count divided by N, i.e. per vector of
 * four floats.
 *
 * The squared operands are held in locals rather than written back to TA
 * and TB.  Writing them back would re-square the inputs on every one of
 * the MAXITER passes and would add two stores per vector that the scalar
 * kernel does not perform, so the two timings would not measure the same
 * work.
 */
void withSSE () {
  int i,k;
  double tex[MAXITER];

  for (k=0; k <MAXITER; k++) {
    t1=dtime();
    for ( i = 0; i < N; i++ ){
      const __m128 a2 = _mm_mul_ps(TA[i],TA[i]);
      const __m128 b2 = _mm_mul_ps(TB[i],TB[i]);
      TC[i]= _mm_add_ps(a2,b2);
    }
    tex[k]=dtime()-t1 ;
  }
  /* Reporting is deferred so that printf does not disturb the timed passes. */
  for (k=0; k <MAXITER; k++)
    printf ("Intrinsics SSE \t Temps calcul par element %E  pour N = %d \n", tex[k]/N, N);
}

/*
 * Program entry point: runs the scalar benchmark first, then the SSE
 * benchmark; each one prints its own timing lines.
 * The return type is spelled out because implicit int was removed in C99.
 */
int main (void) {
  noSSE();
  withSSE();
  return 0;
}



