 


#include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>


/* ICC on Windows
#define CPUID __asm __emit 0fh __asm __emit 0a2h
#define RDTSC __asm __emit 0fh __asm __emit 031h

unsigned __int64 readTSC(){
		unsigned cycles_low, cycles_high;

		__asm {
				pushad
				CPUID			
				RDTSC
				mov	cycles_high, edx			
				mov	cycles_low, eax	
				popad
				}
			return ((unsigned __int64)cycles_high << 32) | cycles_low;
	}
// ICC Windows (end) */

//ICC Linux
/* Forward declaration so the definition below has a visible prototype. */
long long readTSC(void);

/*
 * Read the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the 64-bit counter split across EDX (high half) and EAX
 * (low half). The two halves are captured in separate outputs and then
 * recombined: the "=A" constraint only denotes the EDX:EAX pair on 32-bit
 * x86, whereas on x86-64 it selects a single 64-bit register and the upper
 * 32 bits of the counter are lost.
 *
 * Returns the counter value as a long long.
 */
long long readTSC(void)
{
  unsigned int lo, hi;

  /* 0x0f,0x31 is the RDTSC opcode, emitted as raw bytes.
     __asm__/__volatile__ are used instead of asm/volatile so the code
     also builds in strict ISO modes (-std=c99/c11). */
  __asm__ __volatile__ (".byte 0x0f,0x31" : "=a" (lo), "=d" (hi));

  return (long long) (((unsigned long long) hi << 32) | lo);
}
/*
 * Current readTSC() value converted to double, so that callers can
 * subtract two readings using floating-point arithmetic.
 */
double dtime()
{
  const long long ticks = readTSC();

  return (double) ticks;
}
// ICC Linux (end)


/* Dimension of the square matrices and length of the vectors. */
#define N 100

/* Integer operands. The matrix benchmarks read AI and XI and write YI;
   the dot-product benchmark reads BI and CI; SI is the scalar accumulator. */
int AI[N][N];
int YI[N][N];
int XI[N][N];
int BI[N];
int CI[N];
int SI;

/* Single-precision counterparts of the integer operands above. */
float AF[N][N];
float YF[N][N];
float XF[N][N];
float BF[N];
float CF[N];
float SF;

/* dtime() reading taken at the start of a run, and the elapsed difference. */
double begin;
double benchtime;

/* Smaller of two ints; yields b when the two arguments are equal. */
int min (int a,int b)
{
  return (a < b) ? a : b;
}

/*
 * Benchmark: single-precision dot product of BF and CF, accumulated in SF.
 *
 * The measurement is repeated 10 times. Each run prints N followed by the
 * dtime() difference across the loop divided by N (i.e. per loop
 * iteration). A blank line is printed after the last run.
 *
 * Returns 0.
 */
int main_PS_F(void)
{
	int i, m;

	for (m = 0; m < 10; m++) {
		begin = dtime();
		SF = 0.0;
		for (i = 0; i < N; i++) {
			SF += BF[i] * CF[i];
		}
		benchtime = dtime() - begin;
		printf("TE_PS_F %d  %f \n", N, benchtime / N);
	}
	printf("\n");
	return 0;
}

/*
 * Benchmark: integer dot product of BI and CI, accumulated in SI.
 *
 * The measurement is repeated 10 times. Each run prints N followed by the
 * dtime() difference across the loop divided by N (i.e. per loop
 * iteration). A blank line is printed after the last run.
 *
 * Returns 0.
 */
int main_PS_I(void)
{
	int i, m;

	for (m = 0; m < 10; m++) {
		begin = dtime();
		SI = 0;
		for (i = 0; i < N; i++) {
			SI += BI[i] * CI[i];
		}
		benchtime = dtime() - begin;
		printf("TE_PS_I %d  %f \n", N, benchtime / N);
	}
	printf("\n");
	return 0;
}

/*
 * Benchmark: single-precision matrix product YF = AF * XF with the loops
 * nested in i, j, k order. Each output element is accumulated in SF and
 * then stored.
 *
 * The measurement is repeated 10 times. Each run prints N followed by the
 * dtime() difference across the loop nest divided by N*N (i.e. per output
 * element). A blank line is printed after the last run.
 *
 * Returns 0.
 */
int main_MM_ijk_F(void)
{
	int i, j, k, m;

	for (m = 0; m < 10; m++) {
		begin = dtime();
		for (i = 0; i < N; i++) {
			for (j = 0; j < N; j++) {
				SF = 0.0;
				for (k = 0; k < N; k++) {
					SF += AF[i][k] * XF[k][j];
				}
				YF[i][j] = SF;
			}
		}
		benchtime = dtime() - begin;
		printf("TE_ijk_F %d  %f \n", N, benchtime / (N * N));
	}
	printf("\n");
	return 0;
}

/*
 * Benchmark: integer matrix product YI = AI * XI with the loops nested in
 * i, j, k order. Each output element is accumulated in SI and then stored.
 *
 * The measurement is repeated 10 times. Each run prints N followed by the
 * dtime() difference across the loop nest divided by N*N (i.e. per output
 * element). A blank line is printed after the last run.
 *
 * Returns 0.
 */
int main_MM_ijk_I(void)
{
	int i, j, k, m;

	for (m = 0; m < 10; m++) {
		begin = dtime();
		for (i = 0; i < N; i++) {
			for (j = 0; j < N; j++) {
				SI = 0;
				for (k = 0; k < N; k++) {
					SI += AI[i][k] * XI[k][j];
				}
				YI[i][j] = SI;
			}
		}
		benchtime = dtime() - begin;
		printf("TE_ijk_I %d  %f \n", N, benchtime / (N * N));
	}
	printf("\n");
	return 0;
}

/*
 * Benchmark: single-precision matrix multiply-accumulate YF += AF * XF with
 * the loops nested in i, k, j order. AF[i][k] is held in SF while the
 * innermost loop walks row k of XF and row i of YF.
 *
 * NOTE: YF is not cleared before accumulating, so every run adds another
 * AF * XF product on top of whatever YF already contains.
 *
 * The measurement is repeated 10 times. Each run prints N followed by the
 * dtime() difference across the loop nest divided by N*N. A blank line is
 * printed after the last run.
 *
 * Returns 0.
 */
int main_MM_ikj_F(void)
{
	int i, j, k, m;

	for (m = 0; m < 10; m++) {
		begin = dtime();
		for (i = 0; i < N; i++) {
			for (k = 0; k < N; k++) {
				SF = AF[i][k];
				for (j = 0; j < N; j++) {
					YF[i][j] += SF * XF[k][j];
				}
			}
		}
		benchtime = dtime() - begin;
		printf("TE_ikj_F %d  %f \n", N, benchtime / (N * N));
	}
	printf("\n");
	return 0;
}

/*
 * Benchmark: integer matrix multiply-accumulate YI += AI * XI with the
 * loops nested in i, k, j order. AI[i][k] is held in SI while the innermost
 * loop walks row k of XI and row i of YI.
 *
 * NOTE: YI is not cleared before accumulating, so every run adds another
 * AI * XI product on top of whatever YI already contains.
 *
 * The measurement is repeated 10 times. Each run prints N followed by the
 * dtime() difference across the loop nest divided by N*N. A blank line is
 * printed after the last run.
 *
 * Returns 0.
 */
int main_MM_ikj_I(void)
{
	int i, j, k, m;

	for (m = 0; m < 10; m++) {
		begin = dtime();
		for (i = 0; i < N; i++) {
			for (k = 0; k < N; k++) {
				SI = AI[i][k];
				for (j = 0; j < N; j++) {
					YI[i][j] += SI * XI[k][j];
				}
			}
		}
		benchtime = dtime() - begin;
		printf("TE_ikj_I %d  %f \n", N, benchtime / (N * N));
	}
	printf("\n");
	return 0;
}




/*
 * Program entry point: runs each benchmark once, in this order:
 * both dot products, then the float matrix products (ijk, ikj), then the
 * integer matrix products (ijk, ikj).
 *
 * Returns 0.
 */
int main(void)
{
	main_PS_F();
	main_PS_I();
	main_MM_ijk_F();
	main_MM_ikj_F();
	main_MM_ijk_I();
	main_MM_ikj_I();

	return 0;
}










