// à rajouter dans main.c

#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>

// des define utiles
#define decd(va,vb) _mm_or_si128 (_mm_srli_si128(va,1),_mm_slli_si128(vb,15))  // va=XS ; vb=XS+1 ;points a gauche (+1)
#define decg(va,vb) _mm_or_si128 (_mm_slli_si128(va,1),_mm_srli_si128(vb,15)) //va = XS ; vb=XS-1 ; points à droite (-1)
#define ld16(a) 	_mm_load_si128(&a)			//chargement aligné
#define st16(a, b)	_mm_store_si128(&a, b)		//rangement aligné
#define or(a,b) 		_mm_or_si128(a,b)			//ou logique
#define xor(a,b) 	_mm_xor_si128(a,b)			//ou exclusif 
#define maxbu(a,b)	_mm_max_epu8(a,b)			// max 8 bits non signés
#define minbu(a,b)	_mm_min_epu8(a,b)			// min 8 bits non signés
#define addh(a,b) 	_mm_add_epi16(a,b)			// addition 16 bits signée
#define addhu(a,b) 	_mm_add_epu16(a,b)			// addition 16 bits non signée
#define subh(a,b) 	_mm_sub_epi16(a,b)			//soustraction 16 bits signée
#define b2hl(a,b)	_mm_unpacklo_epi8 (a,b)	//4 octets bas entrelacés vers 4 shorts
#define b2hh(a,b)	_mm_unpackhi_epi8 (a,b)	//4 octets haut entrelacés vers 4 shorts
#define h2b(a,b)	_mm_packus_epi16(a,b)	//8 shorts (a) dans 8 octets bas
							//8 shorts (b) dans 8 octets haut
#define srli128(a,v)	_mm_srli_si128(a,v)	// décalage droite des 128 bits de a de v octets
#define slli128(a,v)	_mm_slli_si128(a,v)	// décalage gauche des 128 bits de a de v octets 
#define srai16(a,v)	_mm_srai_epi16(a,v)	// déc. droite des8x 16 bits de a de v bits
#define slli16(a,v)	_mm_slli_epi16(a,v)	//déc. gauche des 8x16 bits de a de v bits

// à changer dans nrutil.c

// ------------------------------------------------ 
byte** bmatrix(long nrl, long nrh, long ncl, long nch)
// ------------------------------------------------ 
// allocate an byte matrix with subscript range m[nrl..nrh][ncl..nch] 
{
  long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
  byte **m;

  /* allocate pointers to rows */
  m=(byte **) _mm_malloc((size_t)((nrow+NR_END)*sizeof(byte*)),16);
  if (!m) nrerror("allocation failure 1 in bmatrix()");
  m += NR_END;
  m -= nrl;

  /* allocate rows and set pointers to them */
  m[nrl]=(byte *) _mm_malloc((size_t)((nrow*ncol+NR_END)*sizeof(byte)),16);
  if (!m[nrl]) nrerror("allocation failure 2 in bmatrix()");
  m[nrl] += NR_END;
  m[nrl] -= ncl;

  for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;

  /* return pointer to array of pointers to rows */
  return m;
}



/* ---------------------------------------------------------------- */
void lapl(byte **X, long i0, long i1, long j0, long j1, byte **Y)
/* ---------------------------------------------------------------- */
{
  int i, j, tmp;
 
  for(i=i0+1; i<=i1-1; i++) {

    for(j=j0+1; j<=j1-1; j++) {
			
      tmp= (int) ((X[i][j]<<2)  -  (X[i-1][j] + X[i+1][j] + X[i][j-1] + X[i][j+1]));
      Y[i][j]= (unsigned char) ((tmp +128)) ;
      
    }

  }
  
}


/* ---------------------------------------------------------------- */
void lapl_simd(byte **X, long i0, long i1, long j0, long j1, byte **Y)
/* ---------------------------------------------------------------- */
{
  int i, j;
  __m128i aij, aijm, aijp, aimj, aipj;
  __m128i aijh, aijmh, aijph, aimjh, aipjh, aijb, aijmb, aijpb, aimjb, aipjb, zero, tmph, tmpb, biais;
  __m128i **XS, **YS;
  short val=128;

  XS=X; YS=Y; 
  
 biais = _mm_set1_epi16(val);
 zero = _mm_set1_epi16 (0);
  for(i=i0+1; i<=(i1-1); i++) {

    for(j=j0+1; j<=(j1-1)/16; j++) {
		aimj=ld16(XS[i-1][j]);					//i-1,j simd
		aij=ld16(XS[i][j]);					//i, j simd
		aipj=ld16(XS[i+1][j]);					//i+1, j simd
		
		aijm=decd(aij,ld16(XS[i][j+1]));		//i, j-1 simd
		aijp=decg(aij,ld16(XS[i][j-1]));		//i, j+1 simd

		aimjb = b2hl(aimj,zero);
		aijb = b2hl(aij,zero);
		aipjb = b2hl(aipj,zero);
		aijmb = b2hl(aijm,zero);
		aijpb = b2hl(aijp,zero);
		
		aimjh = b2hh(aimj,zero);
		aijh = b2hh(aij,zero);
		aipjh = b2hh(aipj,zero);
		aijmh = b2hh(aijm,zero);
		aijph = b2hh(aijp,zero);

		tmpb= addh (aimjb, aipjb);
		tmpb = addh (tmpb, aijmb);
		tmpb = addh (tmpb, aijpb);
		tmpb = subh (slli16(aijb,2), tmpb);
		tmpb=addh(tmpb,biais);
		
		tmph= addh (aimjh, aipjh);
		tmph = addh (tmph, aijmh);
		tmph = addh (tmph, aijph);
		tmph = subh (slli16(aijh,2), tmph);
		tmph= addh(tmph,biais);
		tmpb = h2b(tmpb,tmph);
		st16(YS[i][j], tmpb); 
    
    }
  }
}

