/*
 * Subset of inputs computed with SSE
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 3 or later of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * $Id: inputs.c,v 1.3 2011/10/31 09:41:15 c_anthon Exp $
 */

#include "config.h"
#include "gnubg-types.h"
#include "sse.h"
#include "eval.h"

#if USE_SSE_VECTORIZE
#ifdef USE_SSE2 
#include <emmintrin.h> 
#else
#include <xmmintrin.h> 
#endif
#endif

/*
 * Encoding table for ordinary points: row n holds the four floats that
 * baseInputs() emits for a board entry whose value is n.
 *
 *   n == 0      all four slots are 0
 *   n == 1      only slot 0 is set
 *   n == 2      only slot 1 is set
 *   n >= 3      slot 2 is set and slot 3 holds (n - 3) / 2
 *
 * Only rows 0..15 exist, so a board entry must never exceed 15.
 * The SSE variant of baseInputs() fetches rows with _mm_load_ps, which
 * needs 16-byte-aligned addresses; SSE_ALIGN (defined elsewhere) is
 * presumably what provides that alignment.
 */
SSE_ALIGN(float inpvec[16][4]) = {
    { 0.0f, 0.0f, 0.0f, 0.0f },    /* n =  0 */
    { 1.0f, 0.0f, 0.0f, 0.0f },    /* n =  1 */
    { 0.0f, 1.0f, 0.0f, 0.0f },    /* n =  2 */
    { 0.0f, 0.0f, 1.0f, 0.0f },    /* n =  3 */
    { 0.0f, 0.0f, 1.0f, 0.5f },    /* n =  4 */
    { 0.0f, 0.0f, 1.0f, 1.0f },    /* n =  5 */
    { 0.0f, 0.0f, 1.0f, 1.5f },    /* n =  6 */
    { 0.0f, 0.0f, 1.0f, 2.0f },    /* n =  7 */
    { 0.0f, 0.0f, 1.0f, 2.5f },    /* n =  8 */
    { 0.0f, 0.0f, 1.0f, 3.0f },    /* n =  9 */
    { 0.0f, 0.0f, 1.0f, 3.5f },    /* n = 10 */
    { 0.0f, 0.0f, 1.0f, 4.0f },    /* n = 11 */
    { 0.0f, 0.0f, 1.0f, 4.5f },    /* n = 12 */
    { 0.0f, 0.0f, 1.0f, 5.0f },    /* n = 13 */
    { 0.0f, 0.0f, 1.0f, 5.5f },    /* n = 14 */
    { 0.0f, 0.0f, 1.0f, 6.0f }     /* n = 15 */
};

/*
 * Encoding table for the bar entry: row n holds the four floats that
 * baseInputs() emits for a bar value of n.
 *
 * Unlike inpvec, the first three slots are cumulative: slot k (k = 0..2)
 * is 1 whenever n > k.  Slot 3 is 0 for n < 3 and (n - 3) / 2 otherwise.
 *
 * Only rows 0..15 exist, so the bar value must never exceed 15.
 * The SSE variant of baseInputs() fetches rows with _mm_load_ps, which
 * needs 16-byte-aligned addresses; SSE_ALIGN (defined elsewhere) is
 * presumably what provides that alignment.
 */
SSE_ALIGN(float inpvecb[16][4]) = {
    { 0.0f, 0.0f, 0.0f, 0.0f },    /* n =  0 */
    { 1.0f, 0.0f, 0.0f, 0.0f },    /* n =  1 */
    { 1.0f, 1.0f, 0.0f, 0.0f },    /* n =  2 */
    { 1.0f, 1.0f, 1.0f, 0.0f },    /* n =  3 */
    { 1.0f, 1.0f, 1.0f, 0.5f },    /* n =  4 */
    { 1.0f, 1.0f, 1.0f, 1.0f },    /* n =  5 */
    { 1.0f, 1.0f, 1.0f, 1.5f },    /* n =  6 */
    { 1.0f, 1.0f, 1.0f, 2.0f },    /* n =  7 */
    { 1.0f, 1.0f, 1.0f, 2.5f },    /* n =  8 */
    { 1.0f, 1.0f, 1.0f, 3.0f },    /* n =  9 */
    { 1.0f, 1.0f, 1.0f, 3.5f },    /* n = 10 */
    { 1.0f, 1.0f, 1.0f, 4.0f },    /* n = 11 */
    { 1.0f, 1.0f, 1.0f, 4.5f },    /* n = 12 */
    { 1.0f, 1.0f, 1.0f, 5.0f },    /* n = 13 */
    { 1.0f, 1.0f, 1.0f, 5.5f },    /* n = 14 */
    { 1.0f, 1.0f, 1.0f, 6.0f }     /* n = 15 */
};

#if USE_SSE_VECTORIZE
/*
 * Fill arInput with the base encoding of anBoard (SSE variant).
 *
 * anBoard is read as 50 consecutive entries starting at anBoard[0][0]:
 * 24 point entries followed by one bar entry, twice over.  Point entries
 * are translated through inpvec and bar entries through inpvecb.  Each
 * entry produces four consecutive floats, so 2 * 25 * 4 = 200 floats are
 * written to arInput.
 *
 * Requirements that follow from the code:
 *  - each board entry is used directly as a row index into a 16-row
 *    table, so it must lie in 0..15;
 *  - rows are moved with the aligned intrinsics _mm_load_ps and
 *    _mm_store_ps, so arInput must be 16-byte aligned.
 */
extern void
baseInputs(const TanBoard anBoard, float arInput[])
{
	const unsigned int *pnEntry = &anBoard[0][0];
	float *pfOut = arInput;
	int iSide;
	int iPoint;

	for (iSide = 0; iSide < 2; ++iSide) {
		/* points: one aligned four-float copy per board entry */
		for (iPoint = 0; iPoint < 24; ++iPoint) {
			_mm_store_ps(pfOut, _mm_load_ps(inpvec[*pnEntry++]));
			pfOut += 4;
		}

		/* bar: same copy, but taken from the bar table */
		_mm_store_ps(pfOut, _mm_load_ps(inpvecb[*pnEntry++]));
		pfOut += 4;
	}
}
#else
/*
 * Fill arInput with the base encoding of anBoard (portable variant).
 *
 * For each of the two rows of anBoard, entries 0..23 (the points) are
 * translated through inpvec and entry 24 (the bar) through inpvecb.
 * Each entry produces four consecutive floats, so every row occupies
 * 25 * 4 floats of arInput and 200 floats are written in total.
 *
 * Each board entry is used directly as a row index into a 16-row table,
 * so it must lie in 0..15.
 */
extern void
baseInputs(const TanBoard anBoard, float arInput[])
{
  int iSide;

  for (iSide = 0; iSide < 2; ++iSide) {
    const unsigned int *pnRow = anBoard[iSide];
    float *pfOut = arInput + iSide * 25 * 4;
    const float *pfCode;
    int iPoint, k, nCount;

    /* Points */
    for (iPoint = 0; iPoint < 24; ++iPoint, pfOut += 4) {
      nCount = pnRow[iPoint];
      pfCode = inpvec[nCount];
      for (k = 0; k < 4; ++k)
        pfOut[k] = pfCode[k];
    }

    /* Bar: pfOut now points at slot 24 of this side */
    nCount = pnRow[24];
    pfCode = inpvecb[nCount];
    for (k = 0; k < 4; ++k)
      pfOut[k] = pfCode[k];
  }
}
#endif 
