# SSE优化在数学库中的应用之二

引自：http://hi.baidu.com/sige_online/blog/item/d8fdfffc8f0033f7fd037fac.html

#include <xmmintrin.h>

// Three SSE values; the scalar counterpart of this snippet uses float[4] for each.
__m128 a , b , c;

// A single intrinsic call adds a and b and stores the result in c.
c = _mm_add_ps( a , b );

// Scalar version: three arrays of four floats each.
float a[4] , b[4] , c[4];

// Add the arrays element by element, one float per iteration.
for( int i = 0 ; i < 4 ; ++ i )

c[i] = a[i] + b[i];

// Expression to be vectorised.
a = b * c + d / e;

// The same expression written with SSE intrinsics: each operator maps to one
// intrinsic (* -> _mm_mul_ps, / -> _mm_div_ps, + -> _mm_add_ps).
__m128 a = _mm_add_ps( _mm_mul_ps( b , c ) , _mm_div_ps( d , e ) );

// Bit pattern of the float 1.0f.
#define FP_ONE_BITS 0x3F800000

// FP_INV(r, p): r = approximately 1/p, without using a division.
//
//   r  float lvalue that receives the result (assigned exactly once).
//   p  float expression; evaluated exactly once, so it may be an rvalue,
//      have side effects, or be the same variable as r.
//
// The initial guess is obtained by subtracting the bit pattern of p from
// twice the bit pattern of 1.0f; one Newton-Raphson step, r * (2 - p * r),
// then refines it. The result is an approximation, not a correctly rounded
// reciprocal.
//
// The bits are reinterpreted through a union rather than a pointer cast to
// avoid a strict-aliasing violation, and the subtraction is done in unsigned
// arithmetic so it cannot overflow. Assumes float and unsigned int are both
// 32 bits wide. The do/while(0) wrapper makes the macro behave as a single
// statement (safe in if/else); terminate the invocation with a semicolon.
#define FP_INV(r,p)                                                        \
do {                                                                       \
    const float fp_inv_p_ = (p);                                           \
    union { float f; unsigned int u; } fp_inv_bits_;                       \
    float fp_inv_est_;                                                     \
    fp_inv_bits_.f = fp_inv_p_;                                            \
    fp_inv_bits_.u = 2u * FP_ONE_BITS - fp_inv_bits_.u;                    \
    fp_inv_est_ = fp_inv_bits_.f;                                          \
    (r) = fp_inv_est_ * (2.0f - fp_inv_p_ * fp_inv_est_);                  \
} while (0)

// Specialisation of __rcp for float: returns an approximation of 1/a.
//
// Starts from the estimate produced by _mm_rcp_ss and refines it with one
// Newton-Raphson step, r' = 2*r - a*r*r.
//
// NOTE(review): an explicit specialisation needs the primary template
// `__rcp` to be declared before this point -- confirm it is.
template<>
float __rcp<float>( const float& a ) {
    // Load a into an SSE value and take the reciprocal estimate.
    const __m128 estimate = _mm_rcp_ss( _mm_load_ss( &a ) );

    // Plain local rather than `register`: its address is taken below, which
    // `register` forbids in C, and the specifier is removed in C++17.
    float r = 0.0f;
    _mm_store_ss( &r, estimate );

    /* [2 * rcp(x) - (x * rcp(x) * rcp(x))] */
    r = 2.0f * r - ( a * r * r );

    return r;
}

原理一致，只不过我们还可以用_mm_rcp_ps并行求四分量的倒数。如果你还对SSE的威力有所保留，那我建议你设计一个测试单元测试一下使用除法求倒数与使用SSE求倒数，看效率到底是谁更高、高多少。当然，我自己已经测试过很多次了。