Brainstorming: Noise optimization

Code:

/** Proposed bulk-generation interface for cubic noise.
Each GenerateND() call fills a caller-supplied array of doubles with noise values for a range of
noise-space coordinates, instead of the caller evaluating the noise one point at a time.
Only the declarations are given here.
NOTE(review): whether the a_EndN coordinates are inclusive or exclusive is not specified - confirm
when implementing.
NOTE(review): the behavior when a_Workspace is left at its NULL default is not specified either;
presumably the function then has to provide its own scratch buffer - confirm when implementing. */
class cCubicNoise
{
public:
    /** Generates a 1D strip of noise values into a_Array. */
    void Generate1D(
        double * a_Array,                ///< Array to generate into
        int a_SizeX,                     ///< Size of the array (num doubles)
        double a_StartX, double a_EndX,  ///< Noise-space coords of the array
        double * a_Workspace = NULL      ///< Workspace that this function can use and trash, same size as a_Array
    );

    /** Generates a 2D grid of noise values into a_Array; the value for [x, y] is stored at index x + a_SizeX * y. */
    void Generate2D(
        double * a_Array,                ///< Array to generate into [x + a_SizeX * y]
        int a_SizeX, int a_SizeY,        ///< Size of the array (num doubles), in each direction
        double a_StartX, double a_EndX,  ///< Noise-space coords of the array in the X direction
        double a_StartY, double a_EndY,  ///< Noise-space coords of the array in the Y direction
        double * a_Workspace = NULL      ///< Workspace that this function can use and trash, same size as a_Array
    );

    /** Generates a 3D grid of noise values into a_Array; the value for [x, y, z] is stored at index
    x + a_SizeX * y + a_SizeX * a_SizeY * z. */
    void Generate3D(
        double * a_Array,                       ///< Array to generate into [x + a_SizeX * y + a_SizeX * a_SizeY * z]
        int a_SizeX, int a_SizeY, int a_SizeZ,  ///< Size of the array (num doubles), in each direction
        double a_StartX, double a_EndX,         ///< Noise-space coords of the array in the X direction
        double a_StartY, double a_EndY,         ///< Noise-space coords of the array in the Y direction
        double a_StartZ, double a_EndZ,         ///< Noise-space coords of the array in the Z direction
        double * a_Workspace = NULL             ///< Workspace that this function can use and trash, same size as a_Array
    );
} ;



// Same interface for the other noise classes.

/** Multiplies the four 32-bit integer lanes of a_Lhs and a_Rhs, keeping the low 32 bits of each product.
SSE2 has no direct instruction for this: _mm_mul_epu32 only multiplies lanes 0 and 2 and yields two
64-bit products. The odd lanes are therefore shifted down into even positions and multiplied
separately, and the low halves of all four products are interleaved back into lane order. */
static inline __m128i SSE_MulLo32( const __m128i & a_Lhs, const __m128i & a_Rhs )
{
    // 64-bit products of lanes 0 and 2:
    const __m128i Even = _mm_mul_epu32( a_Lhs, a_Rhs );

    // 64-bit products of lanes 1 and 3 (byte-shift right by 4 moves them into lanes 0 and 2):
    const __m128i Odd = _mm_mul_epu32( _mm_srli_si128( a_Lhs, 4 ), _mm_srli_si128( a_Rhs, 4 ) );

    // Gather the low 32 bits of each product into lanes 0 and 1, then interleave: [e0, o0, e1, o1]
    return _mm_unpacklo_epi32(
        _mm_shuffle_epi32( Even, _MM_SHUFFLE( 0, 0, 2, 0 ) ),
        _mm_shuffle_epi32( Odd,  _MM_SHUFFLE( 0, 0, 2, 0 ) )
    );
}

/** Computes the integer-hash noise value for four integer coordinates at once.
For each 32-bit lane of a_X4 this evaluates, with wrapping 32-bit integer arithmetic:
    x = (a << 13) ^ a
    result = 1.0f - (float)((x * ((x * x) * 15731 + 789221) + 1376312589) & 0x7fffffff) / 1073741824.0f
The masked integer is in [0, 0x7fffffff], so each result lies in approximately [-1, 1].
@param a_X4  Four packed 32-bit integer inputs.
@return      Four packed floats, lane i holding the noise value for lane i of a_X4. */
__m128 SSE_IntNoise( const __m128i & a_X4 )
{
    // x = (a << 13) ^ a
    const __m128i X4 = _mm_xor_si128( _mm_slli_epi32( a_X4, 13 ), a_X4 );

    // (x * x) * 15731 + 789221
    const __m128i Inner = _mm_add_epi32(
        SSE_MulLo32( SSE_MulLo32( X4, X4 ), _mm_set1_epi32( 15731 ) ),
        _mm_set1_epi32( 789221 )
    );

    // (x * Inner + 1376312589) & 0x7fffffff; the mask clears the sign bit so the value is non-negative
    const __m128i Hash = _mm_and_si128(
        _mm_add_epi32( SSE_MulLo32( X4, Inner ), _mm_set1_epi32( 1376312589 ) ),
        _mm_set1_epi32( 0x7fffffff )
    );

    // 1.0f - (float)Hash / 2^30
    const __m128 result = _mm_sub_ps(
        _mm_set_ps1( 1.0f ),
        _mm_div_ps( _mm_cvtepi32_ps( Hash ), _mm_set_ps1( 1073741824.0f ) )
    );
    return result;
}