BEAST/BSE - Better Audio System and Sound Engine
0.8.2
|
00001 // Licensed GNU LGPL v2.1 or later: http://www.gnu.org/licenses/lgpl.html 00002 #ifndef __BSE_RESAMPLER_TCC__ 00003 #define __BSE_RESAMPLER_TCC__ 00004 00005 #include <vector> 00006 #include <bse/bseresampler.hh> 00007 #include <sfi/sfi.hh> 00008 #include <math.h> 00009 #include <string.h> 00010 #include <stdlib.h> 00011 #include <stdio.h> 00012 #ifdef __SSE__ 00013 #include <xmmintrin.h> 00014 #endif 00015 00016 namespace Bse { 00017 namespace Resampler { 00018 using std::vector; 00019 using std::min; 00020 using std::max; 00021 using std::copy; 00022 00023 /* see: http://ds9a.nl/gcc-simd/ */ 00024 union F4Vector 00025 { 00026 float f[4]; 00027 #ifdef __SSE__ 00028 __m128 v; // vector of four single floats 00029 #endif 00030 }; 00031 00047 template<class Accumulator> static inline Accumulator 00048 fir_process_one_sample (const float *input, 00049 const float *taps, /* [0..order-1] */ 00050 const guint order) 00051 { 00052 Accumulator out = 0; 00053 for (guint i = 0; i < order; i++) 00054 out += input[i] * taps[i]; 00055 return out; 00056 } 00057 00068 static inline void 00069 fir_process_4samples_sse (const float *input, 00070 const float *sse_taps, 00071 const guint order, 00072 float *out0, 00073 float *out1, 00074 float *out2, 00075 float *out3) 00076 { 00077 #ifdef __SSE__ 00078 /* input and taps must be 16-byte aligned */ 00079 const F4Vector *input_v = reinterpret_cast<const F4Vector *> (input); 00080 const F4Vector *sse_taps_v = reinterpret_cast<const F4Vector *> (sse_taps); 00081 F4Vector out0_v, out1_v, out2_v, out3_v; 00082 00083 out0_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[0].v); 00084 out1_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[1].v); 00085 out2_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[2].v); 00086 out3_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[3].v); 00087 00088 for (guint i = 1; i < (order + 6) / 4; i++) 00089 { 00090 out0_v.v = _mm_add_ps (out0_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 0].v)); 00091 out1_v.v = _mm_add_ps (out1_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 1].v)); 00092 out2_v.v = _mm_add_ps (out2_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 2].v)); 00093 out3_v.v = _mm_add_ps (out3_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 3].v)); 00094 } 00095 00096 *out0 = out0_v.f[0] + out0_v.f[1] + out0_v.f[2] + out0_v.f[3]; 00097 *out1 = out1_v.f[0] + out1_v.f[1] + out1_v.f[2] + out1_v.f[3]; 00098 *out2 = out2_v.f[0] + out2_v.f[1] + out2_v.f[2] + out2_v.f[3]; 00099 *out3 = out3_v.f[0] + out3_v.f[1] + out3_v.f[2] + out3_v.f[3]; 00100 #else 00101 g_assert_not_reached(); 00102 #endif 00103 } 00104 00105 00124 static inline vector<float> 00125 fir_compute_sse_taps (const vector<float>& taps) 00126 { 00127 const int order = taps.size(); 00128 vector<float> sse_taps ((order + 6) / 4 * 16); 00129 00130 for (int j = 0; j < 4; j++) 00131 for (int i = 0; i < order; i++) 00132 { 00133 int k = i + j; 00134 sse_taps[(k / 4) * 16 + (k % 4) + j * 4] = taps[i]; 00135 } 00136 00137 return sse_taps; 00138 } 00139 00149 static inline bool 00150 fir_test_filter_sse (bool verbose, 00151 const guint max_order = 64) 00152 { 00153 int errors = 0; 00154 if (verbose) 00155 printf ("testing SSE filter implementation:\n\n"); 00156 00157 for (guint order = 0; order < max_order; order++) 00158 { 00159 vector<float> taps (order); 00160 for (guint i = 0; i < order; i++) 00161 taps[i] = i + 1; 00162 00163 AlignedArray<float,16> sse_taps (fir_compute_sse_taps (taps)); 00164 if (verbose) 00165 { 00166 for (uint i = 0; i < sse_taps.size(); i++) 00167 { 00168 printf ("%3d", (int) (sse_taps[i] + 0.5)); 00169 if (i % 4 == 3) 00170 printf (" |"); 00171 if (i % 16 == 15) 00172 printf (" ||| upper bound = %d\n", (order + 6) / 4); 00173 } 00174 printf ("\n\n"); 00175 } 00176 00177 AlignedArray<float,16> random_mem (order + 4); 00178 for (guint i = 0; i < order + 4; i++) 00179 random_mem[i] = 1.0 - rand() / (0.5 * RAND_MAX); 00180 00181 /* FIXME: the problem with this test is that we explicitely test SSE code 00182 * here, but the test case is not compiled with -msse within the BEAST tree 00183 */ 00184 float out[4]; 00185 fir_process_4samples_sse (&random_mem[0], &sse_taps[0], order, 00186 &out[0], &out[1], &out[2], &out[3]); 00187 00188 double avg_diff = 0.0; 00189 for (int i = 0; i < 4; i++) 00190 { 00191 double diff = fir_process_one_sample<double> (&random_mem[i], &taps[0], order) - out[i]; 00192 avg_diff += fabs (diff); 00193 } 00194 avg_diff /= (order + 1); 00195 bool is_error = (avg_diff > 0.00001); 00196 if (is_error || verbose) 00197 printf ("*** order = %d, avg_diff = %g\n", order, avg_diff); 00198 if (is_error) 00199 errors++; 00200 } 00201 if (errors) 00202 printf ("*** %d errors detected\n", errors); 00203 else 00204 printf ("filter implementation ok.\n"); 00205 00206 return (errors == 0); 00207 } 00208 00216 template<guint ORDER, bool USE_SSE> 00217 class Upsampler2 : public Resampler2 { 00218 vector<float> taps; 00219 AlignedArray<float,16> history; 00220 AlignedArray<float,16> sse_taps; 00221 protected: 00222 /* fast SSE optimized convolution */ 00223 void 00224 process_4samples_aligned (const float *input /* aligned */, 00225 float *output) 00226 { 00227 const guint H = (ORDER / 2); /* half the filter length */ 00228 00229 output[1] = input[H]; 00230 output[3] = input[H + 1]; 00231 output[5] = input[H + 2]; 00232 output[7] = input[H + 3]; 00233 00234 fir_process_4samples_sse (input, &sse_taps[0], ORDER, &output[0], &output[2], &output[4], &output[6]); 00235 } 00236 /* slow convolution */ 00237 void 00238 process_sample_unaligned (const float *input, 00239 float *output) 00240 { 00241 const guint H = (ORDER / 2); /* half the filter length */ 00242 output[0] = fir_process_one_sample<float> (&input[0], &taps[0], ORDER); 00243 output[1] = input[H]; 00244 } 00245 void 00246 process_block_aligned (const float *input, 00247 guint n_input_samples, 00248 float *output) 00249 { 00250 uint i = 0; 00251 if (USE_SSE) 00252 { 00253 while (i + 3 < n_input_samples) 00254 { 00255 process_4samples_aligned (&input[i], &output[i*2]); 00256 i += 4; 00257 } 00258 } 00259 while (i < n_input_samples) 00260 { 00261 process_sample_unaligned (&input[i], &output[2*i]); 00262 i++; 00263 } 00264 } 00265 void 00266 process_block_unaligned (const float *input, 00267 guint n_input_samples, 00268 float *output) 00269 { 00270 uint i = 0; 00271 if (USE_SSE) 00272 { 00273 while ((reinterpret_cast<ptrdiff_t> (&input[i]) & 15) && i < n_input_samples) 00274 { 00275 process_sample_unaligned (&input[i], &output[2 * i]); 00276 i++; 00277 } 00278 } 00279 process_block_aligned (&input[i], n_input_samples - i, &output[2 * i]); 00280 } 00281 public: 00287 Upsampler2 (float *init_taps) : 00288 taps (init_taps, init_taps + ORDER), 00289 history (2 * ORDER), 00290 sse_taps (fir_compute_sse_taps (taps)) 00291 { 00292 g_assert ((ORDER & 1) == 0); /* even order filter */ 00293 } 00298 void 00299 process_block (const float *input, 00300 guint n_input_samples, 00301 float *output) 00302 { 00303 const uint history_todo = min (n_input_samples, ORDER - 1); 00304 00305 copy (input, input + history_todo, &history[ORDER - 1]); 00306 process_block_aligned (&history[0], history_todo, output); 00307 if (n_input_samples > history_todo) 00308 { 00309 process_block_unaligned (input, n_input_samples - history_todo, &output [2 * history_todo]); 00310 00311 // build new history from new input 00312 copy (input + n_input_samples - history_todo, input + n_input_samples, &history[0]); 00313 } 00314 else 00315 { 00316 // build new history from end of old history 00317 // (very expensive if n_input_samples tends to be a lot smaller than ORDER often) 00318 g_memmove (&history[0], &history[n_input_samples], sizeof (history[0]) * (ORDER - 1)); 00319 } 00320 } 00324 guint 00325 order() const 00326 { 00327 return ORDER; 00328 } 00329 double 00330 delay() const 00331 { 00332 return order() - 1; 00333 } 00334 }; 00335 00343 template<guint ORDER, bool USE_SSE> 00344 class Downsampler2 : public Resampler2 { 00345 vector<float> taps; 00346 AlignedArray<float,16> history_even; 00347 AlignedArray<float,16> history_odd; 00348 AlignedArray<float,16> sse_taps; 00349 /* fast SSE optimized convolution */ 00350 template<int ODD_STEPPING> void 00351 process_4samples_aligned (const float *input_even /* aligned */, 00352 const float *input_odd, 00353 float *output) 00354 { 00355 const guint H = (ORDER / 2) - 1; /* half the filter length */ 00356 00357 fir_process_4samples_sse (input_even, &sse_taps[0], ORDER, &output[0], &output[1], &output[2], &output[3]); 00358 00359 output[0] += 0.5 * input_odd[H * ODD_STEPPING]; 00360 output[1] += 0.5 * input_odd[(H + 1) * ODD_STEPPING]; 00361 output[2] += 0.5 * input_odd[(H + 2) * ODD_STEPPING]; 00362 output[3] += 0.5 * input_odd[(H + 3) * ODD_STEPPING]; 00363 } 00364 /* slow convolution */ 00365 template<int ODD_STEPPING> float 00366 process_sample_unaligned (const float *input_even, 00367 const float *input_odd) 00368 { 00369 const guint H = (ORDER / 2) - 1; /* half the filter length */ 00370 00371 return fir_process_one_sample<float> (&input_even[0], &taps[0], ORDER) + 0.5 * input_odd[H * ODD_STEPPING]; 00372 } 00373 template<int ODD_STEPPING> void 00374 process_block_aligned (const float *input_even, 00375 const float *input_odd, 00376 float *output, 00377 guint n_output_samples) 00378 { 00379 uint i = 0; 00380 if (USE_SSE) 00381 { 00382 while (i + 3 < n_output_samples) 00383 { 00384 process_4samples_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i]); 00385 i += 4; 00386 } 00387 } 00388 while (i < n_output_samples) 00389 { 00390 output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]); 00391 i++; 00392 } 00393 } 00394 template<int ODD_STEPPING> void 00395 process_block_unaligned (const float *input_even, 00396 const float *input_odd, 00397 float *output, 00398 guint n_output_samples) 00399 { 00400 uint i = 0; 00401 if (USE_SSE) 00402 { 00403 while ((reinterpret_cast<ptrdiff_t> (&input_even[i]) & 15) && i < n_output_samples) 00404 { 00405 output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]); 00406 i++; 00407 } 00408 } 00409 process_block_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i], n_output_samples); 00410 } 00411 void 00412 deinterleave2 (const float *data, 00413 guint n_data_values, 00414 float *output) 00415 { 00416 for (uint i = 0; i < n_data_values; i += 2) 00417 output[i / 2] = data[i]; 00418 } 00419 public: 00425 Downsampler2 (float *init_taps) : 00426 taps (init_taps, init_taps + ORDER), 00427 history_even (2 * ORDER), 00428 history_odd (2 * ORDER), 00429 sse_taps (fir_compute_sse_taps (taps)) 00430 { 00431 g_assert ((ORDER & 1) == 0); /* even order filter */ 00432 } 00437 void 00438 process_block (const float *input, 00439 guint n_input_samples, 00440 float *output) 00441 { 00442 g_assert ((n_input_samples & 1) == 0); 00443 00444 const uint BLOCKSIZE = 1024; 00445 00446 F4Vector block[BLOCKSIZE / 4]; /* using F4Vector ensures 16-byte alignment */ 00447 float *input_even = &block[0].f[0]; 00448 00449 while (n_input_samples) 00450 { 00451 uint n_input_todo = min (n_input_samples, BLOCKSIZE * 2); 00452 00453 /* since the halfband filter contains zeros every other sample 00454 * and since we're using SSE instructions, which expect the 00455 * data to be consecutively represented in memory, we prepare 00456 * a block of samples containing only even-indexed samples 00457 * 00458 * we keep the deinterleaved data on the stack (instead of per-class 00459 * allocated memory), to ensure that even running a lot of these 00460 * downsampler streams will not result in cache trashing 00461 * 00462 * FIXME: this implementation is suboptimal for non-SSE, because it 00463 * performs an extra deinterleaving step in any case, but deinterleaving 00464 * is only required for SSE instructions 00465 */ 00466 deinterleave2 (input, n_input_todo, input_even); 00467 00468 const float *input_odd = input + 1; /* we process this one with a stepping of 2 */ 00469 00470 const uint n_output_todo = n_input_todo / 2; 00471 const uint history_todo = min (n_output_todo, ORDER - 1); 00472 00473 copy (input_even, input_even + history_todo, &history_even[ORDER - 1]); 00474 deinterleave2 (input_odd, history_todo * 2, &history_odd[ORDER - 1]); 00475 00476 process_block_aligned <1> (&history_even[0], &history_odd[0], output, history_todo); 00477 if (n_output_todo > history_todo) 00478 { 00479 process_block_unaligned<2> (input_even, input_odd, &output[history_todo], n_output_todo - history_todo); 00480 00481 // build new history from new input (here: history_todo == ORDER - 1) 00482 copy (input_even + n_output_todo - history_todo, input_even + n_output_todo, &history_even[0]); 00483 deinterleave2 (input_odd + n_input_todo - history_todo * 2, history_todo * 2, &history_odd[0]); /* FIXME: can be optimized */ 00484 } 00485 else 00486 { 00487 // build new history from end of old history 00488 // (very expensive if n_output_todo tends to be a lot smaller than ORDER often) 00489 g_memmove (&history_even[0], &history_even[n_output_todo], sizeof (history_even[0]) * (ORDER - 1)); 00490 g_memmove (&history_odd[0], &history_odd[n_output_todo], sizeof (history_odd[0]) * (ORDER - 1)); 00491 } 00492 00493 n_input_samples -= n_input_todo; 00494 input += n_input_todo; 00495 output += n_output_todo; 00496 } 00497 } 00501 guint 00502 order() const 00503 { 00504 return ORDER; 00505 } 00506 double 00507 delay() const 00508 { 00509 return order() / 2 - 0.5; 00510 } 00511 }; 00512 00513 template<bool USE_SSE> Resampler2* 00514 Resampler2::create_impl (BseResampler2Mode mode, 00515 BseResampler2Precision precision) 00516 { 00517 if (mode == BSE_RESAMPLER2_MODE_UPSAMPLE) 00518 { 00519 switch (precision) 00520 { 00521 case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Upsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 2.0); 00522 case BSE_RESAMPLER2_PREC_48DB: return create_impl_with_coeffs <Upsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 2.0); 00523 case BSE_RESAMPLER2_PREC_72DB: return create_impl_with_coeffs <Upsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 2.0); 00524 case BSE_RESAMPLER2_PREC_96DB: return create_impl_with_coeffs <Upsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 2.0); 00525 case BSE_RESAMPLER2_PREC_120DB: return create_impl_with_coeffs <Upsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 2.0); 00526 case BSE_RESAMPLER2_PREC_144DB: return create_impl_with_coeffs <Upsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 2.0); 00527 } 00528 } 00529 else if (mode == BSE_RESAMPLER2_MODE_DOWNSAMPLE) 00530 { 00531 switch (precision) 00532 { 00533 case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Downsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 1.0); 00534 case BSE_RESAMPLER2_PREC_48DB: return create_impl_with_coeffs <Downsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 1.0); 00535 case BSE_RESAMPLER2_PREC_72DB: return create_impl_with_coeffs <Downsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 1.0); 00536 case BSE_RESAMPLER2_PREC_96DB: return create_impl_with_coeffs <Downsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 1.0); 00537 case BSE_RESAMPLER2_PREC_120DB: return create_impl_with_coeffs <Downsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 1.0); 00538 case BSE_RESAMPLER2_PREC_144DB: return create_impl_with_coeffs <Downsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 1.0); 00539 } 00540 } 00541 return 0; 00542 } 00543 00544 } // Resampler 00545 } // Bse 00546 00547 #endif /* __BSE_RESAMPLER_TCC__ */