BEAST/BSE - Better Audio System and Sound Engine  0.8.2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
bseresamplerimpl.hh
Go to the documentation of this file.
00001  // Licensed GNU LGPL v2.1 or later: http://www.gnu.org/licenses/lgpl.html
00002 #ifndef __BSE_RESAMPLER_TCC__
00003 #define __BSE_RESAMPLER_TCC__
00004 
00005 #include <vector>
00006 #include <bse/bseresampler.hh>
00007 #include <sfi/sfi.hh>
00008 #include <math.h>
00009 #include <string.h>
00010 #include <stdlib.h>
00011 #include <stdio.h>
00012 #ifdef __SSE__
00013 #include <xmmintrin.h>
00014 #endif
00015 
00016 namespace Bse {
00017 namespace Resampler {
00018 using std::vector;
00019 using std::min;
00020 using std::max;
00021 using std::copy;
00022 
/**
 * Four single precision floats that can be accessed either element-wise
 * through @a f, or - if the compiler targets SSE - as one packed SSE
 * value through @a v.
 *
 * See also: http://ds9a.nl/gcc-simd/
 */
union F4Vector
{
  float  f[4];  /* element-wise view of the four floats */
#if defined (__SSE__)
  __m128 v;     /* the same four floats as one packed SSE vector */
#endif
};
00031 
00047 template<class Accumulator> static inline Accumulator
00048 fir_process_one_sample (const float *input,
00049                         const float *taps, /* [0..order-1] */
00050                         const guint  order)
00051 {
00052   Accumulator out = 0;
00053   for (guint i = 0; i < order; i++)
00054     out += input[i] * taps[i];
00055   return out;
00056 }
00057 
00068 static inline void
00069 fir_process_4samples_sse (const float *input,
00070                           const float *sse_taps,
00071                           const guint  order,
00072                           float       *out0,
00073                           float       *out1,
00074                           float       *out2,
00075                           float       *out3)
00076 {
00077 #ifdef __SSE__
00078   /* input and taps must be 16-byte aligned */
00079   const F4Vector *input_v = reinterpret_cast<const F4Vector *> (input);
00080   const F4Vector *sse_taps_v = reinterpret_cast<const F4Vector *> (sse_taps);
00081   F4Vector out0_v, out1_v, out2_v, out3_v;
00082 
00083   out0_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[0].v);
00084   out1_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[1].v);
00085   out2_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[2].v);
00086   out3_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[3].v);
00087 
00088   for (guint i = 1; i < (order + 6) / 4; i++)
00089     {
00090       out0_v.v = _mm_add_ps (out0_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 0].v));
00091       out1_v.v = _mm_add_ps (out1_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 1].v));
00092       out2_v.v = _mm_add_ps (out2_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 2].v));
00093       out3_v.v = _mm_add_ps (out3_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 3].v));
00094     }
00095 
00096   *out0 = out0_v.f[0] + out0_v.f[1] + out0_v.f[2] + out0_v.f[3];
00097   *out1 = out1_v.f[0] + out1_v.f[1] + out1_v.f[2] + out1_v.f[3];
00098   *out2 = out2_v.f[0] + out2_v.f[1] + out2_v.f[2] + out2_v.f[3];
00099   *out3 = out3_v.f[0] + out3_v.f[1] + out3_v.f[2] + out3_v.f[3];
00100 #else
00101   g_assert_not_reached();
00102 #endif
00103 }
00104 
00105 
/**
 * Rearrange FIR coefficients into the layout used by
 * fir_process_4samples_sse().
 *
 * The result consists of (order + 6) / 4 groups of 16 floats. Each group
 * holds four rows of four floats; row j contains the taps shifted by j
 * positions, i.e. tap i ends up at position (i + j) of row j, counted
 * across all groups. All positions that receive no tap are zero.
 *
 * @param taps  filter coefficients, [0..order-1]
 * @return      rearranged coefficients, (order + 6) / 4 * 16 floats
 */
static inline vector<float>
fir_compute_sse_taps (const vector<float>& taps)
{
  const int n_taps = taps.size();
  const int n_groups = (n_taps + 6) / 4;
  vector<float> result (n_groups * 16);

  for (int shift = 0; shift < 4; shift++)
    {
      for (int t = 0; t < n_taps; t++)
        {
          const int pos = t + shift;  /* position within the shifted row */
          const int group = pos / 4;
          const int lane = pos % 4;
          result[group * 16 + shift * 4 + lane] = taps[t];
        }
    }

  return result;
}
00139 
00149 static inline bool
00150 fir_test_filter_sse (bool        verbose,
00151                      const guint max_order = 64)
00152 {
00153   int errors = 0;
00154   if (verbose)
00155     printf ("testing SSE filter implementation:\n\n");
00156 
00157   for (guint order = 0; order < max_order; order++)
00158     {
00159       vector<float> taps (order);
00160       for (guint i = 0; i < order; i++)
00161         taps[i] = i + 1;
00162 
00163       AlignedArray<float,16> sse_taps (fir_compute_sse_taps (taps));
00164       if (verbose)
00165         {
00166           for (uint i = 0; i < sse_taps.size(); i++)
00167             {
00168               printf ("%3d", (int) (sse_taps[i] + 0.5));
00169               if (i % 4 == 3)
00170                 printf ("  |");
00171               if (i % 16 == 15)
00172                 printf ("   ||| upper bound = %d\n", (order + 6) / 4);
00173             }
00174           printf ("\n\n");
00175         }
00176 
00177       AlignedArray<float,16> random_mem (order + 4);
00178       for (guint i = 0; i < order + 4; i++)
00179         random_mem[i] = 1.0 - rand() / (0.5 * RAND_MAX);
00180 
00181       /* FIXME: the problem with this test is that we explicitely test SSE code
00182        * here, but the test case is not compiled with -msse within the BEAST tree
00183        */
00184       float out[4];
00185       fir_process_4samples_sse (&random_mem[0], &sse_taps[0], order,
00186                                 &out[0], &out[1], &out[2], &out[3]);
00187 
00188       double avg_diff = 0.0;
00189       for (int i = 0; i < 4; i++)
00190         {
00191           double diff = fir_process_one_sample<double> (&random_mem[i], &taps[0], order) - out[i];
00192           avg_diff += fabs (diff);
00193         }
00194       avg_diff /= (order + 1);
00195       bool is_error = (avg_diff > 0.00001);
00196       if (is_error || verbose)
00197         printf ("*** order = %d, avg_diff = %g\n", order, avg_diff);
00198       if (is_error)
00199         errors++;
00200     }
00201   if (errors)
00202     printf ("*** %d errors detected\n", errors);
00203   else
00204     printf ("filter implementation ok.\n");
00205 
00206   return (errors == 0);
00207 }
00208 
00216 template<guint ORDER, bool USE_SSE>
00217 class Upsampler2 : public Resampler2 {
00218   vector<float>          taps;
00219   AlignedArray<float,16> history;
00220   AlignedArray<float,16> sse_taps;
00221 protected:
00222   /* fast SSE optimized convolution */
00223   void
00224   process_4samples_aligned (const float *input /* aligned */,
00225                             float       *output)
00226   {
00227     const guint H = (ORDER / 2); /* half the filter length */
00228 
00229     output[1] = input[H];
00230     output[3] = input[H + 1];
00231     output[5] = input[H + 2];
00232     output[7] = input[H + 3];
00233 
00234     fir_process_4samples_sse (input, &sse_taps[0], ORDER, &output[0], &output[2], &output[4], &output[6]);
00235   }
00236   /* slow convolution */
00237   void
00238   process_sample_unaligned (const float *input,
00239                             float       *output)
00240   {
00241     const guint H = (ORDER / 2); /* half the filter length */
00242     output[0] = fir_process_one_sample<float> (&input[0], &taps[0], ORDER);
00243     output[1] = input[H];
00244   }
00245   void
00246   process_block_aligned (const float *input,
00247                          guint        n_input_samples,
00248                          float       *output)
00249   {
00250     uint i = 0;
00251     if (USE_SSE)
00252       {
00253         while (i + 3 < n_input_samples)
00254           {
00255             process_4samples_aligned (&input[i], &output[i*2]);
00256             i += 4;
00257           }
00258       }
00259     while (i < n_input_samples)
00260       {
00261         process_sample_unaligned (&input[i], &output[2*i]);
00262         i++;
00263       }
00264   }
00265   void
00266   process_block_unaligned (const float *input,
00267                            guint        n_input_samples,
00268                            float       *output)
00269   {
00270     uint i = 0;
00271     if (USE_SSE)
00272       {
00273         while ((reinterpret_cast<ptrdiff_t> (&input[i]) & 15) && i < n_input_samples)
00274           {
00275             process_sample_unaligned (&input[i], &output[2 * i]);
00276             i++;
00277           }
00278       }
00279     process_block_aligned (&input[i], n_input_samples - i, &output[2 * i]);
00280   }
00281 public:
00287   Upsampler2 (float *init_taps) :
00288     taps (init_taps, init_taps + ORDER),
00289     history (2 * ORDER),
00290     sse_taps (fir_compute_sse_taps (taps))
00291   {
00292     g_assert ((ORDER & 1) == 0);    /* even order filter */
00293   }
00298   void
00299   process_block (const float *input,
00300                  guint        n_input_samples,
00301                  float       *output)
00302   {
00303     const uint history_todo = min (n_input_samples, ORDER - 1);
00304 
00305     copy (input, input + history_todo, &history[ORDER - 1]);
00306     process_block_aligned (&history[0], history_todo, output);
00307     if (n_input_samples > history_todo)
00308       {
00309         process_block_unaligned (input, n_input_samples - history_todo, &output [2 * history_todo]);
00310 
00311         // build new history from new input
00312         copy (input + n_input_samples - history_todo, input + n_input_samples, &history[0]);
00313       }
00314     else
00315       {
00316         // build new history from end of old history
00317         // (very expensive if n_input_samples tends to be a lot smaller than ORDER often)
00318         g_memmove (&history[0], &history[n_input_samples], sizeof (history[0]) * (ORDER - 1));
00319       }
00320   }
00324   guint
00325   order() const
00326   {
00327     return ORDER;
00328   }
00329   double
00330   delay() const
00331   {
00332     return order() - 1;
00333   }
00334 };
00335 
00343 template<guint ORDER, bool USE_SSE>
00344 class Downsampler2 : public Resampler2 {
00345   vector<float>        taps;
00346   AlignedArray<float,16> history_even;
00347   AlignedArray<float,16> history_odd;
00348   AlignedArray<float,16> sse_taps;
00349   /* fast SSE optimized convolution */
00350   template<int ODD_STEPPING> void
00351   process_4samples_aligned (const float *input_even /* aligned */,
00352                             const float *input_odd,
00353                             float       *output)
00354   {
00355     const guint H = (ORDER / 2) - 1; /* half the filter length */
00356 
00357     fir_process_4samples_sse (input_even, &sse_taps[0], ORDER, &output[0], &output[1], &output[2], &output[3]);
00358 
00359     output[0] += 0.5 * input_odd[H * ODD_STEPPING];
00360     output[1] += 0.5 * input_odd[(H + 1) * ODD_STEPPING];
00361     output[2] += 0.5 * input_odd[(H + 2) * ODD_STEPPING];
00362     output[3] += 0.5 * input_odd[(H + 3) * ODD_STEPPING];
00363   }
00364   /* slow convolution */
00365   template<int ODD_STEPPING> float
00366   process_sample_unaligned (const float *input_even,
00367                             const float *input_odd)
00368   {
00369     const guint H = (ORDER / 2) - 1; /* half the filter length */
00370 
00371     return fir_process_one_sample<float> (&input_even[0], &taps[0], ORDER) + 0.5 * input_odd[H * ODD_STEPPING];
00372   }
00373   template<int ODD_STEPPING> void
00374   process_block_aligned (const float *input_even,
00375                          const float *input_odd,
00376                          float       *output,
00377                          guint        n_output_samples)
00378   {
00379     uint i = 0;
00380     if (USE_SSE)
00381       {
00382         while (i + 3 < n_output_samples)
00383           {
00384             process_4samples_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i]);
00385             i += 4;
00386           }
00387       }
00388     while (i < n_output_samples)
00389       {
00390         output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
00391         i++;
00392       }
00393   }
00394   template<int ODD_STEPPING> void
00395   process_block_unaligned (const float *input_even,
00396                            const float *input_odd,
00397                            float       *output,
00398                            guint        n_output_samples)
00399   {
00400     uint i = 0;
00401     if (USE_SSE)
00402       {
00403         while ((reinterpret_cast<ptrdiff_t> (&input_even[i]) & 15) && i < n_output_samples)
00404           {
00405             output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
00406             i++;
00407           }
00408       }
00409     process_block_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i], n_output_samples);
00410   }
00411   void
00412   deinterleave2 (const float *data,
00413                  guint        n_data_values,
00414                  float       *output)
00415   {
00416     for (uint i = 0; i < n_data_values; i += 2)
00417       output[i / 2] = data[i];
00418   }
00419 public:
00425   Downsampler2 (float *init_taps) :
00426     taps (init_taps, init_taps + ORDER),
00427     history_even (2 * ORDER),
00428     history_odd (2 * ORDER),
00429     sse_taps (fir_compute_sse_taps (taps))
00430   {
00431     g_assert ((ORDER & 1) == 0);    /* even order filter */
00432   }
00437   void
00438   process_block (const float *input,
00439                  guint        n_input_samples,
00440                  float       *output)
00441   {
00442     g_assert ((n_input_samples & 1) == 0);
00443 
00444     const uint BLOCKSIZE = 1024;
00445 
00446     F4Vector  block[BLOCKSIZE / 4]; /* using F4Vector ensures 16-byte alignment */
00447     float    *input_even = &block[0].f[0];
00448 
00449     while (n_input_samples)
00450       {
00451         uint n_input_todo = min (n_input_samples, BLOCKSIZE * 2);
00452 
00453         /* since the halfband filter contains zeros every other sample
00454          * and since we're using SSE instructions, which expect the
00455          * data to be consecutively represented in memory, we prepare
00456          * a block of samples containing only even-indexed samples
00457          *
00458          * we keep the deinterleaved data on the stack (instead of per-class
00459          * allocated memory), to ensure that even running a lot of these
00460          * downsampler streams will not result in cache trashing
00461          *
00462          * FIXME: this implementation is suboptimal for non-SSE, because it
00463          * performs an extra deinterleaving step in any case, but deinterleaving
00464          * is only required for SSE instructions
00465          */
00466         deinterleave2 (input, n_input_todo, input_even);
00467 
00468         const float       *input_odd = input + 1; /* we process this one with a stepping of 2 */
00469 
00470         const uint n_output_todo = n_input_todo / 2;
00471         const uint history_todo = min (n_output_todo, ORDER - 1);
00472 
00473         copy (input_even, input_even + history_todo, &history_even[ORDER - 1]);
00474         deinterleave2 (input_odd, history_todo * 2, &history_odd[ORDER - 1]);
00475 
00476         process_block_aligned <1> (&history_even[0], &history_odd[0], output, history_todo);
00477         if (n_output_todo > history_todo)
00478           {
00479             process_block_unaligned<2> (input_even, input_odd, &output[history_todo], n_output_todo - history_todo);
00480 
00481             // build new history from new input (here: history_todo == ORDER - 1)
00482             copy (input_even + n_output_todo - history_todo, input_even + n_output_todo, &history_even[0]);
00483             deinterleave2 (input_odd + n_input_todo - history_todo * 2, history_todo * 2, &history_odd[0]); /* FIXME: can be optimized */
00484           }
00485         else
00486           {
00487             // build new history from end of old history
00488             // (very expensive if n_output_todo tends to be a lot smaller than ORDER often)
00489             g_memmove (&history_even[0], &history_even[n_output_todo], sizeof (history_even[0]) * (ORDER - 1));
00490             g_memmove (&history_odd[0], &history_odd[n_output_todo], sizeof (history_odd[0]) * (ORDER - 1));
00491           }
00492 
00493         n_input_samples -= n_input_todo;
00494         input += n_input_todo;
00495         output += n_output_todo;
00496       }
00497   }
00501   guint
00502   order() const
00503   {
00504     return ORDER;
00505   }
00506   double
00507   delay() const
00508   {
00509     return order() / 2 - 0.5;
00510   }
00511 };
00512 
00513 template<bool USE_SSE> Resampler2*
00514 Resampler2::create_impl (BseResampler2Mode      mode,
00515                          BseResampler2Precision precision)
00516 {
00517   if (mode == BSE_RESAMPLER2_MODE_UPSAMPLE)
00518     {
00519       switch (precision)
00520         {
00521         case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Upsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 2.0);
00522         case BSE_RESAMPLER2_PREC_48DB:   return create_impl_with_coeffs <Upsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 2.0);
00523         case BSE_RESAMPLER2_PREC_72DB:   return create_impl_with_coeffs <Upsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 2.0);
00524         case BSE_RESAMPLER2_PREC_96DB:   return create_impl_with_coeffs <Upsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 2.0);
00525         case BSE_RESAMPLER2_PREC_120DB:  return create_impl_with_coeffs <Upsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 2.0);
00526         case BSE_RESAMPLER2_PREC_144DB:  return create_impl_with_coeffs <Upsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 2.0);
00527         }
00528     }
00529   else if (mode == BSE_RESAMPLER2_MODE_DOWNSAMPLE)
00530     {
00531       switch (precision)
00532         {
00533         case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Downsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 1.0);
00534         case BSE_RESAMPLER2_PREC_48DB:   return create_impl_with_coeffs <Downsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 1.0);
00535         case BSE_RESAMPLER2_PREC_72DB:   return create_impl_with_coeffs <Downsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 1.0);
00536         case BSE_RESAMPLER2_PREC_96DB:   return create_impl_with_coeffs <Downsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 1.0);
00537         case BSE_RESAMPLER2_PREC_120DB:  return create_impl_with_coeffs <Downsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 1.0);
00538         case BSE_RESAMPLER2_PREC_144DB:  return create_impl_with_coeffs <Downsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 1.0);
00539         }
00540     }
00541   return 0;
00542 }
00543 
00544 } // Resampler
00545 } // Bse
00546 
00547 #endif /* __BSE_RESAMPLER_TCC__ */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines