New subject: SSE optimized blur (was: Multithreaded blur using OpenMP)

18 Nov 2008

Using GCC's support for OpenMP (now also in MinGW!), I had a stab at 
making the outer loop of blurring parallel (the loops are typical DSP 
loops and shouldn't even throw an exception, so it should be perfectly 
safe). On my dual-core system, blurring (a large image) is approximately 
twice as fast, so it seems to scale really well.
As I don't have access to a Linux system at the moment, I haven't 
modified the Makefiles to add support, but it shouldn't be too difficult 
(see the changes to build.xml). Also, I've made it write a log to be 
able to analyze the performance, but it uses Win32-specific functions 
for timing. So, if anyone would like to have a go at making it work on 
Linux and/or testing it, that would be great :)
The number of threads defaults to whatever omp_get_num_procs() returns, 
but on my system this doesn't work... So to use it, set the number of 
threads in the preferences, using:
     <group id="threading"
        numthreads="1" />
in the "options" group (so it's /options/threading/numthreads).
Obviously, this is not the most efficient way to do this, as new threads 
are created for each blur. But so far the overhead doesn't appear to be 
too bad compared with the actual blurring operation. And using OpenMP, 
the impact on the code is minimal: I only had to put two #pragmas 
before the blur loops, allocate the temporary data it needs for all 
threads, and add a preference for the number of threads. So if this 
works out, it might provide a relatively easy way to start making use of 
all these dual- and quad-core machines people have lying around doing 
almost nothing these days :)
Patch attached; if wanted, I can also provide a binary for Windows. This 
version writes a log file (blurlog.txt) that lists how long each blur takes.
Note that Inkscape also needs mingwm10.dll to run with this patch. As it 
isn't included with devlibs at the moment, you'll have to copy it to 
Inkscape's directory manually (it's in the MinGW bin directory).
Index: build.xml
===================================================================

--- build.xml	(revision 20219)
+++ build.xml	(working copy)
@@ -375,6 +375,7 @@
             -Wall -Wformat -Werror=format-security -W -Wpointer-arith -Wcast-align -Wsign-compare -Woverloaded-virtual -Wswitch
             -O2
             -mms-bitfields
+            -fopenmp
         </flags>
         <defines>
             -DVERSION="${version}"
@@ -476,6 +477,7 @@
               objcopycommand="${archutil}objcopy">
        <flags>
            -mwindows
+           -mthreads
        </flags>
        <fileset dir="${build}">
            <include name="inkres.o"/>
@@ -501,6 +503,7 @@
            -lpng -ljpeg.dll -ltiff.dll -lpopt ${devlibs}/lib/zdll.lib
            -lgc
            -lws2_32 -lintl -lgdi32 -lcomdlg32 -lm
+           -lgomp -lpthreadGC2
        </libs>
     </link>
   </target>
@@ -530,6 +533,7 @@
               objcopycommand="${archutil}objcopy">
        <flags>
            -mwindows
+           -mthreads
        </flags>
        <fileset dir="${build}">
            <include name="inkviewres.o"/>
@@ -553,6 +557,7 @@
            -lpng -ljpeg.dll -ltiff.dll -lpopt ${devlibs}/lib/zdll.lib
            -lgc
            -lws2_32 -lintl -lgdi32 -lcomdlg32 -lm
+           -lgomp -lpthreadGC2
        </libs>
     </link>
   </target>
@@ -572,6 +577,7 @@
               stripcommand="${archutil}strip"
               objcopycommand="${archutil}objcopy">
        <flags>
+           -mthreads
        </flags>
        <fileset dir="${build}">
            <include name="obj/test-main.o"/>
@@ -602,6 +608,7 @@
            -lpng -ljpeg.dll -ltiff.dll -lpopt ${devlibs}/lib/zdll.lib
            -lgc
            -lws2_32 -lintl -lgdi32 -lcomdlg32 -lm
+           -lgomp -lpthreadGC2
        </libs>
     </link>
   </target>
@@ -667,6 +674,7 @@
     <copy todir="${dist}" file="${devlibs}/bin/libpopt-0.dll"/>
     <copy todir="${dist}" file="${devlibs}/bin/liblcms-1.dll"/>
     <copy todir="${dist}" file="${devlibs}/bin/intl.dll"/>
+    <copy todir="${dist}" file="${devlibs}/bin/pthreadGC2.dll"/>
     <copy file="${devlibs}/bin/intl.dll" tofile="${dist}/libintl-2.dll"/>
<!-- MSGFMT files -->
Index: src/display/nr-filter-gaussian.cpp
===================================================================
--- src/display/nr-filter-gaussian.cpp	(revision 20219)
+++ src/display/nr-filter-gaussian.cpp	(working copy)
@@ -16,9 +16,11 @@
 #include <algorithm>
 #include <cmath>
 #include <complex>
+#include <cstdlib>
 #include <glib.h>
-#include <cstdlib>
 #include <limits>
+#include <omp.h>
+#include <windows.h> // For performance logging
#include "2geom/isnan.h"
@@ -268,9 +270,11 @@
 filter2D_IIR(PT *const dest, int const dstr1, int const dstr2,
              PT const *const src, int const sstr1, int const sstr2,
              int const n1, int const n2, IIRValue const b[N+1], double const M[N*N],
-             IIRValue *const tmpdata)
+             IIRValue *const tmpdata[], int const num_threads)
 {
+#pragma omp parallel for num_threads(num_threads)
     for ( int c2 = 0 ; c2 < n2 ; c2++ ) {
+        unsigned int tid = omp_get_thread_num();
         // corresponding line in the source and output buffer
         PT const * srcimg = src  + c2*sstr2;
         PT       * dstimg = dest + c2*dstr2 + n1*dstr1;
@@ -288,7 +292,7 @@
             for(unsigned int i=1; i<N+1; i++) {
                 for(unsigned int c=0; c<PC; c++) u[0][c] += u[i][c]*b[i];
             }
-            copy_n(u[0], PC, tmpdata+c1*PC);
+            copy_n(u[0], PC, tmpdata[tid]+c1*PC);
         }
         // Backward pass
         IIRValue v[N+1][PC];
@@ -303,7 +307,7 @@
         int c1=n1-1;
         while(c1-->0) {
             for(unsigned int i=N; i>0; i--) copy_n(v[i-1], PC, v[i]);
-            copy_n(tmpdata+c1*PC, PC, v[0]);
+            copy_n(tmpdata[tid]+c1*PC, PC, v[0]);
             for(unsigned int c=0; c<PC; c++) v[0][c] *= b[0];
             for(unsigned int i=1; i<N+1; i++) {
                 for(unsigned int c=0; c<PC; c++) v[0][c] += v[i][c]*b[i];
@@ -326,11 +330,12 @@
 static void
 filter2D_FIR(PT *const dst, int const dstr1, int const dstr2,
              PT const *const src, int const sstr1, int const sstr2,
-             int const n1, int const n2, FIRValue const *const kernel, int const scr_len)
+             int const n1, int const n2, FIRValue const *const kernel, int const scr_len, int const num_threads)
 {
     // Past pixels seen (to enable in-place operation)
     PT history[scr_len+1][PC];
+#pragma omp parallel for num_threads(num_threads) private(history)
     for ( int c2 = 0 ; c2 < n2 ; c2++ ) {
// corresponding line in the source buffer
@@ -539,13 +544,14 @@
     }
// Some common constants
+    Inkscape::Preferences *prefs = Inkscape::Preferences::get();
     int const width_org = in->area.x1-in->area.x0, height_org = in->area.y1-in->area.y0;
     double const deviation_x_org = _deviation_x * NR::expansionX(trans);
     double const deviation_y_org = _deviation_y * NR::expansionY(trans);
     int const PC = NR_PIXBLOCK_BPP(in);
+    int const NTHREADS = std::max(1,std::min(8,prefs->getInt("/options/threading/numthreads",omp_get_num_procs())));
// Subsampling constants
-    Inkscape::Preferences *prefs = Inkscape::Preferences::get();
     int const quality = prefs->getInt("/options/blurquality/value");
     int const x_step_l2 = _effect_subsample_step_log2(deviation_x_org, quality);
     int const y_step_l2 = _effect_subsample_step_log2(deviation_y_org, quality);
@@ -567,6 +573,11 @@
     bool const use_IIR_x = deviation_x > 3;
     bool const use_IIR_y = deviation_y > 3;
+    // Temporary performance logging
+    LARGE_INTEGER startTime, endTime, timeFrequency;
+    QueryPerformanceFrequency(&timeFrequency);
+    QueryPerformanceCounter(&startTime);
+
     // new buffer for the subsampled output
     NRPixBlock *out = new NRPixBlock;
     nr_pixblock_setup_fast(out, in->mode, in->area.x0/x_step,       in->area.y0/y_step,
@@ -577,13 +588,19 @@
     }
     // Temporary storage for IIR filter
     // NOTE: This can be eliminated, but it reduces the precision a bit
-    IIRValue * tmpdata = 0;
+    IIRValue * tmpdata[NTHREADS];
+    std::fill_n(tmpdata, NTHREADS, (IIRValue*)0);
     if ( use_IIR_x || use_IIR_y ) {
-        tmpdata = new IIRValue[std::max(width,height)*PC];
-        if (tmpdata == NULL) {
-            nr_pixblock_release(out);
-            delete out;
-            return 0;
+        for(int i=0; i<NTHREADS; i++) {
+            tmpdata[i] = new IIRValue[std::max(width,height)*PC];
+            if (tmpdata[i] == NULL) {
+                nr_pixblock_release(out);
+                while(i-->0) {
+                    delete[] tmpdata[i];
+                }
+                delete out;
+                return 0;
+            }
         }
     }
     NRPixBlock *ssin = in;
@@ -629,16 +646,16 @@
         // Filter (x)
         switch(in->mode) {
         case NR_PIXBLOCK_MODE_A8:        ///< Grayscale
-            filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, b, M, tmpdata);
+            filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, b, M, tmpdata, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8:    ///< 8 bit RGB
-            filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, b, M, tmpdata);
+            filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, b, M, tmpdata, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA
-            filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata);
+            filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8P:  ///< Premultiplied 8 bit RGBA
-            filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata);
+            filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata, NTHREADS);
             break;
         default:
             assert(false);
@@ -651,16 +668,16 @@
         // Filter (x)
         switch(in->mode) {
         case NR_PIXBLOCK_MODE_A8:        ///< Grayscale
-            filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, kernel, scr_len_x);
+            filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, kernel, scr_len_x, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8:    ///< 8 bit RGB
-            filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, kernel, scr_len_x);
+            filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, kernel, scr_len_x, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA
-            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x);
+            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8P:  ///< Premultiplied 8 bit RGBA
-            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x);
+            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x, NTHREADS);
             break;
         default:
             assert(false);
@@ -688,16 +705,16 @@
         // Filter (y)
         switch(in->mode) {
         case NR_PIXBLOCK_MODE_A8:        ///< Grayscale
-            filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, b, M, tmpdata);
+            filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, b, M, tmpdata, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8:    ///< 8 bit RGB
-            filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, b, M, tmpdata);
+            filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, b, M, tmpdata, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA
-            filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata);
+            filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8P:  ///< Premultiplied 8 bit RGBA
-            filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata);
+            filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata, NTHREADS);
             break;
         default:
             assert(false);
@@ -710,24 +727,32 @@
         // Filter (y)
         switch(in->mode) {
         case NR_PIXBLOCK_MODE_A8:        ///< Grayscale
-            filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, kernel, scr_len_y);
+            filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, kernel, scr_len_y, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8:    ///< 8 bit RGB
-            filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, kernel, scr_len_y);
+            filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, kernel, scr_len_y, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA
-            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y);
+            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y, NTHREADS);
             break;
         case NR_PIXBLOCK_MODE_R8G8B8A8P:  ///< Premultiplied 8 bit RGBA
-            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y);
+            filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y, NTHREADS);
             break;
         default:
             assert(false);
         };
     }
-    delete[] tmpdata; // deleting a nullptr has no effect, so this is save
+    for(int i=0; i<NTHREADS; i++) {
+        delete[] tmpdata[i]; // deleting a nullptr has no effect, so this is safe
+    }
+    // Temporary performance logging
+    QueryPerformanceCounter(&endTime);
+    FILE* logfile = fopen("blurlog.txt", "at");
+    fprintf(logfile, "image size: %dx%d, threads: %d, time: %.3gs\n", width, height, NTHREADS, static_cast<double>(endTime.QuadPart-startTime.QuadPart)/timeFrequency.QuadPart);
+    fclose(logfile);
+
     if ( !resampling ) {
         // No upsampling needed
         out->empty = FALSE;

    

Multithreaded blur using OpenMP

Jasper van de Gronde

MenTaLguY

Jasper van de Gronde

Felipe Sanches

john cliff

Jasper van de Gronde

Jasper van de Gronde

Joel Holdsworth

Jasper van de Gronde

Ted Gould

bulia byak

Bob Jamison

Ted Gould

Jon A. Cruz

bulia byak

Ted Gould

Ted Gould

Jon A. Cruz

Ted Gould

Jon A. Cruz

Ted Gould

john cliff

Ted Gould

Bob Jamison

Jasper van de Gronde

Jon A. Cruz

Bryce Harrington

tags (0)

participants (10)