Using GCC's support for OpenMP (now also available in MinGW!), I took a stab at parallelizing the outer loop of the blur filters (these are typical DSP loops that should never throw an exception, so this should be perfectly safe). On my dual-core system, blurring a large image is approximately twice as fast, so it seems to scale really well.
As I don't have access to a Linux system at the moment, I haven't modified the Makefiles to add support, but it shouldn't be too difficult (see the changes to build.xml). Also, I've made it write a log so that the performance can be analyzed, but it uses Win32-specific functions for timing. So, if anyone would like to have a go at making it work on Linux and/or testing it, that would be great :)
The number of threads defaults to whatever omp_get_num_procs() returns, but on my system this doesn't work... So to use it, set the number of threads explicitly in the preferences by adding <group id="threading" numthreads="1" /> to the "options" group (so the preference path is /options/threading/numthreads). The value is clamped to the range 1 to 8.
Obviously this is not the most efficient way to do this, as new threads are created for each blur. But so far the overhead doesn't appear to be too bad compared with the actual blurring operation. And with OpenMP the impact on the code is minimal: I only had to put two #pragmas before the blur loops, allocate the temporary data it needs for every thread, and add a preference for the number of threads. So if this works out, it might provide a relatively easy way to start making use of all these dual- and quad-core machines people have lying around doing almost nothing these days :)
Patch attached; if wanted, I can also provide a binary for Windows. This version writes a log file (blurlog.txt) that lists how long each blur takes.
Note that Inkscape also needs mingwm10.dll to run with this patch. As it isn't included with devlibs at the moment, you'll have to copy it to Inkscape's directory manually (it's in the MinGW bin directory).
Index: build.xml =================================================================== --- build.xml (revision 20219) +++ build.xml (working copy) @@ -375,6 +375,7 @@ -Wall -Wformat -Werror=format-security -W -Wpointer-arith -Wcast-align -Wsign-compare -Woverloaded-virtual -Wswitch -O2 -mms-bitfields + -fopenmp </flags> <defines> -DVERSION="${version}" @@ -476,6 +477,7 @@ objcopycommand="${archutil}objcopy"> <flags> -mwindows + -mthreads </flags> <fileset dir="${build}"> <include name="inkres.o"/> @@ -501,6 +503,7 @@ -lpng -ljpeg.dll -ltiff.dll -lpopt ${devlibs}/lib/zdll.lib -lgc -lws2_32 -lintl -lgdi32 -lcomdlg32 -lm + -lgomp -lpthreadGC2 </libs> </link> </target> @@ -530,6 +533,7 @@ objcopycommand="${archutil}objcopy"> <flags> -mwindows + -mthreads </flags> <fileset dir="${build}"> <include name="inkviewres.o"/> @@ -553,6 +557,7 @@ -lpng -ljpeg.dll -ltiff.dll -lpopt ${devlibs}/lib/zdll.lib -lgc -lws2_32 -lintl -lgdi32 -lcomdlg32 -lm + -lgomp -lpthreadGC2 </libs> </link> </target> @@ -572,6 +577,7 @@ stripcommand="${archutil}strip" objcopycommand="${archutil}objcopy"> <flags> + -mthreads </flags> <fileset dir="${build}"> <include name="obj/test-main.o"/> @@ -602,6 +608,7 @@ -lpng -ljpeg.dll -ltiff.dll -lpopt ${devlibs}/lib/zdll.lib -lgc -lws2_32 -lintl -lgdi32 -lcomdlg32 -lm + -lgomp -lpthreadGC2 </libs> </link> </target> @@ -667,6 +674,7 @@ <copy todir="${dist}" file="${devlibs}/bin/libpopt-0.dll"/> <copy todir="${dist}" file="${devlibs}/bin/liblcms-1.dll"/> <copy todir="${dist}" file="${devlibs}/bin/intl.dll"/> + <copy todir="${dist}" file="${devlibs}/bin/pthreadGC2.dll"/> <copy file="${devlibs}/bin/intl.dll" tofile="${dist}/libintl-2.dll"/>
<!-- MSGFMT files --> Index: src/display/nr-filter-gaussian.cpp =================================================================== --- src/display/nr-filter-gaussian.cpp (revision 20219) +++ src/display/nr-filter-gaussian.cpp (working copy) @@ -16,9 +16,11 @@ #include <algorithm> #include <cmath> #include <complex> +#include <cstdlib> #include <glib.h> -#include <cstdlib> #include <limits> +#include <omp.h> +#include <windows.h> // For performance logging
#include "2geom/isnan.h"
@@ -268,9 +270,11 @@ filter2D_IIR(PT *const dest, int const dstr1, int const dstr2, PT const *const src, int const sstr1, int const sstr2, int const n1, int const n2, IIRValue const b[N+1], double const M[N*N], - IIRValue *const tmpdata) + IIRValue *const tmpdata[], int const num_threads) { +#pragma omp parallel for num_threads(num_threads) for ( int c2 = 0 ; c2 < n2 ; c2++ ) { + unsigned int tid = omp_get_thread_num(); // corresponding line in the source and output buffer PT const * srcimg = src + c2*sstr2; PT * dstimg = dest + c2*dstr2 + n1*dstr1; @@ -288,7 +292,7 @@ for(unsigned int i=1; i<N+1; i++) { for(unsigned int c=0; c<PC; c++) u[0][c] += u[i][c]*b[i]; } - copy_n(u[0], PC, tmpdata+c1*PC); + copy_n(u[0], PC, tmpdata[tid]+c1*PC); } // Backward pass IIRValue v[N+1][PC]; @@ -303,7 +307,7 @@ int c1=n1-1; while(c1-->0) { for(unsigned int i=N; i>0; i--) copy_n(v[i-1], PC, v[i]); - copy_n(tmpdata+c1*PC, PC, v[0]); + copy_n(tmpdata[tid]+c1*PC, PC, v[0]); for(unsigned int c=0; c<PC; c++) v[0][c] *= b[0]; for(unsigned int i=1; i<N+1; i++) { for(unsigned int c=0; c<PC; c++) v[0][c] += v[i][c]*b[i]; @@ -326,11 +330,12 @@ static void filter2D_FIR(PT *const dst, int const dstr1, int const dstr2, PT const *const src, int const sstr1, int const sstr2, - int const n1, int const n2, FIRValue const *const kernel, int const scr_len) + int const n1, int const n2, FIRValue const *const kernel, int const scr_len, int const num_threads) { // Past pixels seen (to enable in-place operation) PT history[scr_len+1][PC];
+#pragma omp parallel for num_threads(num_threads) private(history) for ( int c2 = 0 ; c2 < n2 ; c2++ ) {
// corresponding line in the source buffer @@ -539,13 +544,14 @@ }
// Some common constants + Inkscape::Preferences *prefs = Inkscape::Preferences::get(); int const width_org = in->area.x1-in->area.x0, height_org = in->area.y1-in->area.y0; double const deviation_x_org = _deviation_x * NR::expansionX(trans); double const deviation_y_org = _deviation_y * NR::expansionY(trans); int const PC = NR_PIXBLOCK_BPP(in); + int const NTHREADS = std::max(1,std::min(8,prefs->getInt("/options/threading/numthreads",omp_get_num_procs())));
// Subsampling constants - Inkscape::Preferences *prefs = Inkscape::Preferences::get(); int const quality = prefs->getInt("/options/blurquality/value"); int const x_step_l2 = _effect_subsample_step_log2(deviation_x_org, quality); int const y_step_l2 = _effect_subsample_step_log2(deviation_y_org, quality); @@ -567,6 +573,11 @@ bool const use_IIR_x = deviation_x > 3; bool const use_IIR_y = deviation_y > 3;
+ // Temporary performance logging + LARGE_INTEGER startTime, endTime, timeFrequency; + QueryPerformanceFrequency(&timeFrequency); + QueryPerformanceCounter(&startTime); + // new buffer for the subsampled output NRPixBlock *out = new NRPixBlock; nr_pixblock_setup_fast(out, in->mode, in->area.x0/x_step, in->area.y0/y_step, @@ -577,13 +588,19 @@ } // Temporary storage for IIR filter // NOTE: This can be eliminated, but it reduces the precision a bit - IIRValue * tmpdata = 0; + IIRValue * tmpdata[NTHREADS]; + std::fill_n(tmpdata, NTHREADS, (IIRValue*)0); if ( use_IIR_x || use_IIR_y ) { - tmpdata = new IIRValue[std::max(width,height)*PC]; - if (tmpdata == NULL) { - nr_pixblock_release(out); - delete out; - return 0; + for(int i=0; i<NTHREADS; i++) { + tmpdata[i] = new IIRValue[std::max(width,height)*PC]; + if (tmpdata[i] == NULL) { + nr_pixblock_release(out); + while(i-->0) { + delete[] tmpdata[i]; + } + delete out; + return 0; + } } } NRPixBlock *ssin = in; @@ -629,16 +646,16 @@ // Filter (x) switch(in->mode) { case NR_PIXBLOCK_MODE_A8: ///< Grayscale - filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, b, M, tmpdata); + filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, b, M, tmpdata, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8: ///< 8 bit RGB - filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, b, M, tmpdata); + filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, b, M, tmpdata, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA - filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata); + filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, 
height, b, M, tmpdata, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8P: ///< Premultiplied 8 bit RGBA - filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata); + filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, b, M, tmpdata, NTHREADS); break; default: assert(false); @@ -651,16 +668,16 @@ // Filter (x) switch(in->mode) { case NR_PIXBLOCK_MODE_A8: ///< Grayscale - filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, kernel, scr_len_x); + filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), 1, out->rs, NR_PIXBLOCK_PX(ssin), 1, ssin->rs, width, height, kernel, scr_len_x, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8: ///< 8 bit RGB - filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, kernel, scr_len_x); + filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), 3, out->rs, NR_PIXBLOCK_PX(ssin), 3, ssin->rs, width, height, kernel, scr_len_x, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA - filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x); + filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8P: ///< Premultiplied 8 bit RGBA - filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x); + filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), 4, out->rs, NR_PIXBLOCK_PX(ssin), 4, ssin->rs, width, height, kernel, scr_len_x, NTHREADS); break; default: assert(false); @@ -688,16 +705,16 @@ // Filter (y) switch(in->mode) { case NR_PIXBLOCK_MODE_A8: ///< Grayscale - filter2D_IIR<unsigned 
char,1,false>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, b, M, tmpdata); + filter2D_IIR<unsigned char,1,false>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, b, M, tmpdata, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8: ///< 8 bit RGB - filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, b, M, tmpdata); + filter2D_IIR<unsigned char,3,false>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, b, M, tmpdata, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA - filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata); + filter2D_IIR<unsigned char,4,false>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8P: ///< Premultiplied 8 bit RGBA - filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata); + filter2D_IIR<unsigned char,4,true >(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, b, M, tmpdata, NTHREADS); break; default: assert(false); @@ -710,24 +727,32 @@ // Filter (y) switch(in->mode) { case NR_PIXBLOCK_MODE_A8: ///< Grayscale - filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, kernel, scr_len_y); + filter2D_FIR<unsigned char,1>(NR_PIXBLOCK_PX(out), out->rs, 1, NR_PIXBLOCK_PX(out), out->rs, 1, height, width, kernel, scr_len_y, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8: ///< 8 bit RGB - filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, kernel, scr_len_y); + filter2D_FIR<unsigned char,3>(NR_PIXBLOCK_PX(out), out->rs, 3, NR_PIXBLOCK_PX(out), out->rs, 3, height, width, kernel, 
scr_len_y, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8N: ///< Normal 8 bit RGBA - filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y); + filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y, NTHREADS); break; case NR_PIXBLOCK_MODE_R8G8B8A8P: ///< Premultiplied 8 bit RGBA - filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y); + filter2D_FIR<unsigned char,4>(NR_PIXBLOCK_PX(out), out->rs, 4, NR_PIXBLOCK_PX(out), out->rs, 4, height, width, kernel, scr_len_y, NTHREADS); break; default: assert(false); }; }
- delete[] tmpdata; // deleting a nullptr has no effect, so this is save + for(int i=0; i<NTHREADS; i++) { + delete[] tmpdata[i]; // deleting a nullptr has no effect, so this is safe + }
+ // Temporary performance logging + QueryPerformanceCounter(&endTime); + FILE* logfile = fopen("blurlog.txt", "at"); + fprintf(logfile, "image size: %dx%d, threads: %d, time: %.3gs\n", width, height, NTHREADS, static_cast<double>(endTime.QuadPart-startTime.QuadPart)/timeFrequency.QuadPart); + fclose(logfile); + if ( !resampling ) { // No upsampling needed out->empty = FALSE;