--- nr-filter-gaussian.orig.cpp 2016-09-19 04:52:06.345314000 -0700
+++ nr-filter-gaussian.cpp 2016-09-19 04:51:34.310670194 -0700
@@ -12,6 +12,11 @@
  */
 
 #include "config.h" // Needed for HAVE_OPENMP
+#include
+#ifdef __SSE4_1__
+    #include <smmintrin.h>
+#endif
+#include
 #include
 #include
@@ -19,6 +24,7 @@
 #include
 #include
 #include
+#include
 #if HAVE_OPENMP
 #include
 #endif //HAVE_OPENMP
@@ -32,11 +38,18 @@
 #include <2geom/affine.h>
 #include "util/fixed_point.h"
 #include "preferences.h"
+#include
+#include
+#ifdef _WIN32
+    #include <windows.h>
+#endif
 
 #ifndef INK_UNUSED
 #define INK_UNUSED(x) ((void)(x))
 #endif
+using namespace std;
+
 // IIR filtering method based on:
 // L.J. van Vliet, I.T. Young, and P.W. Verbeek, Recursive Gaussian Derivative Filters,
 // in: A.K. Jain, S. Venkatesh, B.C. Lovell (eds.),
@@ -54,10 +67,12 @@
 // filters are used).
 static size_t const N = 3;
 
+#if __cplusplus < 201103
 template<typename InIt, typename Size, typename OutIt>
 inline void copy_n(InIt beg_in, Size N, OutIt beg_out) {
     std::copy(beg_in, beg_in+N, beg_out);
 }
+#endif
 
 // Type used for IIR filter coefficients (can be 10.21 signed fixed point, see Anisotropic Gaussian Filtering Using Fixed Point Arithmetic, Christoph H. Lampert & Oliver Wirjadi, 2006)
 typedef double IIRValue;
@@ -142,8 +157,8 @@
     return (int)std::ceil(std::fabs(deviation) * 3.0);
 }
 
-static void
-_make_kernel(FIRValue *const kernel, double const deviation)
+template
+static void _make_kernel(FIRValue *const kernel, double const deviation)
 {
     int const scr_len = _effect_area_scr(deviation);
     g_assert(scr_len >= 0);
@@ -546,6 +561,3815 @@
 };
 }
 
+// can Inkscape use Boost chrono?
+double Clock()
+{
+#ifdef _WIN32
+    static LARGE_INTEGER freq;
+    if (freq.QuadPart == 0)
+        QueryPerformanceFrequency(&freq);
+    LARGE_INTEGER t;
+    QueryPerformanceCounter(&t);
+    return t.QuadPart / (double)freq.QuadPart;
+#else
+    timespec t;
+    clock_gettime(CLOCK_REALTIME, &t);
+    return t.tv_sec + t.tv_nsec * 1.0E-9;
+#endif
+}
+#ifdef __AVX2__
+    #define USE_MULTIPLY_ADD
+#endif
+
+#ifndef __GNUC__
+__m128d operator + (__m128d a, __m128d b)
+{
+    return _mm_add_pd(a, b);
+}
+
+__m128d operator - (__m128d a, __m128d b)
+{
+    return _mm_sub_pd(a, b);
+}
+
+__m128d operator * (__m128d a, __m128d b)
+{
+    return _mm_mul_pd(a, b);
+}
+
+__m256d operator + (__m256d a, __m256d b)
+{
+    return _mm256_add_pd(a, b);
+}
+
+__m256d operator - (__m256d a, __m256d b)
+{
+    return _mm256_sub_pd(a, b);
+}
+
+__m256d operator * (__m256d a, __m256d b)
+{
+    return _mm256_mul_pd(a, b);
+}
+
+__m128 operator + (__m128 a, __m128 b)
+{
+    return _mm_add_ps(a, b);
+}
+
+__m128 operator - (__m128 a, __m128 b)
+{
+    return _mm_sub_ps(a, b);
+}
+
+__m128 operator * (__m128 a, __m128 b)
+{
+    return _mm_mul_ps(a, b);
+}
+
+__m256 operator + (__m256 a, __m256 b)
+{
+    return _mm256_add_ps(a, b);
+}
+
+__m256 operator - (__m256 a, __m256 b)
+{
+    return _mm256_sub_ps(a, b);
+}
+
+__m256 operator * (__m256 a, __m256 b)
+{
+    return _mm256_mul_ps(a, b);
+}
+#endif
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define FORCE_INLINE __attribute__((always_inline))
+#define ALIGN(x) __attribute__((aligned(x)))
+
+__m256 _mm256_setr_m128(__m128 lo, __m128 hi)
+{
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
+}
+
+__m256i _mm256_setr_m128i(__m128i lo, __m128i hi)
+{
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+__m256d _mm256_setr_m128d(__m128d lo, __m128d hi)
+{
+    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
+}
+
+#endif
+
+__m128 MultiplyAdd(__m128 a, __m128 b,
__m128 c) +{ + return _mm_fmadd_ps(a, b, c); +} + +__m256 MultiplyAdd(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fmadd_ps(a, b, c); +} + +__m256d MultiplyAdd(__m256d a, __m256d b, __m256d c) +{ + return _mm256_fmadd_pd(a, b, c); +} + + +float ExtractElement0(__m256 x) +{ + return _mm_cvtss_f32(_mm256_castps256_ps128(x)); +} + + +double ExtractElement0(__m256d x) +{ + return _mm_cvtsd_f64(_mm256_castpd256_pd128(x)); +} + +template +static void calcTriggsSdikaInitialization(double const M[N*N], float uold[N][SIZE], float const uplus[SIZE], float const vplus[SIZE], float const alpha, float vold[N][SIZE]) +{ + __m128 v4f_alpha = _mm_set1_ps(alpha); + ssize_t c; + for (c = 0; c + 4 <= SIZE; c += 4) + { + __m128 uminp[N]; + for(ssize_t i=0; i +static void calcTriggsSdikaInitialization(double const M[N*N], double uold[N][SIZE], double const uplus[SIZE], double const vplus[SIZE], double const alpha, double vold[N][SIZE]) +{ + __m128d v2f_alpha = _mm_set1_pd(alpha); + ssize_t c; + for (c = 0; c <= SIZE - 2; c += 2) + { + __m128d uminp[N]; + for(ssize_t i=0; i +class SimpleImage +{ +public: + SimpleImage() + { + } + SimpleImage(AnyType *b, ssize_t p) + { + buffer = b; + pitch = p; + } + AnyType *operator[](ssize_t y) + { + return (AnyType *)((uint8_t *)buffer + y * pitch); + } + SimpleImage SubImage(ssize_t x, ssize_t y) + { + return SimpleImage(&(*this)[y][x], pitch); + } + AnyType *buffer; + ssize_t pitch; +}; + +template +IntType RoundDown(IntType a, IntType b) +{ + return (a / b) * b; +} + +template +IntType RoundUp(IntType a, IntType b) +{ + return RoundDown(a + b - 1, b); +} + +#ifdef _WIN32 + #define aligned_alloc(a, s) _aligned_malloc(s, a) + #define aligned_free(x) _aligned_free(x) +#else + #define aligned_free(x) free(x) +#endif + +template +class AlignedImage : public SimpleImage +{ +public: + AlignedImage() + { + this->buffer = NULL; + } + void Resize(int width, int height) + { + if (this->buffer != NULL) + aligned_free(this->buffer); + this->pitch = RoundUp(ssize_t(width * sizeof(AnyType)), alignment); + this->buffer = (AnyType *)aligned_alloc(alignment, this->pitch * height); + } + ~AlignedImage() + { + if (this->buffer != NULL) + aligned_free(this->buffer); + } +}; + +template +struct MyTraits +{ +}; + +template <> +struct MyTraits +{ +#ifdef __AVX__ + typedef __m256 SIMDtype; + const static int SIMDwidth = 8; +#else + typedef __m128 SIMDtype; + const static int SIMDwidth = 4; +#endif +}; + +template <> +struct MyTraits +{ +#ifdef __AVX__ + typedef __m256i SIMDtype; + const static int SIMDwidth = 16; +#else + typedef __m128i SIMDtype; + const static int SIMDwidth = 8; +#endif +}; + +template <> +struct MyTraits +{ +}; + +template <> +struct MyTraits +{ +#ifdef __AVX__ + typedef __m256d SIMDtype; + const static int SIMDwidth = 4; +#else + typedef __m128d SIMDtype; + const static int SIMDwidth = 2; +#endif +}; + +// does 1D IIR convolution on multiple rows (height) of data +// IntermediateType must be float or double +template +FORCE_INLINE void Convolve1DHorizontalRef(SimpleImage out, + SimpleImage in, + IntermediateType *borderValues, // [y][color] + ssize_t xStart, ssize_t xEnd, ssize_t width, ssize_t height, + typename MyTraits::SIMDtype *vCoefficients, double M[N * N]) +{ + ssize_t xStep = isForwardPass ? 
1 : -1; + + ssize_t y = 0; + do + { + ssize_t c = 0; + do + { + IntermediateType prevOut[N]; + ssize_t x = xStart; + if (isBorder && !isForwardPass) + { + // xStart must be width - 1 + IntermediateType u[N + 1][1]; // u[0] = last forward filtered value, u[1] = 2nd last forward filtered value, ... + for (ssize_t i = 0; i < N + 1; ++i) + { + u[i][0] = in[y][(xStart + i * xStep) * channels + c]; + } + IntermediateType backwardsInitialState[N][1]; + calcTriggsSdikaInitialization<1>(M, u, &borderValues[y * channels + c], &borderValues[y * channels + c], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = backwardsInitialState[i][0]; + + if (transposeOut) + out[x][y * channels + c] = clip_round_cast(prevOut[0]); + else + out[y][x * channels + c] = clip_round_cast(prevOut[0]); + x += xStep; + if (x == xEnd) + goto nextIteration; // do early check here so that we can still use do-while for forward pass + } + else if (isBorder && isForwardPass) + { + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = in[y][0 * channels + c]; + } + else + { + for (ssize_t i = 0; i < N; ++i) + { + prevOut[i] = transposeOut ? out[xStart - (i + 1) * xStep][y * channels + c] + : out[y][(xStart - (i + 1) * xStep) * channels + c]; + } + } + + do + { + IntermediateType sum = prevOut[0] * ExtractElement0(vCoefficients[1]) + + prevOut[1] * ExtractElement0(vCoefficients[2]) + + prevOut[2] * ExtractElement0(vCoefficients[3]) + + in[y][x * channels + c] * ExtractElement0(vCoefficients[0]); // add last for best accuracy since this terms tends to be the smallest + if (transposeOut) + out[x][y * channels + c] = clip_round_cast(sum); + else + out[y][x * channels + c] = clip_round_cast(sum); + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = sum; + x += xStep; + } while (x != xEnd); + ++c; + } while (c < channels); + nextIteration: + ++y; + } while (y < height); +} + + +const ALIGN(32) uint8_t MASKS[64] = +{ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +FORCE_INLINE __m128i PartialVectorMask(ssize_t n) +{ + return _mm_loadu_si128((__m128i *)&MASKS[sizeof(MASKS) / 2 - n]); +} + +FORCE_INLINE __m256i PartialVectorMask32(ssize_t n) +{ + return _mm256_loadu_si256((__m256i *)&MASKS[sizeof(MASKS) / 2 - n]); +} + +#ifdef _MSC_VER +FORCE_INLINE __m128i PartialVectorMask8(ssize_t n) +{ + return _mm_loadl_epi64((__m128i *)&MASKS[sizeof(MASKS) / 2 - n]); +} +#else +// return __m64 so that it can be used by _mm_movemask_si64() +FORCE_INLINE __m64 PartialVectorMask8(ssize_t n) +{ + return _mm_cvtsi64_m64(*(int64_t *)&MASKS[sizeof(MASKS) / 2 - n]); +} +#endif + +#ifdef __AVX__ + +inline __m256i Make256(__m128i v0, __m128i v1) +{ + return _mm256_insertf128_si256(_mm256_castsi128_si256(v0), v1, 1); +} + +inline __m256 Make256(__m128 v0, __m128 v1) +{ + return _mm256_insertf128_ps(_mm256_castps128_ps256(v0), v1, 1); +} + + +__m256d LoadDoubles(double *x) +{ + return _mm256_loadu_pd(x); +} + +__m256d Load4Doubles(double *x) +{ + return _mm256_loadu_pd(x); +} + +__m256d Load4Doubles(float *x) +{ + return _mm256_cvtps_pd(_mm_loadu_ps(x)); +} + +__m256d Load4Doubles(uint8_t *x) +{ + return _mm256_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)x))); +} + +__m256d Load4Doubles(uint16_t *x) +{ + return 
_mm256_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)x))); +} + +__m256 LoadFloats(float *x) +{ + return _mm256_loadu_ps(x); +} + +__m256 LoadFloats(uint8_t *x) +{ + __m128i temp = _mm_loadl_epi64((__m128i *)x); + return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(temp)); +} + +__m256 LoadFloats(uint16_t *x) +{ + __m128i temp = _mm_loadu_si128((__m128i *)x); + __m256i i32; +#ifdef __AVX2__ + i32 = _mm256_cvtepu16_epi32(temp); +#else + __m128i zero = _mm_setzero_si128(); + i32 = Make256(_mm_unpacklo_epi16(temp, zero), _mm_unpackhi_epi16(temp, zero)); +#endif + return _mm256_cvtepi32_ps(i32); +} + +template // no, this parameter isn't redundant - without it, there will be a redundant n == 4 check when partial = 0 +void StoreDoubles(double *out, __m256d x, ssize_t n = 4) +{ + if (partial) + _mm256_maskstore_pd(out, PartialVectorMask32(n * sizeof(double)), x); + else + _mm256_storeu_pd(out, x); +} + +template +void StoreDoubles(float *out, __m256d x, ssize_t n = 4) +{ + __m128 f32 = _mm256_cvtpd_ps(x); + if (partial) + _mm_maskstore_ps(out, PartialVectorMask(n * sizeof(float)), f32); + else + _mm_storeu_ps(out, f32); +} + +template +void StoreDoubles(uint16_t *out, __m256d x, ssize_t n = 4) +{ + __m128i i32 = _mm256_cvtpd_epi32(x), + u16 = _mm_packus_epi32(i32, i32); + if (partial) + { +#ifdef _MSC_VER + _mm_maskmoveu_si128(u16, PartialVectorMask8(n * sizeof(int16_t)), (char *)out); +#else + _mm_maskmove_si64(_mm_movepi64_pi64(u16), PartialVectorMask8(n * sizeof(int16_t)), (char *)out); +#endif + } + else + _mm_storel_epi64((__m128i *)out, u16); +} + +template +void StoreDoubles(uint8_t *out, __m256d x, ssize_t n = 4) +{ + __m128i i32 = _mm256_cvtpd_epi32(x), + u16 = _mm_packus_epi32(i32, i32), + u8 = _mm_packus_epi16(u16, u16); + if (partial) + { +#ifdef _MSC_VER + _mm_maskmoveu_si128(u8, PartialVectorMask8(n), (char *)out); +#else + _mm_maskmove_si64(_mm_movepi64_pi64(u8), PartialVectorMask8(n), (char *)out); +#endif + } + else + *(int32_t *)out = _mm_cvtsi128_si32(u8); +} + +void StoreDoubles(double *out, __m256d x) +{ + _mm256_storeu_pd(out, x); +} + +void StoreDoubles(float *out, __m256d x) +{ + _mm_storeu_ps(out, _mm256_cvtpd_ps(x)); +} + +void StoreDoubles(uint8_t *out, __m256d x) +{ + __m128i vInt = _mm_cvtps_epi32(_mm256_cvtpd_ps(x)); + *(int32_t *)out = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(vInt, vInt), vInt)); +} + +template // no, this parameter isn't redundant - without it, there will be a redundant n == 8 check when partial = 0 +void StoreFloats(float *out, __m256 x, ssize_t n = 8) +{ + if (partial) + _mm256_maskstore_ps(out, PartialVectorMask32(n * sizeof(float)), x); + else + _mm256_storeu_ps(out, x); +} + +template +void StoreFloats(uint16_t *out, __m256 x, ssize_t n = 8) +{ + __m256i i32 = _mm256_cvtps_epi32(x); + __m128i u16 = _mm_packus_epi32(_mm256_castsi256_si128(i32), _mm256_extractf128_si256(i32, 1)); + if (partial) + _mm_maskmoveu_si128(u16, PartialVectorMask(n * sizeof(int16_t)), (char *)out); + else + _mm_storeu_si128((__m128i *)out, u16); +} + +template +void StoreFloats(uint8_t *out, __m256 x, ssize_t n = 8) +{ + __m256i i32 = _mm256_cvtps_epi32(x); + __m256i i32Hi = _mm256_castsi128_si256(_mm256_extractf128_si256(i32, 1)); + __m128i u16 = _mm256_castsi256_si128(_mm256_packus_epi32(i32, i32Hi)), + u8 = _mm_packus_epi16(u16, u16); + if (partial) + { +#ifdef _MSC_VER + _mm_maskmoveu_si128(u8, PartialVectorMask8(n), (char *)out); +#else + _mm_maskmove_si64(_mm_movepi64_pi64(u8), PartialVectorMask8(n), (char *)out); +#endif + } + else + 
_mm_storel_epi64((__m128i *)out, u8); +} + +template // no, this parameter isn't redundant - without it, there will be a redundant n == 4 check when partial = 0 +void StoreFloats(float *out, __m128 x, ssize_t n = 4) +{ + if (partial) + _mm_maskstore_ps(out, PartialVectorMask(n * sizeof(float)), x); + else + _mm_storeu_ps(out, x); +} + +template +void StoreFloats(uint16_t *out, __m128 x, ssize_t n = 4) +{ + __m128i i32 = _mm_cvtps_epi32(x), + u16 = _mm_packus_epi32(i32, i32); + if (partial) + { +#ifdef _MSC_VER + _mm_maskmoveu_si128(u16, PartialVectorMask(n * sizeof(int16_t)), (char *)out); +#else + _mm_maskmove_si64(_mm_movepi64_pi64(u16), PartialVectorMask8(n * sizeof(int16_t)), (char *)out); +#endif + } + else + _mm_storel_epi64((__m128i *)out, u16); +} + +template +void StoreFloats(uint8_t *out, __m128 x, ssize_t n = 4) +{ + __m128i i32 = _mm_cvtps_epi32(x), + u8 = _mm_packus_epi16(_mm_packus_epi32(i32, i32), i32); + if (partial) + { +#ifdef _MSC_VER + _mm_maskmoveu_si128(u8, PartialVectorMask(n), (char *)out); +#else + _mm_maskmove_si64(_mm_movepi64_pi64(u8), PartialVectorMask8(n), (char *)out); +#endif + } + else + *(int32_t *)out = _mm_cvtsi128_si32(u8); +} + +__m256 BroadcastSIMD(float x) +{ + return _mm256_set1_ps(x); +} + +__m256d BroadcastSIMD(double x) +{ + return _mm256_set1_pd(x); +} + +__m256i BroadcastSIMD(int16_t x) +{ + return _mm256_set1_epi16(x); +} + +#else + +__m128 BroadcastSIMD(float x) +{ + return _mm_set1_ps(x); +} + +__m128d BroadcastSIMD(double x) +{ + return _mm_set1_pd(x); +} + +__m128i BroadcastSIMD(int16_t x) +{ + return _mm_set1_epi16(x); +} + +#endif + + + +#ifdef __SSE__ +__m128 Load4Floats(float *x) +{ + return _mm_loadu_ps(x); +} + +__m128 Load4Floats(uint8_t *x) +{ + return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)x))); +} + +__m128 Load4Floats(uint16_t *x) +{ + return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)x))); +} + +void StoreFloats(float *out, __m128 x) +{ + _mm_storeu_ps(out, x); +} + +void StoreFloats(uint16_t *out, __m128 x) +{ + __m128i vInt = _mm_cvtps_epi32(x); + _mm_storel_epi64((__m128i *)out, _mm_packus_epi32(vInt, vInt)); +} + +void StoreFloats(uint8_t *out, __m128 x) +{ + __m128i vInt = _mm_cvtps_epi32(x); + *(int32_t *)out = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(vInt, vInt), vInt)); +} + +__m128d Load2Doubles(double *x) +{ + return _mm_loadu_pd(x); +} + +__m128d Load2Doubles(uint8_t *x) +{ + return _mm_cvtepi32_pd(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)x))); +} + +__m128d Load2Doubles(uint16_t *x) +{ + return _mm_cvtepi32_pd(_mm_cvtepu16_epi32(_mm_cvtsi32_si128(*(int32_t *)x))); +} + +void StoreDoubles(double *out, __m128d x) +{ + _mm_storeu_pd(out, x); +} + +void StoreDoubles(uint16_t *out, __m128d x) +{ + __m128i i32 = _mm_cvtpd_epi32(x), + u16 = _mm_packus_epi32(i32, i32); + *(uint32_t *)out = _mm_cvtsi128_si32(u16); +} + +#endif + +__m256 Load4x2Floats(uint8_t *row0, uint8_t *row1) +{ + return _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)row0)), + _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)row1)))); +} + +__m256 Load4x2Floats(uint16_t *row0, uint16_t *row1) +{ + return _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)row0)), + _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)row1)))); +} + +__m256 Load4x2Floats(float *row0, float *row1) +{ + return _mm256_setr_m128(_mm_loadu_ps(row0), _mm_loadu_ps(row1)); +} + +template +FORCE_INLINE void DoOneIIR(SimpleImage out, SimpleImage in, __m256d 
&vSum, __m256d &vIn, ssize_t x, ssize_t y, __m256d vCoefficients[N + 1], __m256d prevOut[N]) +{ + vSum = vIn * vCoefficients[0]; + vIn = Load4Doubles(&in[y][(x + xStep) * channels]); // load data for next iteration early to hide latency (software pipelining) + + // since coefficient[0] * in can be very small, it should be added to a similar magnitude term to minimize rounding error. For this gaussian filter, the 2nd smallest term is usually coefficient[3] * out[-3] +#ifdef USE_MULTIPLY_ADD + // this expression uses fewer MADs than the max. possible, but has a shorter critical path and is actually faster + vSum = MultiplyAdd(prevOut[i2], vCoefficients[3], vSum) + MultiplyAdd(prevOut[i1], vCoefficients[2], prevOut[i0] * vCoefficients[1]); +#else + vSum = prevOut[i0] * vCoefficients[1] + + prevOut[i1] * vCoefficients[2] + + prevOut[i2] * vCoefficients[3] + + vIn * vCoefficients[0]; +#endif + if (transposeOut) + StoreDoubles(&out[x][y * channels], vSum); + else + StoreDoubles(&out[y][x * channels], vSum); +} + +// input is always untransposed +// for reverse pass, input is output from forward pass +// for transposed output, in-place operation isn't possible +template +FORCE_INLINE void Convolve1DHorizontal(SimpleImage out, + SimpleImage in, + double *borderValues, + ssize_t xStart, ssize_t xEnd, ssize_t width, ssize_t height, + MyTraits::SIMDtype *vCoefficients, double M[N * N]) +{ +#if 0 + + Convolve1DHorizontalRef(out, + in, + borderValues, + xStart, xEnd, width, height, + vCoefficients, M); + return; + +#endif + const ssize_t xStep = isForwardPass ? 1 : -1; + if (channels == 4) + { +#ifdef __AVX__ + ssize_t y = 0; + do + { + __m256d prevOut[N]; + + ssize_t x = xStart; + if (isBorder && !isForwardPass) + { + // condition: xStart must be width - 1 + double u[N + 1][channels]; //[x][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm256_storeu_pd(u[i], Load4Doubles(&in[y][(xStart + i * xStep) * channels])); + } + double backwardsInitialState[N][channels]; + calcTriggsSdikaInitialization(M, u, &borderValues[y * channels], &borderValues[y * channels], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = LoadDoubles(backwardsInitialState[i]); + + if (transposeOut) + StoreDoubles(&out[x][y * channels], prevOut[0]); + else + StoreDoubles(&out[y][x * channels], prevOut[0]); + + x += xStep; + if (x == xEnd) + goto nextIteration; + } + else if (isBorder && isForwardPass) + { + __m256d firstPixel = Load4Doubles(&in[y][0 * channels]); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + { + prevOut[i] = transposeOut ? Load4Doubles(&out[xStart - (i + 1) * xStep][y * channels]) + : Load4Doubles(&out[y][(xStart - (i + 1) * xStep) * channels]); + } + } + +#if 0 // no measurable speedup + // same as loop below, but unrolled 3 times to increase ||ism, hide latency, and reduce overhead of shifting the sliding window (prevOut) + __m256d vIn = Load4Doubles(&in[y][xStart * channels]); + for ( ; isForwardPass ? (x < xEnd - 3) : (x > xEnd + 3); ) + { + __m256d vSum; + DoOneIIR(out, in, vSum, vIn, x, y, vCoefficients, prevOut); + prevOut[2] = vSum; + x += xStep; + + DoOneIIR(out, in, vSum, vIn, x, y, vCoefficients, prevOut); + prevOut[1] = vSum; + x += xStep; + + DoOneIIR(out, in, vSum, vIn, x, y, vCoefficients, prevOut); + prevOut[0] = vSum; + x += xStep; + } +#endif + while (isForwardPass ? 
(x < xEnd) : (x > xEnd)) + { + __m256d vIn = Load4Doubles(&in[y][x * channels]), + vSum; + + // since coefficient[0] * in can be very small, it should be added to a similar magnitude term to minimize rounding error. For this gaussian filter, the 2nd smallest term is usually coefficient[3] * out[-3] + #ifdef USE_MULTIPLY_ADD + // this expression uses fewer MADs than the max. possible, but has a shorter critical path and is actually faster + vSum = MultiplyAdd(vIn, vCoefficients[0], prevOut[2] * vCoefficients[3]) + MultiplyAdd(prevOut[1], vCoefficients[2], prevOut[0] * vCoefficients[1]); + #else + vSum = prevOut[0] * vCoefficients[1] + + prevOut[1] * vCoefficients[2] + + prevOut[2] * vCoefficients[3] + + vIn * vCoefficients[0]; + #endif + if (transposeOut) + StoreDoubles(&out[x][y * channels], vSum); + else + StoreDoubles(&out[y][x * channels], vSum); + + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + x += xStep; + } + nextIteration: + ++y; + } while (y < height); +#endif + // todo: implement for SSE only + } + else + { + ssize_t y = 0; + do + { + if (isForwardPass) + { + ssize_t x = xStart; + __m128d feedback[2], + k0[2]; + k0[0] = _mm256_castpd256_pd128(vCoefficients[0]); + k0[1] = _mm256_extractf128_pd(vCoefficients[0], 1); + + + if (isBorder && isForwardPass) + { + // xStart must be 0 + feedback[0] = feedback[1] = _mm_set1_pd(in[y][0]); + } + else + { + feedback[0] = Load2Doubles(&out[y][xStart - 3 * xStep]); + feedback[1] = Load2Doubles(&out[y][xStart - 1 * xStep]); + + feedback[1] = _mm_shuffle_pd(feedback[0], feedback[1], _MM_SHUFFLE2(0, 1)); + feedback[0] = _mm_shuffle_pd(feedback[0], feedback[0], _MM_SHUFFLE2(0, 0)); + } + // feedback = [garbage y-3 y-2 y-1] + for (; x != xEnd; x += xStep) + { + __m128d _in = _mm_set1_pd(in[y][x]); + + feedback[0] = _mm_blend_pd(feedback[0], _in, 0x1); + + __m128d newOutput = _mm_add_pd(_mm_dp_pd(feedback[0], k0[0], 0x31), + _mm_dp_pd(feedback[1], k0[1], 0x31)); + feedback[0] = _mm_blend_pd(feedback[0], newOutput, 0x1); // insert back input + + out[y][x] = _mm_cvtsd_f64(newOutput); + feedback[0] = _mm_shuffle_pd(feedback[0], feedback[1], _MM_SHUFFLE2(0, 0)); + feedback[1] = _mm_shuffle_pd(feedback[1], feedback[0], _MM_SHUFFLE2(0, 1)); + } + } + else + { + __m128d feedback[2], k4[2]; + k4[0] = _mm256_castpd256_pd128(vCoefficients[4]); + k4[1] = _mm256_extractf128_pd(vCoefficients[4], 1); + + ssize_t x = xStart; + if (isBorder && !isForwardPass) + { + // xstart must be width - 1 + double u[N + 1][1]; //[x][y][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + u[i][0] = in[y][xStart + i * xStep]; + } + double backwardsInitialState[N][1]; + calcTriggsSdikaInitialization<1>(M, u, &borderValues[y * channels], &borderValues[y * channels], ExtractElement0(vCoefficients[0]), backwardsInitialState); + + feedback[0] = _mm_load_pd(&backwardsInitialState[0][0]); + feedback[1] = _mm_load_pd(&backwardsInitialState[2][0]); + + out[y][x] = backwardsInitialState[0][0]; + x += xStep; + if (x == xEnd) + continue; + } + else + { + feedback[0] = Load2Doubles(&out[y][xStart - xStep]); + feedback[1] = Load2Doubles(&out[y][xStart - 3 * xStep]); + } + + for (; x != xEnd; x += xStep) + { + __m128d _in = _mm_set1_pd(in[y][x]); + + feedback[1] = _mm_blend_pd(feedback[1], _in, 0x2); + + __m128d newOutput = _mm_add_pd(_mm_dp_pd(feedback[0], k4[0], 0x32), + _mm_dp_pd(feedback[1], k4[1], 0x32)); + feedback[1] = _mm_blend_pd(feedback[1], newOutput, 0x2); // insert back input + + __m128d temp = _mm_shuffle_pd(feedback[1], feedback[1], _MM_SHUFFLE2(0, 
1)); + out[y][x] = _mm_cvtsd_f64(temp); + + feedback[1] = _mm_shuffle_pd(feedback[0], feedback[1], _MM_SHUFFLE2(1, 1)); + feedback[0] = _mm_shuffle_pd(feedback[1], feedback[0], _MM_SHUFFLE2(0, 1)); + } + } + ++y; + } while (y < height); + } +} + + +template +FORCE_INLINE void DoOneIIR(SimpleImage out, SimpleImage in, __m256 &vSum, __m256 &vIn, ssize_t x, ssize_t y, __m256 vCoefficients[N + 1], __m256 prevOut[N]) +{ + vSum = vIn * vCoefficients[0]; + + // load data for next iteration early to hide latency (software pipelining) + vIn = Make256(Load4Floats(&in[y][(x + xStep) * channels]), + Load4Floats(&in[y + 1][(x + xStep) * channels])); + + // since coefficient[0] * in can be very small, it should be added to a similar magnitude term to minimize rounding error. For this gaussian filter, the 2nd smallest term is usually coefficient[3] * out[-3] +#ifdef USE_MULTIPLY_ADD + // this expression uses fewer MADs than the max. possible, but has a shorter critical path and is actually faster + vSum = MultiplyAdd(prevOut[i2], vCoefficients[3], vSum) + MultiplyAdd(prevOut[i1], vCoefficients[2], prevOut[i0] * vCoefficients[1]); +#else + vSum = prevOut[i0] * vCoefficients[1] + + prevOut[i1] * vCoefficients[2] + + prevOut[i2] * vCoefficients[3] + + vIn * vCoefficients[0]; +#endif + if (transposeOut) + { + StoreFloats(&out[x][y * channels], _mm256_castps256_ps128(vSum)); + StoreFloats(&out[x][(y + 1) * channels], _mm256_extractf128_ps(vSum, 1)); + } + else + { + StoreFloats(&out[y][x * channels], _mm256_castps256_ps128(vSum)); + StoreFloats(&out[y + 1][x * channels], _mm256_extractf128_ps(vSum, 1)); + } +} + +template +FORCE_INLINE void Convolve1DHorizontal(SimpleImage out, + SimpleImage in, + float *borderValues, + ssize_t xStart, ssize_t xEnd, ssize_t width, ssize_t height, + MyTraits::SIMDtype *vCoefficients, double M[N * N]) +{ +#if 0 + MyTraits::SIMDtype coefficients2[4]; + + if (channels == 1) + { + coefficients2[0] = _mm256_set1_ps(((float *)vCoefficients)[0]); + coefficients2[1] = _mm256_set1_ps(((float *)vCoefficients)[3]); + coefficients2[2] = _mm256_set1_ps(((float *)vCoefficients)[2]); + coefficients2[3] = _mm256_set1_ps(((float *)vCoefficients)[1]); + vCoefficients = coefficients2; + } + Convolve1DHorizontalRef(out, + in, + borderValues, + xStart, xEnd, width, height, + vCoefficients, M); + return; +#endif + const ssize_t xStep = isForwardPass ? 
1 : -1; + + if (channels == 4) + { + ssize_t y = 0; +#ifdef __AVX__ + for (; y <= height - 2; y += 2) // AVX code process 2 rows at a time + { + __m256 prevOut[N]; + + ssize_t x = xStart; + + if (isBorder && !isForwardPass) + { + float u[N + 1][2 * channels]; //[x][y][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm_storeu_ps(&u[i][0], Load4Floats(&in[y][(xStart + i * xStep) * channels])); + _mm_storeu_ps(&u[i][channels], Load4Floats(&in[y + 1][(xStart + i * xStep) * channels])); + } + float backwardsInitialState[N][2 * channels]; + calcTriggsSdikaInitialization<2 * channels>(M, u, &borderValues[y * channels], &borderValues[y * channels], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = LoadFloats(backwardsInitialState[i]); + + if (transposeOut) + { + StoreFloats(&out[x][y * channels], _mm256_castps256_ps128(prevOut[0])); + StoreFloats(&out[x][(y + 1) * channels], _mm256_extractf128_ps(prevOut[0], 1)); + } + else + { + StoreFloats(&out[y][x * channels], _mm256_castps256_ps128(prevOut[0])); + StoreFloats(&out[y + 1][x * channels], _mm256_extractf128_ps(prevOut[0], 1)); + } + x += xStep; + if (x == xEnd) + continue; + } + else if (isBorder && isForwardPass) + { + // xStart must be 0 + __m256 firstPixel = Make256(Load4Floats(&in[y][0 * channels]), + Load4Floats(&in[y + 1][0 * channels])); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + { + prevOut[i] = transposeOut ? Make256(Load4Floats(&out[xStart - (i + 1) * xStep][y * channels]), + Load4Floats(&out[xStart - (i + 1) * xStep][(y + 1) * channels])) + : Make256(Load4Floats(&out[y][(xStart - (i + 1) * xStep) * channels]), + Load4Floats(&out[y + 1][(xStart - (i + 1) * xStep) * channels])); + } + } + +#if 0 // 2x slower than no unrolling - too many register spills? + // same as loop below, but unrolled 3 times to increase ||ism, hide latency, and reduce overhead of shifting the sliding window (prevOut) + __m256 vIn = Make256(Load4Floats(&in[y][xStart * channels]), + Load4Floats(&in[y + 1][xStart * channels])); + + for (; isForwardPass ? (x < xEnd - 3) : (x > xEnd + 3); ) + { + __m256 vSum; + DoOneIIR(out, in, vSum, vIn, x, y, vCoefficients, prevOut); + prevOut[2] = vSum; + x += xStep; + + DoOneIIR(out, in, vSum, vIn, x, y, vCoefficients, prevOut); + prevOut[1] = vSum; + x += xStep; + + DoOneIIR(out, in, vSum, vIn, x, y, vCoefficients, prevOut); + prevOut[0] = vSum; + x += xStep; + } +#endif + for (; x != xEnd; x += xStep) + { + __m256 vIn = Make256(Load4Floats(&in[y][x * channels]), + Load4Floats(&in[y + 1][x * channels])), + vSum; + + // since coefficient[0] * in can be very small, it should be added to a similar magnitude term to minimize rounding error. For this gaussian filter, the 2nd smallest term is usually coefficient[3] * out[-3] +#ifdef USE_MULTIPLY_ADD + // this expression uses fewer MADs than the max. 
possible, but has a shorter critical path and is actually faster + vSum = MultiplyAdd(vIn, vCoefficients[0], prevOut[2] * vCoefficients[3]) + MultiplyAdd(prevOut[1], vCoefficients[2], prevOut[0] * vCoefficients[1]); +#else + vSum = prevOut[0] * vCoefficients[1] + + prevOut[1] * vCoefficients[2] + + prevOut[2] * vCoefficients[3] + + vIn * vCoefficients[0]; +#endif + + if (transposeOut) + { + StoreFloats(&out[x][y * channels], _mm256_castps256_ps128(vSum)); + StoreFloats(&out[x][(y + 1) * channels], _mm256_extractf128_ps(vSum, 1)); + } + else + { + StoreFloats(&out[y][x * channels], _mm256_castps256_ps128(vSum)); + StoreFloats(&out[y + 1][x * channels], _mm256_extractf128_ps(vSum, 1)); + } + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + } + } +#endif + for (; y < height; ++y) + { + __m128 prevOut[N]; + ssize_t x = xStart; + + if (isBorder && !isForwardPass) + { + float u[N + 1][channels]; //[x][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm_storeu_ps(u[i], Load4Floats(&in[y][(xStart + i * xStep) * channels])); + } + float backwardsInitialState[N][channels]; + calcTriggsSdikaInitialization(M, u, &borderValues[y * channels], &borderValues[y * channels], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = Load4Floats(backwardsInitialState[i]); + + if (transposeOut) + StoreFloats(&out[x][y * channels], prevOut[0]); + else + StoreFloats(&out[y][x * channels], prevOut[0]); + x += xStep; + if (x == xEnd) + continue; + } + else if (isBorder && isForwardPass) + { + // xStart must be 0 + __m128 firstPixel = Load4Floats(&in[y][0 * channels]); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + { + prevOut[i] = transposeOut ? Load4Floats(&out[xStart - (i + 1) * xStep][y * channels]) + : Load4Floats(&out[y][(xStart - (i + 1) * xStep) * channels]); + } + } + + do + { + __m128 vIn = Load4Floats(&in[y][x * channels]), + vSum; + // since coefficient[0] * in can be very small, it should be added to a similar magnitude term to minimize rounding error. For this gaussian filter, the 2nd smallest term is usually coefficient[3] * out[-3] +#ifdef USE_MULTIPLY_ADD + // this expression uses fewer MADs than the max. 
possible, but has a shorter critical path and is actually faster + vSum = MultiplyAdd(vIn, _mm256_castps256_ps128(vCoefficients[0]), prevOut[2] * _mm256_castps256_ps128(vCoefficients[3])) + MultiplyAdd(prevOut[1], _mm256_castps256_ps128(vCoefficients[2]), prevOut[0] * _mm256_castps256_ps128(vCoefficients[1])); +#else + vSum = prevOut[0] * _mm256_castps256_ps128(vCoefficients[1]) + + prevOut[1] * _mm256_castps256_ps128(vCoefficients[2]) + + prevOut[2] * _mm256_castps256_ps128(vCoefficients[3]) + + vIn * _mm256_castps256_ps128(vCoefficients[0]); +#endif + if (transposeOut) + { + StoreFloats(&out[x][y * channels], vSum); + } + else + { + StoreFloats(&out[y][x * channels], vSum); + } + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + x += xStep; + } while (x != xEnd); + } + } + else + { + //static_assert(!transposeOut, "transpose not supported"); + ssize_t y = 0; + + const ssize_t Y_BLOCK_SIZE = 8; +#ifdef __AVX__ + for (; y <= height - Y_BLOCK_SIZE; y += Y_BLOCK_SIZE) + { + if (isForwardPass) + { + ssize_t x = xStart; + __m256 feedback[4], + k0 = vCoefficients[0], + k1 = vCoefficients[1], + k2 = vCoefficients[2], + k3 = vCoefficients[3]; + + if (isBorder && isForwardPass) + { + // xStart must be 0 + + for (ssize_t i = 0; i < 4; ++i) + { + feedback[i] = Make256(_mm_set1_ps(in[y + i * 2][0]), + _mm_set1_ps(in[y + i * 2 + 1][0])); + } + } + else + { + for (ssize_t i = 0; i < 4; ++i) + { + feedback[i] = Load4x2Floats(&out[y + i * 2][xStart - 3 * xStep], + &out[y + i * 2 + 1][xStart - 3 * xStep]); + feedback[i] = _mm256_shuffle_ps(feedback[i], feedback[i], _MM_SHUFFLE(2, 1, 0, 0)); + } + } + // feedback0 = [garbage y-3 y-2 y-1] + for (; x <= xEnd - 4; x += 4) + { + __m256 _in[4]; + for (ssize_t i = 0; i < 4; ++i) + _in[i] = Load4x2Floats(&in[y + i * 2][x], &in[y + i * 2 + 1][x]); + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x11); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k0, 0xf1), 0x11); // insert back input + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x22); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k1, 0xf2), 0x22); // insert back input + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x44); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k2, 0xf4), 0x44); // insert back input + } + + for (ssize_t i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x88); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k3, 0xf8), 0x88); // insert back input + + _mm_storeu_ps((float *)&out[y + i * 2][x], _mm256_castps256_ps128(feedback[i])); + _mm_storeu_ps((float *)&out[y + i * 2 + 1][x], _mm256_extractf128_ps(feedback[i], 1)); + } + } + for (; x != xEnd; x += xStep) + { + // todo: make these scalar loads to avoid buffer overflow + __m256 _in[4]; + for (ssize_t i = 0; i < 4; ++i) + { + _in[i] = Load4x2Floats(&in[y + i * 2][x], + &in[y + i * 2 + 1][x]); + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x11); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k0, 0xf1), 0x11); // insert back input + } + + for (ssize_t i = 0; i < 4; ++i) + { + out[y + i * 2][x] = _mm_cvtss_f32(_mm256_castps256_ps128(feedback[i])); + out[y + i * 2 + 1][x] = _mm_cvtss_f32(_mm256_extractf128_ps(feedback[i], 1)); + } + + for (int i = 0; i < 4; ++i) + feedback[i] = 
_mm256_shuffle_ps(feedback[i], feedback[i], _MM_SHUFFLE(0, 3, 2, 0)); + } + } + else + { + __m256 feedback[4], + k4 = vCoefficients[4], + k5 = vCoefficients[5], + k6 = vCoefficients[6], + k7 = vCoefficients[7]; + ssize_t x = xStart; + if (isBorder && !isForwardPass) + { + // xstart must be width - 1 + float u[N + 1][8 * channels]; //[x][y][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + for (ssize_t _y = 0; _y < 8; ++_y) + u[i][_y] = in[y + _y][xStart + i * xStep]; + } + float backwardsInitialState[N][8 * channels]; + calcTriggsSdikaInitialization<8 * channels>(M, u, &borderValues[y * channels], &borderValues[y * channels], ExtractElement0(vCoefficients[0]), backwardsInitialState); + + for (ssize_t i = 0; i < 4; ++i) + { + float temp[2][N + 1]; + for (ssize_t j = 0; j < N; ++j) + { + temp[0][j] = backwardsInitialState[j][i * 2]; + temp[1][j] = backwardsInitialState[j][i * 2 + 1]; + } + feedback[i] = Load4x2Floats(temp[0], temp[1]); + } + + for (ssize_t _y = 0; _y < Y_BLOCK_SIZE; ++_y) + out[y + _y][x] = backwardsInitialState[0][_y]; + + x += xStep; + if (x == xEnd) + continue; + } + else + { + for (ssize_t i = 0; i < 4; ++i) + { + feedback[i] = Load4x2Floats(&out[y + i * 2][xStart - xStep], + &out[y + i * 2 + 1][xStart - xStep]); + } + } + for (; x - 4 >= xEnd; x -= 4) + { + __m256 _in[4]; + for (ssize_t i = 0; i < 4; ++i) + { + _in[i] = Load4x2Floats(&in[y + i * 2][x - 3], + &in[y + i * 2 + 1][x - 3]); + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x88); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k4, 0xf8), 0x88); // insert back input + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x44); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k5, 0xf4), 0x44); // insert back input + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x22); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k6, 0xf2), 0x22); // insert back input + } + + for (ssize_t i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x11); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k7, 0xf1), 0x11); // insert back input + + StoreFloats(&out[y + i * 2][x - 3], _mm256_castps256_ps128(feedback[i])); + StoreFloats(&out[y + i * 2 + 1][x - 3], _mm256_extractf128_ps(feedback[i], 1)); + } + } + + for ( ; x != xEnd; x += xStep) + { + // todo: make these scalar loads to avoid buffer overflow + __m256 _in[4]; + for (ssize_t i = 0; i < 4; ++i) + { + _in[i] = Load4x2Floats(&in[y + i * 2][x - 3], + &in[y + i * 2 + 1][x - 3]); + } + + for (int i = 0; i < 4; ++i) + { + feedback[i] = _mm256_blend_ps(feedback[i], _in[i], 0x88); + feedback[i] = _mm256_blend_ps(feedback[i], _mm256_dp_ps(feedback[i], k4, 0xf8), 0x88); // insert back input + } + + for (ssize_t i = 0; i < 4; ++i) + { + __m256 temp = _mm256_shuffle_ps(feedback[i], feedback[i], _MM_SHUFFLE(0, 0, 0, 3)); + out[y + i * 2][x] = _mm_cvtss_f32(_mm256_castps256_ps128(temp)); + out[y + i * 2 + 1][x] = _mm_cvtss_f32(_mm256_extractf128_ps(temp, 1)); + } + + + for (int i = 0; i < 4; ++i) + feedback[i] = _mm256_shuffle_ps(feedback[i], feedback[i], _MM_SHUFFLE(2, 1, 0, 3)); + } + } + } +#endif + for (; y < height; ++y) + { + if (isForwardPass) + { + ssize_t x = xStart; + __m128 feedback0, + k0 = _mm256_castps256_ps128(vCoefficients[0]); + + if (isBorder && isForwardPass) + { + // xStart must be 0 + feedback0 = _mm_set1_ps(in[y][0]); + } + 
else + { + feedback0 = Load4Floats(&out[y][xStart - 3 * xStep]); + feedback0 = _mm_shuffle_ps(feedback0, feedback0, _MM_SHUFFLE(2, 1, 0, 0)); + } + // feedback0 = [garbage y-3 y-2 y-1] + for (; x != xEnd; x += xStep) + { + // todo: make these scalar loads to avoid buffer overflow + __m128 _in0 = _mm_set1_ps(in[y][x]); + + feedback0 = _mm_blend_ps(feedback0, _in0, 0x1); + feedback0 = _mm_blend_ps(feedback0, _mm_dp_ps(feedback0, k0, 0xf1), 0x1); // insert back input + + out[y][x] = _mm_cvtss_f32(feedback0); + feedback0 = _mm_shuffle_ps(feedback0, feedback0, _MM_SHUFFLE(0, 3, 2, 0)); + } + } + else + { + __m128 feedback0, + k4 = _mm256_castps256_ps128(vCoefficients[4]); + + ssize_t x = xStart; + if (isBorder && !isForwardPass) + { + // xstart must be width - 1 + float u[N + 1][channels]; //[x][y][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + u[i][0] = in[y][xStart + i * xStep]; + } + float backwardsInitialState[N][channels]; + calcTriggsSdikaInitialization(M, u, &borderValues[y * channels], &borderValues[y * channels], ExtractElement0(vCoefficients[0]), backwardsInitialState); + + float temp[N + 1]; + for (ssize_t i = 0; i < N; ++i) + { + temp[i] = backwardsInitialState[i][0]; + } + feedback0 = Load4Floats(temp); + + out[y][x] = backwardsInitialState[0][0]; + x += xStep; + if (x == xEnd) + continue; + } + else + { + feedback0 = Load4Floats(&out[y][xStart - xStep]); + } + + for (; x != xEnd; x += xStep) + { + // todo: make these scalar loads to avoid buffer overflow + __m128 _in0 = _mm_set1_ps(in[y][x]); + + feedback0 = _mm_blend_ps(feedback0, _in0, 0x8); + feedback0 = _mm_blend_ps(feedback0, _mm_dp_ps(feedback0, k4, 0xf8), 0x8); // insert back input + + __m128 temp = _mm_shuffle_ps(feedback0, feedback0, _MM_SHUFFLE(0, 0, 0, 3)); + out[y][x] = _mm_cvtss_f32(temp); + + feedback0 = _mm_shuffle_ps(feedback0, feedback0, _MM_SHUFFLE(2, 1, 0, 3)); + } + } + } + } +} + + +// does 1D IIR convolution on multiple rows (height) of data +// IntermediateType must be float or double +template +FORCE_INLINE void Convolve1DVerticalRef(SimpleImage out, + SimpleImage in, + IntermediateType *borderValues, // [y][color] + ssize_t yStart, ssize_t yEnd, ssize_t width, ssize_t height, + typename MyTraits::SIMDtype *vCoefficients, double M[N * N]) +{ + ssize_t yStep = isForwardPass ? 1 : -1; + + ssize_t x = 0; + do + { + IntermediateType prevOut[N]; + ssize_t y = yStart; + if (isBorder && !isForwardPass) + { + IntermediateType u[N + 1][1]; // u[0] = last forward filtered value, u[1] = 2nd last forward filtered value, ... 
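+                // Backward-pass border handling: u[] gathers the last N+1 outputs of the
+                // forward (causal) pass at this edge, and calcTriggsSdikaInitialization maps
+                // them, together with the stored border value, through the Triggs-Sdika matrix M
+                // to produce the initial output history (prevOut) for the reverse (anti-causal) pass.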
+ for (ssize_t i = 0; i < N + 1; ++i) + { + u[i][0] = in[yStart + i * yStep][x]; + } + IntermediateType backwardsInitialState[N][1]; + calcTriggsSdikaInitialization<1>(M, u, &borderValues[x], &borderValues[x], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = backwardsInitialState[i][0]; + + out[y][x] = clip_round_cast(prevOut[0]); + y += yStep; + if (y == yEnd) + goto nextIteration; + } + else if (isBorder && isForwardPass) + { + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = in[0][x]; + } + else + { + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = out[yStart - (i + 1) * yStep][x]; + } + + do + { + IntermediateType sum = prevOut[0] * ExtractElement0(vCoefficients[1]) + + prevOut[1] * ExtractElement0(vCoefficients[2]) + + prevOut[2] * ExtractElement0(vCoefficients[3]) + + in[y][x] * ExtractElement0(vCoefficients[0]); // add last for best accuracy since this terms tends to be the smallest + + + out[y][x] = clip_round_cast(sum); + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = sum; + y += yStep; + } while (y != yEnd); + nextIteration: + ++x; + } while (x < width); +} + + + +// input is always untransposed +// for reverse pass, input is output from forward pass +// for transposed output, in-place operation isn't possible +template +FORCE_INLINE void Convolve1DVertical(SimpleImage out, + SimpleImage in, + float *borderValues, + ssize_t yStart, ssize_t yEnd, ssize_t width, ssize_t height, + MyTraits::SIMDtype *vCoefficients, double M[N * N]) +{ +#if 0 + Convolve1DVerticalRef(out, + in, + borderValues, + yStart, yEnd, width, height, + vCoefficients, M); + return; +#endif + const ssize_t yStep = isForwardPass ? 1 : -1; + + const int SIMD_WIDTH = 8; + ssize_t x = 0; +#ifdef __AVX__ + for ( ; x <= width - SIMD_WIDTH; x += SIMD_WIDTH) + { + __m256 prevOut[N]; + ssize_t y = yStart; + if (isBorder && !isForwardPass) + { + float u[N + 1][SIMD_WIDTH]; //[x][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm256_storeu_ps(u[i], LoadFloats(&in[yStart + i * yStep][x])); + } + float backwardsInitialState[N][SIMD_WIDTH]; + calcTriggsSdikaInitialization(M, u, &borderValues[x], &borderValues[x], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = LoadFloats(backwardsInitialState[i]); + + StoreFloats(&out[y][x], prevOut[0]); + + y += yStep; + if (y == yEnd) + continue; + } + else if (isBorder && isForwardPass) + { + // yStart must be 0 + __m256 firstPixel = LoadFloats(&in[0][x]); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + { + prevOut[i] = LoadFloats(&out[yStart - (i + 1) * yStep][x]); + } + } + + __m256 vIn = LoadFloats(&in[yStart][x]); + + do + { + __m256 vSum = LoadFloats(&in[y][x]) * vCoefficients[0]; + + vSum = prevOut[0] * vCoefficients[1] + + prevOut[1] * vCoefficients[2] + + prevOut[2] * vCoefficients[3] + + vSum; + + StoreFloats(&out[y][x], vSum); + + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + y += yStep; + } while (isForwardPass ? 
(y < yEnd) : (y > yEnd)); + } +#endif + { + const int SIMD_WIDTH = 4; + for (; x < width; x += SIMD_WIDTH) + { + __m128 prevOut[N]; + ssize_t y = yStart; + if (isBorder && !isForwardPass) + { + float u[N + 1][SIMD_WIDTH]; //[x][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm_storeu_ps(u[i], Load4Floats(&in[yStart + i * yStep][x])); + } + float backwardsInitialState[N][SIMD_WIDTH]; + calcTriggsSdikaInitialization(M, u, &borderValues[x], &borderValues[x], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = Load4Floats(backwardsInitialState[i]); + + StoreFloats(&out[y][x], prevOut[0], width - x); + + y += yStep; + if (y == yEnd) + continue; + } + else if (isBorder && isForwardPass) + { + // yStart must be 0 + __m128 firstPixel = Load4Floats(&in[0][x]); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + { + prevOut[i] = Load4Floats(&out[yStart - (i + 1) * yStep][x]); + } + } + + __m128 vIn = Load4Floats(&in[yStart][x]); + + do + { + __m128 vSum = Load4Floats(&in[y][x]) * _mm256_castps256_ps128(vCoefficients[0]); + + vSum = prevOut[0] * _mm256_castps256_ps128(vCoefficients[1]) + + prevOut[1] * _mm256_castps256_ps128(vCoefficients[2]) + + prevOut[2] * _mm256_castps256_ps128(vCoefficients[3]) + + vSum; + + StoreFloats(&out[y][x], vSum, width - x); + + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + y += yStep; + } while (isForwardPass ? (y < yEnd) : (y > yEnd)); + } + } +} + +// input is always untransposed +// for reverse pass, input is output from forward pass +// for transposed output, in-place operation isn't possible +template +FORCE_INLINE void Convolve1DVertical(SimpleImage out, + SimpleImage in, + double *borderValues, + ssize_t yStart, ssize_t yEnd, ssize_t width, ssize_t height, + MyTraits::SIMDtype *vCoefficients, double M[N * N]) +{ +#if 0 + Convolve1DVerticalRef(out, + in, + borderValues, + yStart, yEnd, width, height, + vCoefficients, M); + return; +#endif + + const ssize_t yStep = isForwardPass ? 
1 : -1, + SIMD_WIDTH = 4; + ssize_t x = 0; +#ifdef __AVX__ + for ( ; x <= width - SIMD_WIDTH; x += SIMD_WIDTH) + { + __m256d prevOut[N]; + ssize_t y = yStart; + + if (isBorder && !isForwardPass) + { + // condition: yStart must be height - 1 + double u[N + 1][SIMD_WIDTH]; //[x][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm256_storeu_pd(u[i], Load4Doubles(&in[yStart + i * yStep][x])); + } + double backwardsInitialState[N][SIMD_WIDTH]; + calcTriggsSdikaInitialization(M, u, &borderValues[x], &borderValues[x], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = LoadDoubles(backwardsInitialState[i]); + + StoreDoubles(&out[y][x], prevOut[0]); + + y += yStep; + if (y == yEnd) + continue; + } + else if (isBorder && isForwardPass) + { + // condition: yStart must be 0 + __m256d firstPixel = Load4Doubles(&in[0][x]); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = Load4Doubles(&out[yStart - (i + 1) * yStep][x]); + } + + __m256d vIn = Load4Doubles(&in[yStart][x]); + + do + { + __m256d vSum = Load4Doubles(&in[y][x]) * vCoefficients[0]; + + vSum = prevOut[0] * vCoefficients[1] + + prevOut[1] * vCoefficients[2] + + prevOut[2] * vCoefficients[3] + + vSum; + + StoreDoubles(&out[y][x], vSum); + + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + y += yStep; + } while (y != yEnd); + } +#endif + { + const ssize_t SIMD_WIDTH = 2; + for (; x < width; x += SIMD_WIDTH) + { + __m128d prevOut[N]; + ssize_t y = yStart; + + if (isBorder && !isForwardPass) + { + // condition: yStart must be height - 1 + double u[N + 1][SIMD_WIDTH]; //[x][channels] + for (ssize_t i = 0; i < N + 1; ++i) + { + _mm_storeu_pd(u[i], Load2Doubles(&in[yStart + i * yStep][x])); + } + double backwardsInitialState[N][SIMD_WIDTH]; + calcTriggsSdikaInitialization(M, u, &borderValues[x], &borderValues[x], ExtractElement0(vCoefficients[0]), backwardsInitialState); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = Load2Doubles(backwardsInitialState[i]); + + StoreDoubles(&out[y][x], prevOut[0]); + + y += yStep; + if (y == yEnd) + continue; + } + else if (isBorder && isForwardPass) + { + // condition: yStart must be 0 + __m128d firstPixel = Load2Doubles(&in[0][x]); + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = firstPixel; + } + else + { + for (ssize_t i = 0; i < N; ++i) + prevOut[i] = Load2Doubles(&out[yStart - (i + 1) * yStep][x]); + } + + __m128d vIn = Load2Doubles(&in[yStart][x]); + + do + { + __m128d vSum = Load2Doubles(&in[y][x]) * _mm256_castpd256_pd128(vCoefficients[0]); + + vSum = prevOut[0] * _mm256_castpd256_pd128(vCoefficients[1]) + + prevOut[1] * _mm256_castpd256_pd128(vCoefficients[2]) + + prevOut[2] * _mm256_castpd256_pd128(vCoefficients[3]) + + vSum; + + StoreDoubles(&out[y][x], vSum); + + prevOut[2] = prevOut[1]; + prevOut[1] = prevOut[0]; + prevOut[0] = vSum; + y += yStep; + } while (y != yEnd); + } + } +} + +template +FORCE_INLINE void Copy2D(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height) +{ + ssize_t y = 0; + do + { + ssize_t x = 0; + __m256i v8n_0 = _mm256_setzero_si256(); + if (channels == 4) + { + #ifdef __AVX2__ + for (; x <= width - 2; x += 2) + { + __m256i vInt = _mm256_cvtps_epi32(_mm256_loadu_ps(&in[y][x * channels])); + __m256i vInt2 = _mm256_permute2x128_si256(vInt, vInt, 1); + + __m128i vRGBA = _mm256_castsi256_si128(_mm256_packus_epi16(_mm256_packus_epi32(vInt, vInt2), v8n_0)); + + if (transposeOut) + { + *(uint32_t *)&out[x][y * 
channels] = _mm_extract_epi32(vRGBA, 0); + *(uint32_t *)&out[x + 1][y * channels] = _mm_extract_epi32(vRGBA, 1); + } + else + { + _mm_storel_epi64((__m128i *)&out[y][x * channels], vRGBA); + } + } + #endif + while (x < width) + { + __m128 data = _mm_loadu_ps(&in[y][x * channels]); + if (transposeOut) + StoreFloats(&out[x][y * channels], data); + else + StoreFloats(&out[y][x * channels], data); + ++x; + } + } + else if (channels == 1) + { + for (; x <= width - 8; x += 8) + { + StoreFloats(&out[y][x], _mm256_loadu_ps(&in[y][x])); + } + if (x < width) + StoreFloats(&out[y][x], _mm256_loadu_ps(&in[y][x]), width - x); + } + ++y; + } while (y < height); +} + +template +FORCE_INLINE void Copy2D(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height) +{ + ssize_t y = 0; + do + { + ssize_t x = 0; + __m256i v8n_0 = _mm256_setzero_si256(); + if (channels == 4) + { + #ifdef __AVX2__ + for (; x <= width - 2; x += 2) + { + __m256i vInt = _mm256_cvtps_epi32(_mm256_loadu_ps(&in[y][x * channels])); + __m256i vInt2 = _mm256_permute2x128_si256(vInt, vInt, 1); + + __m128i vRGBA = _mm256_castsi256_si128(_mm256_packus_epi32(vInt, vInt2)); + + if (transposeOut) + { + _mm_storel_epi64((__m128i *)&out[x][y * channels], vRGBA); + _mm_storel_epi64((__m128i *)&out[x + 1][y * channels], _mm_shuffle_epi32(vRGBA, _MM_SHUFFLE(0, 0, 3, 2))); + } + else + { + _mm_storeu_si128((__m128i *)&out[y][x * channels], vRGBA); + } + } + #endif + while (x < width) + { + __m128 data = _mm_loadu_ps(&in[y][x * channels]); + if (transposeOut) + StoreFloats(&out[x][y * channels], data); + else + StoreFloats(&out[y][x * channels], data); + ++x; + } + } + else if (channels == 1) + { + for (; x <= width - 8; x += 8) + { + StoreFloats(&out[y][x], _mm256_loadu_ps(&in[y][x])); + } + if (x < width) + StoreFloats(&out[y][x], _mm256_loadu_ps(&in[y][x]), width - x); + } + ++y; + } while (y < height); +} + +template +FORCE_INLINE void Copy2D(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height) +{ + ssize_t y = 0; + do + { + ssize_t x = 0; + __m256i v8n_0 = _mm256_setzero_si256(); + if (channels == 4) + { + #ifdef __AVX2__ + for ( ; x < width; x++) + { + __m128i vInt = _mm256_cvtpd_epi32(_mm256_loadu_pd(&in[y][x * channels])); + __m128i vRGBA = _mm_packus_epi32(vInt, vInt); + + if (transposeOut) + { + _mm_storel_epi64((__m128i *)&out[x][y * channels], vRGBA); + } + else + { + _mm_storel_epi64((__m128i *)&out[y][x * channels], vRGBA); + } + } + #else + throw 0; + #endif + } + else if (channels == 1) + { + for (; x <= width - 4; x += 4) + { + StoreDoubles(&out[y][x], _mm256_loadu_pd(&in[y][x])); + } + if (x < width) + { + StoreDoubles(&out[y][x], _mm256_loadu_pd(&in[y][x]), width - x); + } + } + ++y; + } while (y < height); +} + +template +FORCE_INLINE void Copy2D(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height) +{ + ssize_t y = 0; + do + { + if (channels == 4) + { + ssize_t x = 0; + do + { + #ifdef __AVX__ + __m128 v4f_data = _mm256_cvtpd_ps(_mm256_loadu_pd(&in[y][x * channels])); + #else + __m128 v4f_data = _mm_shuffle_ps(_mm_cvtpd_ps(_mm_loadu_pd(&in[y][x * channels])), + _mm_cvtpd_ps(_mm_loadu_pd(&in[y][x * channels + 2])), _MM_SHUFFLE(1, 0, 1, 0)); + #endif + if (transposeOut) + _mm_store_ps(&out[x][y * channels], v4f_data); + else + _mm_store_ps(&out[y][x * channels], v4f_data); + ++x; + } while (x < width); + } + else + { + // 1 channel + ssize_t x; + for (x = 0; x <= width - 4; x += 4) + StoreDoubles(&out[y][x], _mm256_loadu_pd(&in[y][x])); + if (x < width) + StoreDoubles(&out[y][x], _mm256_loadu_pd(&in[y][x]), 
width - x); + } + ++y; + } while (y < height); +} + +template +FORCE_INLINE void Copy2D(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height) +{ + ssize_t y = 0; + do + { + if (channels == 4) + { +#ifdef __AVX__ + ssize_t x = 0; + do + { + __m256d _in = _mm256_load_pd(&in[y][x * channels]); + __m128i inI32 = _mm256_cvtpd_epi32(_in), + inU16 = _mm_packus_epi32(inI32, inI32), + inU8 = _mm_packus_epi16(inU16, inU16); + if (transposeOut) + *(int32_t *)&out[x][y * channels] = _mm_cvtsi128_si32(inU8); + else + *(int32_t *)&out[y][x * channels] = _mm_cvtsi128_si32(inU8); + ++x; + } while (x < width); +#else + throw 0; +#endif + } + else + { + // 1 channel + ssize_t x; + for (x = 0; x <= width - 4; x += 4) + StoreDoubles(&out[y][x], _mm256_load_pd(&in[y][x * channels])); + if (x < width) + StoreDoubles(&out[y][x], _mm256_load_pd(&in[y][x * channels]), width - x); + } + ++y; + } while (y < height); +} + +#if 0 // comment this function out to ensure everything is vectorized +template +FORCE_INLINE void Copy2D(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height) +{ + ssize_t y = 0; + do + { + ssize_t x = 0; + do + { + ssize_t c = 0; + do + { + if (transposeOut) + out[x][y * channels + c] = clip_round_cast(in[y][x * channels + c]); + else + out[y][x * channels + c] = clip_round_cast(in[y][x * channels + c]); + ++c; + } while (c < channels); + ++x; + } while (x < width); + ++y; + } while (y < height); +} +#endif + +template +void StoreFloatsTransposed(SimpleImage out, __m128 x) +{ + out[0][0] = _mm_cvtss_f32(x); + out[1][0] = _mm_cvtss_f32(_mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 1))); + out[2][0] = _mm_cvtss_f32(_mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 2))); + out[3][0] = _mm_cvtss_f32(_mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 3))); +} + +const size_t GUARANTEED_ALIGNMENT = 16; + +#define ALIGNED_ALLOCA(size, alignment) RoundUp((size_t)alloca(size + (((alignment) / GUARANTEED_ALIGNMENT) - 1) * GUARANTEED_ALIGNMENT), alignment) + +// input & output are color interleaved +template +void ConvolveHorizontal(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height, float sigmaX, bool canOverwriteInput = false) +{ + const bool convertOutput = typeid(OutType) != typeid(IntermediateType); + typedef typename MyTraits::SIMDtype SIMDtype; + + double bf[N]; + double M[N*N]; // matrix used for initialization procedure (has to be double) + double b[N + 1]; + + calcFilter(sigmaX, bf); + + for (size_t i = 0; i forwardFilteredTemp; // TODO: can directly output to output buffer if !transposeOut & output type is float + SimpleImage forwardFiltered; + if (!convertOutput && !transposeOut) + { + forwardFiltered = SimpleImage((IntermediateType *)out.buffer, out.pitch); + } + /* + else if (canOverwriteInput && typeid(InType) == typeid(IntermediateType)) + { + forwardFiltered = SimpleImage((IntermediateType *)in.buffer, in.pitch); + }*/ + else + { + forwardFilteredTemp.Resize(width * channels, Y_BLOCK_SIZE); + forwardFiltered = forwardFilteredTemp; + } + + IntermediateType *borderValues = (IntermediateType *)alloca(channels * Y_BLOCK_SIZE * sizeof(IntermediateType)); + + #pragma omp for + for (ssize_t y0 = 0; y0 < height; y0 += Y_BLOCK_SIZE) + { + ssize_t x = 0; + ssize_t yBlockSize = min(height - y0, Y_BLOCK_SIZE); + + ssize_t i = 0; + do + { + ssize_t color = 0; + do + { + borderValues[i * channels + color] = in[y0 + i][(width - 1) * channels + color]; + ++color; + } while (color < channels); + ++i; + } while (i < yBlockSize); + + ssize_t xBlockSize = min(max(X_BLOCK_SIZE, ssize_t(N)), width); 
// try to process at least X_BLOCK_SIZE or else later, data won't be SIMD aligned + // convolve pixels[0:FILTER_SIZE - 1] + Convolve1DHorizontal(forwardFiltered, + in.SubImage(0, y0), + (IntermediateType *)NULL, + x, x + xBlockSize, width, yBlockSize, + vCoefficients, + M); + + x += xBlockSize; + while (x < width) + { + xBlockSize = min(width - x, X_BLOCK_SIZE); + + Convolve1DHorizontal(forwardFiltered, + in.SubImage(0, y0), + (IntermediateType *)NULL, + x, x + xBlockSize, width, yBlockSize, + vCoefficients, + M); + x += xBlockSize; + } + + //--------------- backward pass-------------------------- + SimpleImage floatOut; + // if output type is fixed point, we still compute an intermediate result as float for better precision + if (convertOutput) + { + floatOut = forwardFiltered; + } + else + { + floatOut = SimpleImage((IntermediateType *)&out[transposeOut ? 0 : y0][(transposeOut ? y0 : 0) * channels], out.pitch); + } + x = width - 1; + + ssize_t lastAligned = RoundDown(width, X_BLOCK_SIZE); + // todo: check is this really vector aligned? + xBlockSize = min(max(width - lastAligned, ssize_t(N)), width); // try to process more than N pixels so that later, data is SIMD aligned + + // in-place operation (use forwardFiltered as both input & output) is possible due to internal register buffering + if (transposeOut && !convertOutput) + Convolve1DHorizontal(floatOut, + forwardFiltered, + borderValues, + x, x - xBlockSize, width, yBlockSize, + vCoefficients, + M); + else + Convolve1DHorizontal(floatOut, + forwardFiltered, + borderValues, + x, x - xBlockSize, width, yBlockSize, + vCoefficients, + M); + + if (convertOutput) + { + ssize_t outCornerX = x + 1 - xBlockSize; + Copy2D(out.SubImage((transposeOut ? y0 : outCornerX) * channels, transposeOut ? outCornerX : y0), floatOut.SubImage(outCornerX * channels, 0), xBlockSize, yBlockSize); + } + x -= xBlockSize; + while (x >= 0) + { + xBlockSize = min(X_BLOCK_SIZE, x + 1); + + if (transposeOut && !convertOutput) + Convolve1DHorizontal(floatOut, + forwardFiltered, + borderValues, + x, x - xBlockSize, width, yBlockSize, + vCoefficients, + M); + else + Convolve1DHorizontal(floatOut, + forwardFiltered, + borderValues, + x, x - xBlockSize, width, yBlockSize, + vCoefficients, + M); + + if (convertOutput) + { + ssize_t outCornerX = x + 1 - xBlockSize; + Copy2D(out.SubImage((transposeOut ? y0 : outCornerX) * channels, transposeOut ? 
outCornerX : y0), floatOut.SubImage(outCornerX * channels, 0), xBlockSize, yBlockSize); + } + x -= xBlockSize; + } + } + } +} + +const float MAX_SIZE_FOR_SINGLE_PRECISION = 30.0f; // switch to double if sigma > threshold or else round off error becomes so big that the output barely changes and you see annoying mach bands + +template +void ConvolveHorizontal(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height, float sigmaX, bool canOverwriteInput = false) +{ + if (sigmaX > MAX_SIZE_FOR_SINGLE_PRECISION) + ConvolveHorizontal(out, in, width, height, sigmaX, canOverwriteInput); + else + ConvolveHorizontal(out, in, width, height, sigmaX, canOverwriteInput); +} + + +// handles blocking +// input & output are color interleaved +template +void ConvolveVertical(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height, double sigmaY) +{ + const bool convertOutput = typeid(OutType) != typeid(IntermediateType); + + const ssize_t Y_BLOCK_SIZE = 8, + X_BLOCK_SIZE = 40; // must be multiple of SIMD width or else say goodbye to throughput + + typedef typename MyTraits::SIMDtype SIMDtype; + + double bf[N]; + double M[N*N]; // matrix used for initialization procedure (has to be double) + double b[N + 1]; + + calcFilter(sigmaY, bf); + + for (size_t i = 0; i forwardFiltered; // TODO: can directly output to output buffer if output type is float + forwardFiltered.Resize(X_BLOCK_SIZE, height); + + IntermediateType *borderValues = (IntermediateType *)alloca(X_BLOCK_SIZE * sizeof(IntermediateType)); + + #pragma omp for + for (ssize_t x0 = 0; x0 < width; x0 += X_BLOCK_SIZE) + { + ssize_t y = 0; + ssize_t xBlockSize = min(width - x0, X_BLOCK_SIZE); + + ssize_t i = 0; + do + { + borderValues[i] = in[height - 1][x0 + i]; + ++i; + } while (i < xBlockSize); + + ssize_t yBlockSize = min(ssize_t(N), height); + // convolve pixels[0:filterSize - 1] + Convolve1DVertical(forwardFiltered, + in.SubImage(x0, 0), + (IntermediateType *)NULL, + y, y + yBlockSize, xBlockSize, height, + vCoefficients, + M); + + y += yBlockSize; + while (y < height) + { + yBlockSize = min(height - y, Y_BLOCK_SIZE); + + Convolve1DVertical(forwardFiltered, + in.SubImage(x0, 0), + (IntermediateType *)NULL, + y, y + yBlockSize, xBlockSize, height, + vCoefficients, + M); + y += yBlockSize; + } + + //--------------- backward pass-------------------------- + SimpleImage floatOut; + // if output type is fixed point, we still compute an intermediate result as float for better precision + if (convertOutput) + { + floatOut = forwardFiltered; + } + else + { + floatOut = SimpleImage((IntermediateType *)&out[0][x0], out.pitch); + } + y = height - 1; + yBlockSize = min(ssize_t(N), height); + + // in-place operation (use forwardFiltered as both input & output) is possible due to internal register buffering + Convolve1DVertical(floatOut, + forwardFiltered, + borderValues, + y, y - yBlockSize, xBlockSize, height, + vCoefficients, + M); + + if (convertOutput) + { + ssize_t outCornerY = y + 1 - yBlockSize; + Copy2D(out.SubImage(x0, outCornerY), floatOut.SubImage(0, outCornerY), xBlockSize, yBlockSize); + } + y -= yBlockSize; + while (y >= 0) + { + yBlockSize = min(Y_BLOCK_SIZE, y + 1); + + Convolve1DVertical(floatOut, + forwardFiltered, + borderValues, + y, y - yBlockSize, xBlockSize, y, + vCoefficients, + M); + + if (convertOutput) + { + ssize_t outCornerY = y + 1 - yBlockSize; + Copy2D(out.SubImage(x0, outCornerY), floatOut.SubImage(0, outCornerY), xBlockSize, yBlockSize); + } + y -= yBlockSize; + } + } + } +} + +template +void 
ConvolveVertical(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height, float sigmaY)
+{
+    if (sigmaY > MAX_SIZE_FOR_SINGLE_PRECISION)
+        ConvolveVertical(out, in, width, height, sigmaY);
+    else
+        ConvolveVertical(out, in, width, height, sigmaY);
+}
+
+template
+void SaveImage(const char *path, SimpleImage in, int width, int height, int channels, float scale)
+{
+    cairo_surface_t *scaled = cairo_image_surface_create(channels == 1 ? CAIRO_FORMAT_A8 : CAIRO_FORMAT_ARGB32, width, height);
+
+    uint8_t *_scaled = cairo_image_surface_get_data(scaled);
+    ssize_t scaledPitch = cairo_image_surface_get_stride(scaled);
+    for (int y = 0; y < height; ++y)
+    {
+        for (int x = 0; x < width * channels; ++x)
+        {
+            _scaled[y * scaledPitch + x] = min(in[y][x] * scale, 255.0f);
+        }
+    }
+    cairo_surface_mark_dirty(scaled);
+    if (cairo_surface_write_to_png(scaled, path) != CAIRO_STATUS_SUCCESS)
+        throw 0;
+}
+
+// 2D
+template
+void Convolve(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height, float sigmaX, float sigmaY)
+{
+    typedef uint16_t HorizontalFilteredType;
+    AlignedImage horizontalFiltered;
+
+    const bool DO_TIMING = false;
+    double t0;
+    if (DO_TIMING)
+        t0 = Clock();
+
+    const bool TRANSPOSE = channels != 1; // tells the 1st and 2nd passes to transpose their output
+
+    if (TRANSPOSE)
+    {
+        horizontalFiltered.Resize(height * channels, width);
+    }
+    else
+    {
+        horizontalFiltered.Resize(width * channels, height);
+    }
+    ConvolveHorizontal(horizontalFiltered, in, width, height, sigmaX);
+
+#if 0
+    // save intermediate image
+    float scale;
+    if (typeid(InType) == typeid(uint8_t))
+        scale = 1.0f;
+    else if (typeid(InType) == typeid(uint16_t))
+        scale = 1.0f;
+    else
+        scale = 1.0f;
+    SaveImage("horizontal_filtered.png", horizontalFiltered, TRANSPOSE ? height : width, TRANSPOSE ? 
width : height, channels, scale); +#endif + + if (DO_TIMING) + cout << "Thoriz=" << Clock() - t0 << endl; + + //--------------------------------------------------- + + if (DO_TIMING) + t0 = Clock(); + if (TRANSPOSE) + { + ConvolveHorizontal(out, horizontalFiltered, height, width, sigmaY, true); + } + else + { + ConvolveVertical(out, horizontalFiltered, width * channels, height, sigmaY); + } + if (DO_TIMING) + cout << "Tvert=" << Clock() - t0 << endl; +} + +// in-place (out = in) operation not allowed +template +void ConvolveHorizontalFIR(SimpleImage out, SimpleImage in, + ssize_t width, ssize_t height, + ssize_t xStart, ssize_t xEnd, + MyTraits::SIMDtype *vFilter, int filterSize) // filterSize assumed to be odd +{ + if (channels == 4) + { + ssize_t y = 0; + do + { + ssize_t x = xStart; + #ifdef __AVX__ + const ssize_t SIMD_WIDTH = 8, + PIXELS_PER_ITERATION = SIMD_WIDTH / channels; + __m256 vSum, + leftBorderValue, rightBorderValue; + if (onBorder) + { + __m128 temp = Load4Floats(&in[y][0 * channels]); + leftBorderValue = Make256(temp, temp); + temp = Load4Floats(&in[y][(width - 1) * channels]); + rightBorderValue = Make256(temp, temp); + } + goto middle; + + do + { + StoreFloats(&out[y][(x - PIXELS_PER_ITERATION) * channels], vSum); + middle: + if (symmetric) + { + vSum = vFilter[0] * LoadFloats(&in[y][x * channels]); + for (ssize_t i = 1; i <= filterSize; ++i) + { + __m256 filter = vFilter[i]; + ssize_t srcX = x - i; + if (onBorder) + srcX = max(-(PIXELS_PER_ITERATION - 1), srcX); // hack: do this for now until LoadFloats is modified to support partial loads + + __m256 leftNeighbor = LoadFloats(&in[y][srcX * channels]); + srcX = x + i; + if (onBorder) + srcX = min(width - 1, srcX); // hack: do this for now until LoadFloats is modified to support partial loads + + __m256 rightNeighbor = LoadFloats(&in[y][srcX * channels]); + if (onBorder) + { + __m256i notPastEnd = PartialVectorMask32(min(PIXELS_PER_ITERATION, width - i - x) * channels * sizeof(float)), + beforeBeginning = PartialVectorMask32(min(PIXELS_PER_ITERATION, max(ssize_t(0), i - x)) * channels * sizeof(float)); + leftNeighbor = _mm256_blendv_ps(leftNeighbor, leftBorderValue, _mm256_castsi256_ps(beforeBeginning)); + rightNeighbor = _mm256_blendv_ps(rightBorderValue, rightNeighbor, _mm256_castsi256_ps(notPastEnd)); + } + vSum = vSum + filter * (leftNeighbor + rightNeighbor); + } + } + else + { + vSum = _mm256_setzero_ps(); + ssize_t i = 0; + // the smaller & simpler machine code for do-while probably outweighs the cost of the extra add + do + { + ssize_t srcX = x - filterSize / 2 + i; + // todo: border not handled + vSum = vSum + vFilter[i] * LoadFloats(&in[y][srcX * channels]); + ++i; + } while (i < filterSize); + } + x += PIXELS_PER_ITERATION; + } while (x < xEnd); + StoreFloats(&out[y][(x - PIXELS_PER_ITERATION) * channels], vSum, (xEnd - (x - PIXELS_PER_ITERATION)) * channels); + #else + do + { + __m128 vSum; + if (symmetric) + { + vSum = vFilter[0] * Load4Floats(&in[y][x * channels]); + for (ssize_t i = 1; i <= filterSize; ++i) + { + ssize_t srcX = x - i; + if (onBorder) + srcX = max(ssize_t(0), srcX); + + __m128 leftNeighbor = Load4Floats(&in[y][srcX * channels]); + + srcX = x + i; + if (onBorder) + srcX = min(width - 1, srcX); + __m128 rightNeighbor = Load4Floats(&in[y][srcX * channels]); + + vSum = vSum + vFilter[i] * (leftNeighbor + rightNeighbor); + } + } + else + { + vSum = _mm_setzero_ps(); + ssize_t i = 0; + do + { + ssize_t srcX = x - filterSize / 2 + i; + if (onBorder) + srcX = min(width - 1, max(ssize_t(0), srcX)); + 
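+                        // srcX is now clamped to [0, width - 1], so taps that fall outside the row
+                        // replicate the edge pixel (clamp-to-edge boundary handling)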
+ vSum = vSum + vFilter[i] * Load4Floats(&in[y][srcX * channels]); + ++i; + } while (i < filterSize); + } + StoreFloats(&out[y][x * channels], vSum); + ++x; + } while (x < xEnd); + + #endif + ++y; + } while (y < height); + } + else + { + // 1 channel + // todo: can merge with 4 channel? + ssize_t y = 0; + do + { + ssize_t x = xStart; + #ifdef __AVX__ + const ssize_t SIMD_WIDTH = 8; + + __m256 leftBorderValue, rightBorderValue; + if (onBorder) + { + leftBorderValue = _mm256_set1_ps(in[y][0 * channels]); + rightBorderValue = _mm256_set1_ps(in[y][(width - 1) * channels]); + } + __m256 vSum; + // trashed performance from basic block reordering warning + goto middle2; + do + { + // write out values from previous iteration + StoreFloats(&out[y][(x - SIMD_WIDTH) * channels], vSum); + middle2: + if (symmetric) + { + vSum = vFilter[0] * LoadFloats(&in[y][x * channels]); + for (ssize_t i = 1; i <= filterSize; ++i) + { + __m256 filter = vFilter[i]; + + ssize_t srcX = x - i; + if (onBorder) + srcX = max(-(SIMD_WIDTH - 1), srcX); // hack: do this for now until LoadFloats is modified to support partial loads + + __m256 leftNeighbor = LoadFloats(&in[y][srcX * channels]); + srcX = x + i; + if (onBorder) + srcX = min(width - 1, srcX); // hack: do this for now until LoadFloats is modified to support partial loads + __m256 rightNeighbor = LoadFloats(&in[y][srcX * channels]); + + if (onBorder) + { + __m256i notPastEnd = PartialVectorMask32(min(SIMD_WIDTH, width - i - x) * sizeof(float)), + beforeBeginning = PartialVectorMask32(min(SIMD_WIDTH, max(ssize_t(0), i - x)) * sizeof(float)); + leftNeighbor = _mm256_blendv_ps(leftNeighbor, leftBorderValue, _mm256_castsi256_ps(beforeBeginning)); + rightNeighbor = _mm256_blendv_ps(rightBorderValue, rightNeighbor, _mm256_castsi256_ps(notPastEnd)); + } + vSum = vSum + filter * (leftNeighbor + rightNeighbor); + } + } + else + { + vSum = _mm256_setzero_ps(); + ssize_t i = 0; + // the smaller & simpler machine code for do-while probably outweighs the cost of the extra add + do + { + ssize_t srcX = x - filterSize / 2 + i; + // todo: border not handled + vSum = vSum + vFilter[i] * LoadFloats(&in[y][srcX * channels]); + ++i; + } while (i < filterSize); + } + x += SIMD_WIDTH; + } while (x < xEnd); + StoreFloats(&out[y][(x - SIMD_WIDTH) * channels], vSum, xEnd - (x - SIMD_WIDTH)); + #else + // todo: SSE only + throw 0; + #endif + ++y; + } while (y < height); + } +} + +__m128i LoadAndScaleToInt16(__m128i &out, uint8_t *x) +{ + // convert from [0-255] to [0-16383] + // leave 1 spare bit so that 2 values can be added without overflow for symmetric filters + out = _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)x)), 6); + return out; +} + +__m128i LoadAndScaleToInt16(__m128i &out, int16_t *x) +{ + out = _mm_loadu_si128((__m128i *)x); + return out; +} + +__m256i LoadAndScaleToInt16(__m256i &out, uint8_t *x) +{ + // convert from [0-255] to [0-16383] + // leave 1 spare bit so that 2 values can be added without overflow for symmetric filters + out = _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)x)), 6); + return out; +} + +__m256i LoadAndScaleToInt16(__m256i &out, int16_t *x) +{ + out = _mm256_loadu_si256((__m256i *)x); + return out; +} + + +template +void ScaleAndStoreInt16(uint8_t *out, __m128i x, ssize_t n = 8) +{ + __m128i i16 = _mm_srai_epi16(_mm_adds_epi16(x, _mm_set1_epi16(32)), 6), + u8 = _mm_packus_epi16(i16, i16); + if (partial) + { +#ifdef _MSC_VER + _mm_maskmoveu_si128(u8, PartialVectorMask8(n), (char *)out); +#else + 
_mm_maskmove_si64(_mm_movepi64_pi64(u8), PartialVectorMask8(n), (char *)out); +#endif + } + else + _mm_storel_epi64((__m128i *)out, u8); +} + +template +void ScaleAndStoreInt16(uint8_t *out, __m256i x, ssize_t n = 16) +{ + __m256i i16 = _mm256_srai_epi16(_mm256_adds_epi16(x, _mm256_set1_epi16(32)), 6); + __m128i u8 = _mm256_castsi256_si128(_mm256_packus_epi16(i16, _mm256_permute2f128_si256(i16, i16, 1))); + if (partial) + _mm_maskmoveu_si128(u8, PartialVectorMask(n), (char *)out); + else + _mm_storeu_si128((__m128i *)out, u8); +} + +template +void ScaleAndStoreInt16(int16_t *out, __m128i i16, ssize_t n = 8) +{ + if (partial) + _mm_maskmoveu_si128(i16, PartialVectorMask(n * sizeof(int16_t)), (char *)out); + else + _mm_storeu_si128((__m128i *)out, i16); +} + +template +void ScaleAndStoreInt16(int16_t *out, __m256i i16, ssize_t n = 16) +{ + if (partial) + { + _mm_maskmoveu_si128(_mm256_castsi256_si128(i16), PartialVectorMask(min(ssize_t(8), n) * sizeof(int16_t)), (char *)out); + _mm_maskmoveu_si128(_mm256_extractf128_si256(i16, 1), PartialVectorMask(max(ssize_t(0), n - 8) * sizeof(int16_t)), (char *)&out[8]); + } + else + _mm256_storeu_si256((__m256i *)out, i16); +} + +// in-place (out = in) operation not allowed +template +void ConvolveHorizontalFIR(SimpleImage out, SimpleImage in, + ssize_t width, ssize_t height, + ssize_t xStart, ssize_t xEnd, + MyTraits::SIMDtype *vFilter, int filterSize) +{ + if (channels == 4) + { + ssize_t y = 0; + do + { + int16_t *convertedIn; + if (typeid(InType) == typeid(int16_t)) + { + convertedIn = (int16_t *)&in[y][0]; + } + else + { + convertedIn = (int16_t *)ALIGNED_ALLOCA(RoundUp(width * channels, ssize_t(sizeof(__m256i) / sizeof(int16_t))) * sizeof(int16_t), sizeof(__m256i)); + for (ssize_t x = 0; x < width * channels; x += 16) + { + __m128i u8 = _mm_loadu_si128((__m128i *)&in[y][x]); + __m256i i16 = _mm256_slli_epi16(_mm256_cvtepu8_epi16(u8), 6); + _mm256_store_si256((__m256i *)&convertedIn[x], i16); + } + } + ssize_t x = xStart; + const ssize_t SIMD_WIDTH = 16, + PIXELS_PER_ITERATION = SIMD_WIDTH / channels; + __m256i vSum; + __m256i leftBorderValue, rightBorderValue; + if (onBorder) + { + __m128i temp = _mm_set1_epi64x(*(int64_t *)&convertedIn[0 * channels]); + leftBorderValue = Make256(temp, temp); + temp = _mm_set1_epi64x(*(int64_t *)&convertedIn[(width - 1) * channels]); + rightBorderValue = Make256(temp, temp); + } + goto middle2; + do + { + ScaleAndStoreInt16(&out[y][(x - PIXELS_PER_ITERATION) * channels], vSum); + middle2: + if (symmetric) + { + __m256i center = _mm256_loadu_si256((__m256i *)&convertedIn[x * channels]); + vSum = _mm256_mulhrs_epi16(vFilter[0], center); + for (ssize_t i = 1; i <= filterSize; ++i) + { + __m256i filter = vFilter[i]; + + ssize_t srcX = x - i; + if (onBorder) + srcX = max(-(PIXELS_PER_ITERATION - 1), srcX); // todo: use this for now until LoadAndScaleToInt16() supports partial loads + + __m256i leftNeighbor, rightNeighbor; + leftNeighbor = _mm256_loadu_si256((__m256i *)&convertedIn[srcX * channels]); + + srcX = x + i; + if (onBorder) + srcX = min(width - 1, srcX); + rightNeighbor = _mm256_loadu_si256((__m256i *)&convertedIn[srcX * channels]); + + if (onBorder) + { + __m256i leftMask = PartialVectorMask32(min(PIXELS_PER_ITERATION, max(ssize_t(0), i - x)) * channels * sizeof(int16_t)), + rightMask = PartialVectorMask32(min(PIXELS_PER_ITERATION, width - (x + i)) * channels * sizeof(int16_t)); + leftNeighbor = _mm256_blendv_epi8(leftNeighbor, leftBorderValue, leftMask); + rightNeighbor = _mm256_blendv_epi8(rightBorderValue, 
rightNeighbor, rightMask); + } + vSum = _mm256_adds_epi16(vSum, _mm256_mulhrs_epi16(filter, _mm256_adds_epi16(leftNeighbor, rightNeighbor))); + } + } + else + { + throw 0; + } + x += PIXELS_PER_ITERATION; + } while (x < xEnd); + ScaleAndStoreInt16(&out[y][(x - PIXELS_PER_ITERATION) * channels], vSum, (xEnd - (x - PIXELS_PER_ITERATION)) * channels); + ++y; + } while (y < height); + } + else + { + #ifdef __GNUC__ + const static void *labels[] = + { + NULL, + &&remainder1, + &&remainder2, + &&remainder3, + &&remainder4, + + &&remainder5, + &&remainder6, + &&remainder7, + &&remainder8 + }; + #endif + // 1 channel + // todo: can merge with 4 channel? + ssize_t y = 0; + do + { + int16_t *convertedIn; + if (typeid(InType) == typeid(int16_t)) + { + convertedIn = (int16_t *)&in[y][0]; + } + else + { + convertedIn = (int16_t *)ALIGNED_ALLOCA(RoundUp(width, ssize_t(sizeof(__m256i) / sizeof(int16_t))) * sizeof(int16_t), sizeof(__m256i)); + for (ssize_t x = 0; x < width; x += 16) + { + __m128i u8 = _mm_loadu_si128((__m128i *)&in[y][x]); + __m256i i16 = _mm256_slli_epi16(_mm256_cvtepu8_epi16(u8), 6); + _mm256_store_si256((__m256i *)&convertedIn[x], i16); + } + } + ssize_t x = xStart; + const ssize_t SIMD_WIDTH = 8; + __m128i vSum; + __m128i leftBorderValue, rightBorderValue; + if (onBorder) + { + leftBorderValue = _mm_set1_epi16(convertedIn[0 * channels]); + rightBorderValue = _mm_set1_epi16(convertedIn[(width - 1) * channels]); + } + goto middle; + do + { + ScaleAndStoreInt16(&out[y][x - SIMD_WIDTH], vSum); + middle: + if (symmetric) + { + #ifdef __GNUC__ + // up to 1.2x faster by using palignr instead of unaligned loads! + // the greatest difficulty with using palignr is that the offset must be a compile time constant + // solution? Duff's device + __m128i leftHalf[2], + rightHalf[2], + center = _mm_loadu_si128((__m128i *)&convertedIn[x]); + + if (onBorder) + { + __m128i mask = PartialVectorMask(min(SIMD_WIDTH, width - x) * sizeof(int16_t)); + center = _mm_blendv_epi8(rightBorderValue, center, mask); + } + rightHalf[0] = leftHalf[1] = center; + + vSum = _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[0]), rightHalf[0]); + + ssize_t base = 0; + while (base < filterSize) + { + leftHalf[0] = rightHalf[0]; + rightHalf[1] = leftHalf[1]; + rightHalf[0] = _mm_loadu_si128((__m128i *)&convertedIn[x + base + 8]); + leftHalf[1] = _mm_loadu_si128((__m128i *)&convertedIn[x - base - 8]); + + if (onBorder) + { + __m128i leftMask = PartialVectorMask(min(SIMD_WIDTH, max(ssize_t(0), (base + 8) - x)) * sizeof(int16_t)), + rightMask = PartialVectorMask(min(SIMD_WIDTH, width - (x + base + 8)) * sizeof(int16_t)); + leftHalf[1] = _mm_blendv_epi8(leftHalf[1], leftBorderValue, leftMask); + rightHalf[0] = _mm_blendv_epi8(rightBorderValue, rightHalf[0], rightMask); + } + + goto *labels[min(ssize_t(8), filterSize - base)]; + __m128i v, v2; + remainder8: + v = rightHalf[0]; // same as palignr(right, left, 16) + v2 = leftHalf[1]; + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 8]), _mm_adds_epi16(v, v2))); + remainder7: + v = _mm_alignr_epi8(rightHalf[0], leftHalf[0], 14); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 2); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 7]), _mm_adds_epi16(v, v2))); + remainder6: + v = _mm_alignr_epi8(rightHalf[0], leftHalf[0], 12); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 4); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 6]), _mm_adds_epi16(v, v2))); + remainder5: + v = 
_mm_alignr_epi8(rightHalf[0], leftHalf[0], 10); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 6); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 5]), _mm_adds_epi16(v, v2))); + remainder4: + v = _mm_alignr_epi8(rightHalf[0], leftHalf[0], 8); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 8); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 4]), _mm_adds_epi16(v, v2))); + remainder3: + v = _mm_alignr_epi8(rightHalf[0], leftHalf[0], 6); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 10); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 3]), _mm_adds_epi16(v, v2))); + remainder2: + v = _mm_alignr_epi8(rightHalf[0], leftHalf[0], 4); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 12); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 2]), _mm_adds_epi16(v, v2))); + remainder1: + v = _mm_alignr_epi8(rightHalf[0], leftHalf[0], 2); + v2 = _mm_alignr_epi8(rightHalf[1], leftHalf[1], 14); + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[base + 1]), _mm_adds_epi16(v, v2))); + base += 8; + } + #else + __m128i center; + vSum = _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[0]), LoadAndScaleToInt16(center, &convertedIn[x * channels])); + for (ssize_t i = 1; i <= filterSize; ++i) + { + __m128i filter = _mm256_castsi256_si128(vFilter[i]); + + ssize_t srcX = x - i; + if (onBorder) + srcX = max(-(SIMD_WIDTH - 1), srcX); // todo: use this for now until LoadAndScaleToInt16() supports partial loads + + __m128i leftNeighbor = _mm_loadu_si128((__m128i *)&convertedIn[srcX * channels]); + + srcX = x + i; + if (onBorder) + srcX = min(width - 1, srcX); + + __m128i rightNeighbor = _mm_loadu_si128((__m128i *)&convertedIn[srcX * channels]); + + if (onBorder) + { + __m128i leftMask = PartialVectorMask(min(SIMD_WIDTH, max(ssize_t(0), i - x)) * sizeof(int16_t)), + rightMask = PartialVectorMask(min(SIMD_WIDTH, width - (x + i)) * sizeof(int16_t)); + leftNeighbor = _mm_blendv_epi8(leftNeighbor, leftBorderValue, leftMask); + rightNeighbor = _mm_blendv_epi8(rightBorderValue, rightNeighbor, rightMask); + } + vSum = _mm_adds_epi16(vSum, _mm_mulhrs_epi16(filter, _mm_adds_epi16(leftNeighbor, rightNeighbor))); + } + #endif + } + else + { + throw 0; + } + x += SIMD_WIDTH; + } while (x < xEnd); + ScaleAndStoreInt16(&out[y][x - SIMD_WIDTH], vSum, xEnd - (x - SIMD_WIDTH)); + ++y; + } while (y < height); + } +} + +typedef int16_t FIR_IntermediateType; // can be either float or int16_t + +// handles blocking +// in-place (out = in) operation not allowed +template +void ConvolveHorizontalFIR(SimpleImage out, + SimpleImage in, + ssize_t width, ssize_t height, float sigmaX) +{ + typedef MyTraits::SIMDtype SIMDtype; + + ssize_t halfFilterSize = _effect_area_scr(sigmaX); + float *filter = (float *)alloca((halfFilterSize + 1) * sizeof(float)); + _make_kernel(filter, sigmaX); + + SIMDtype *vFilter = (SIMDtype *)ALIGNED_ALLOCA((halfFilterSize + 1) * sizeof(SIMDtype), sizeof(SIMDtype)); + + for (ssize_t i = 0; i <= halfFilterSize; ++i) + { + if (typeid(FIR_IntermediateType) == typeid(int16_t)) + vFilter[i] = BroadcastSIMD(clip_round_cast(filter[i] * 32768)); + else + vFilter[i] = BroadcastSIMD((FIR_IntermediateType)filter[i]); + } + + const ssize_t IDEAL_Y_BLOCK_SIZE = 1; // pointless for now, but might be needed in the future when SIMD code processes 2 rows at a time + + #pragma omp parallel + { + #pragma omp for + for (ssize_t y = 0; y < height; y += 
IDEAL_Y_BLOCK_SIZE) + { + ssize_t yBlockSize = min(height - y, IDEAL_Y_BLOCK_SIZE); + + ssize_t nonBorderStart = RoundUp(halfFilterSize, ssize_t(sizeof(__m256) / channels)); // so that data for non-border region is vector aligned + ConvolveHorizontalFIR(out.SubImage(0, y), + in.SubImage(0, y), + width, yBlockSize, + nonBorderStart, width - halfFilterSize, + vFilter, halfFilterSize); + + ssize_t xStart = 0, + xEnd = nonBorderStart; + processEnd: + ConvolveHorizontalFIR(out.SubImage(0, y), + in.SubImage(0, y), + width, yBlockSize, + xStart, xEnd, + vFilter, halfFilterSize); + if (xStart == 0) + { + // avoid inline happy compiler from inlining another call to ConvolveHorizontalFIR() + xStart = width - halfFilterSize; + xEnd = width; + goto processEnd; + } + } + } // omp parallel +} + +// in-place (out = in) operation not allowed +template +void ConvolveVerticalFIR(SimpleImage out, SimpleImage in, + ssize_t width, ssize_t height, + ssize_t yStart, ssize_t yEnd, + MyTraits::SIMDtype *vFilter, int filterSize) +{ + ssize_t y = yStart; + do + { + ssize_t x = 0; +#ifdef __AVX__ + const ssize_t SIMD_WIDTH = 8; + __m256 vSum; + goto middle; + do + { + StoreFloats(&out[y][x - SIMD_WIDTH], vSum); // write out data from previous iteration + middle: + if (symmetric) + { + vSum = vFilter[0] * LoadFloats(&in[y][x]); + for (ssize_t i = 1; i <= filterSize; ++i) + { + ssize_t srcY = y - i; + if (onBorder) + srcY = max(ssize_t(0), srcY); + __m256 bottom = LoadFloats(&in[srcY][x]); + + srcY = y + i; + if (onBorder) + srcY = min(height - 1, srcY); + __m256 top = LoadFloats(&in[srcY][x]); + + vSum = vSum + vFilter[i] * (bottom + top); + } + } + else + { + // wouldn't be surprised if the smaller & simpler do-while code outweighs the cost of the extra add + vSum = _mm256_setzero_ps(); + ssize_t i = 0; + do + { + ssize_t srcY = y - filterSize / 2 + i; + if (onBorder) + srcY = min(height - 1, max(ssize_t(0), srcY)); + + vSum = vSum + vFilter[i] * LoadFloats(&in[srcY][x]); + ++i; + } while (i < filterSize); + } + x += SIMD_WIDTH; + } while (x < width); + StoreFloats(&out[y][x], vSum, width - (x - SIMD_WIDTH)); +#else + // for SSE only + const ssize_t SIMD_WIDTH = 4; + __m128 vSum; + goto middle; + do + { + StoreFloats(&out[y][x - SIMD_WIDTH], vSum); // write out data from previous iteration + middle: + if (symmetric) + { + vSum = _mm256_castps256_ps128(vFilter[0]) * Load4Floats(&in[y][x]); + for (ssize_t i = 1; i <= filterSize; ++i) + { + ssize_t srcY = y - i; + if (onBorder) + srcY = max(ssize_t(0), srcY); + __m128 bottom = Load4Floats(&in[srcY][x]); + + srcY = y + i; + if (onBorder) + srcY = min(height - 1, srcY); + __m128 top = Load4Floats(&in[srcY][x]); + + vSum = vSum + _mm256_castps256_ps128(vFilter[i]) * (bottom + top); + } + } + else + { + vSum = _mm_setzero_ps(); + ssize_t i = 0; + do + { + ssize_t srcY = y - filterSize / 2 + i; + if (onBorder) + srcY = min(height - 1, max(ssize_t(0), srcY)); + __m128 _vFilter; +#ifdef __AVX__ + _vFilter = _mm256_castps256_ps128(vFilter[i]); +#else + _vFilter = vFilter[i]; +#endif + vSum = vSum + _vFilter * Load4Floats(&in[srcY][x]); + ++i; + } while (i < filterSize); + } + x += SIMD_WIDTH; + } while (x < width); + StoreFloats(&out[y][x - SIMD_WIDTH], vSum, width - (x - SIMD_WIDTH)); // todo: handle partial +#endif + ++y; + } while (y < yEnd); +} + +// in-place (out = in) operation not allowed +template +void ConvolveVerticalFIR(SimpleImage out, SimpleImage in, + ssize_t width, ssize_t height, + ssize_t yStart, ssize_t yEnd, + MyTraits::SIMDtype *vFilter, int filterSize) +{ + 
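+    // Fixed-point scheme (shared with the horizontal int16_t path):
+    //  - samples are kept in Q6: uint8_t input is shifted left by 6 on load, and the int16_t
+    //    intermediate produced by the horizontal pass is already scaled; the spare bit lets two
+    //    taps be summed with saturating adds without overflow
+    //  - filter coefficients are quantized to Q15 (coefficient * 32768)
+    //  - _mm_mulhrs_epi16 / _mm256_mulhrs_epi16 compute round((a * b) >> 15), so the running sum
+    //    stays in Q6
+    //  - ScaleAndStoreInt16() rounds back to the output format (for uint8_t: add 32, shift right
+    //    by 6, then pack with saturation)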
ssize_t y = yStart; + do + { + ssize_t x = 0; + #ifdef __AVX2__ + const ssize_t SIMD_WIDTH = 16; + __m256i vSum; + goto middle; + do + { + ScaleAndStoreInt16(&out[y][x - SIMD_WIDTH], vSum); // store data from previous iteration + middle: + if (symmetric) + { + __m256i center; + vSum = _mm256_mulhrs_epi16(vFilter[0], LoadAndScaleToInt16(center, &in[y][x])); + for (ssize_t i = 1; i <= filterSize; ++i) + { + __m256i filter = vFilter[i]; + ssize_t srcY = y + i; + if (onBorder) + srcY = min(srcY, height - 1); + __m256i topNeighbor; + LoadAndScaleToInt16(topNeighbor, &in[srcY][x]); + + srcY = y - i; + if (onBorder) + srcY = max(srcY, ssize_t(0)); + __m256i bottomNeighbor; + LoadAndScaleToInt16(bottomNeighbor, &in[srcY][x]); + vSum = _mm256_adds_epi16 + ( + vSum, + _mm256_mulhrs_epi16 + ( + filter, + _mm256_adds_epi16(bottomNeighbor, topNeighbor) + ) + ); + } + } + else + { + throw 0; + } + x += SIMD_WIDTH; + } while (x < width); + ScaleAndStoreInt16(&out[y][x - SIMD_WIDTH], vSum, width - (x - SIMD_WIDTH)); + #else + const ssize_t SIMD_WIDTH = 8; + __m128i vSum; + goto middle; + do + { + ScaleAndStoreInt16(&out[y][x - SIMD_WIDTH], vSum); // store data from previous iteration + middle: + if (symmetric) + { + __m128i center; + vSum = _mm_mulhrs_epi16(_mm256_castsi256_si128(vFilter[0]), LoadAndScaleToInt16(center, &in[y][x])); + for (ssize_t i = 1; i <= filterSize; ++i) + { + __m128i filter = _mm256_castsi256_si128(vFilter[i]); + ssize_t srcY = y + i; + if (onBorder) + srcY = min(srcY, height - 1); + __m128i topNeighbor; + LoadAndScaleToInt16(topNeighbor, &in[srcY][x]); + + srcY = y - i; + if (onBorder) + srcY = max(srcY, ssize_t(0)); + __m128i bottomNeighbor; + LoadAndScaleToInt16(bottomNeighbor, &in[srcY][x]); + vSum = _mm_adds_epi16 + ( + vSum, + _mm_mulhrs_epi16 + ( + filter, + _mm_adds_epi16(bottomNeighbor, topNeighbor) + ) + ); + } + } + else + { + throw 0; + } + x += SIMD_WIDTH; + } while (x < width); + ScaleAndStoreInt16(&out[y][x - SIMD_WIDTH], vSum, width - (x - SIMD_WIDTH)); + #endif + ++y; + } while (y < yEnd); +} + + +// in-place (out = in) operation not allowed +template +void ConvolveVerticalFIR(SimpleImage out, + SimpleImage in, + ssize_t width, ssize_t height, + float sigmaY) +{ + typedef MyTraits::SIMDtype SIMDtype; + + int halfFilterSize = _effect_area_scr(sigmaY); + + float *filter = (float *)alloca((halfFilterSize + 1) * sizeof(float)); + + _make_kernel(filter, sigmaY); + + SIMDtype *vFilter = (SIMDtype *)ALIGNED_ALLOCA((halfFilterSize + 1) * sizeof(SIMDtype), sizeof(SIMDtype)); + + for (ssize_t i = 0; i <= halfFilterSize; ++i) + { + if (typeid(FIR_IntermediateType) == typeid(int16_t)) + vFilter[i] = BroadcastSIMD(clip_round_cast(filter[i] * 32768)); + else + vFilter[i] = BroadcastSIMD((FIR_IntermediateType)filter[i]); + } + + const ssize_t IDEAL_Y_BLOCK_SIZE = 2; // currently, no advantage to making > 1 + + #pragma omp parallel + { + #pragma omp for + for (ssize_t y = 0; y < height; y += IDEAL_Y_BLOCK_SIZE) + { + ssize_t yBlockSize = min(height - y, IDEAL_Y_BLOCK_SIZE); + bool onBorder = y < halfFilterSize || y + IDEAL_Y_BLOCK_SIZE + halfFilterSize > height; + + if (onBorder) + { + ConvolveVerticalFIR(out, in, + width, height, + y, y + yBlockSize, + vFilter, halfFilterSize); + + } + else + { + + ConvolveVerticalFIR(out, in, + width, height, + y, y + yBlockSize, + vFilter, halfFilterSize); + } + } + } // omp parallel +} + +template +void ConvolveFIR(SimpleImage out, SimpleImage in, ssize_t width, ssize_t height, float sigmaX, float sigmaY) +{ + AlignedImage horizontalFiltered; + 
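+    // Separable Gaussian: G(x, y) = g(x) * g(y), so the 2D blur is computed as a horizontal 1D FIR
+    // pass into an intermediate image (FIR_IntermediateType, int16_t in Q6 by default) followed by
+    // a vertical 1D FIR pass into the output.  Two 1D passes cost O(width * height * kernelSize)
+    // instead of O(width * height * kernelSize^2) for a direct 2D convolution.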
+ horizontalFiltered.Resize(width * channels, height); + + const bool DO_TIMING = false; + + double t0; + if (DO_TIMING) + t0 = Clock(); + + ConvolveHorizontalFIR(horizontalFiltered, in, width, height, sigmaX); + + if (DO_TIMING) + { + double t1 = Clock(); + cout << "T_horiz=" << t1 - t0 << endl; + t0 = t1; + } + + // todo: use sliding window to reduce cache pollution + float scale = 1.0f; + if (typeid(FIR_IntermediateType) == typeid(int16_t)) + scale = 1.0f / 64; + + //SaveImage("horizontal_filtered.png", horizontalFiltered, width, height, channels, scale); + ConvolveVerticalFIR(out, horizontalFiltered, width * channels, height, sigmaY); + if (DO_TIMING) + cout << "T_v=" << Clock() - t0 << endl; +} + +#ifdef UNIT_TEST + +template +void CompareImages(SimpleImage ref, SimpleImage actual, int w, int h) +{ + double avgDiff = 0, + maxDiff = 0, + maxErrorFrac = 0; + for (int y = 0; y < h; ++y) + { + for (int x = 0; x < w; ++x) + { + double diff = abs((double)actual[y][x] - (double)ref[y][x]); + maxDiff = max(diff, maxDiff); + avgDiff += diff; + if (ref[y][x] != 0) + { + double errorFrac = abs(diff / ref[y][x]); + maxErrorFrac = max(errorFrac, maxErrorFrac); + } + } + } + avgDiff /= (w * h); + cout << "avgDiff=" << setprecision(4) << setw(9) << avgDiff << " maxDiff=" << setw(4) << maxDiff << " maxErrorFrac=" << setprecision(4) << setw(8) << maxErrorFrac << endl; +} + +void RefFilterIIR(cairo_surface_t *out, cairo_surface_t *in, + float deviation_x, float deviation_y) +{ + int threads = omp_get_max_threads(); + const int MAX_THREADS = 16; + + int h = cairo_image_surface_get_height(in), + w = cairo_image_surface_get_width(in); + + IIRValue * tmpdata[MAX_THREADS]; + for (int i = 0; i < threads; ++i) + tmpdata[i] = new IIRValue[std::max(w, h)*4]; + + gaussian_pass_IIR(Geom::X, deviation_x, in, out, tmpdata, threads); + gaussian_pass_IIR(Geom::Y, deviation_y, out, out, tmpdata, threads); + + for (int i = 0; i < threads; ++i) + delete[] tmpdata[i]; +} + +void RefFilterFIR(cairo_surface_t *out, cairo_surface_t *in, + float deviation_x, float deviation_y) +{ + int threads = omp_get_max_threads(); + gaussian_pass_FIR(Geom::X, deviation_x, in, out, threads); + gaussian_pass_FIR(Geom::Y, deviation_y, out, out, threads); +} + + +cairo_surface_t *ConvertToGrayscale(cairo_surface_t *s) +{ + int w = cairo_image_surface_get_width(s), + h = cairo_image_surface_get_height(s); + cairo_surface_t *temp = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, w, h), + *grayScale = cairo_image_surface_create(CAIRO_FORMAT_A8, w, h); + + cairo_t *r = cairo_create(temp); + // set to 0 + cairo_set_source_rgb(r, 0, 0, 0); + cairo_paint(r); + + // convert to gray scale + cairo_set_operator(r, CAIRO_OPERATOR_HSL_LUMINOSITY); + cairo_set_source_surface(r, s, 0, 0); + cairo_paint(r); + cairo_destroy(r); + + ssize_t inPitch = cairo_image_surface_get_stride(temp), + outPitch = cairo_image_surface_get_stride(grayScale); + uint8_t *in = cairo_image_surface_get_data(temp), + *out = cairo_image_surface_get_data(grayScale); + for (int y = 0; y < h; ++y) + { + for (int x = 0; x < w; ++x) + { + out[y * outPitch + x] = in[y * inPitch + x * 4]; + } + } + cairo_surface_destroy(temp); + cairo_surface_mark_dirty(grayScale); + return grayScale; +} + +void CopySurface(cairo_surface_t *out, cairo_surface_t *in) +{ + int pitch = cairo_image_surface_get_stride(in), + h = cairo_image_surface_get_height(in); + + memcpy(cairo_image_surface_get_data(out), cairo_image_surface_get_data(in), pitch * h); + cairo_surface_mark_dirty(out); +} + +int main(int 
argc, char **argv) +{ + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // why does Intel need microcode to handle denormals? NVIDIA handles it in hardware fine + + //omp_set_num_threads(1); + + + cairo_surface_t *in; + bool result; + if (argc >= 2) + in = cairo_image_surface_create_from_png(argv[1]); + else + in = cairo_image_surface_create_from_png("../drmixx/rasterized/99_showdown_carcass.png"); + + // OMG, not aligning to 16 bytes can almost slow things down 2x! + //iluScale(RoundUp(ilGetInteger(IL_IMAGE_WIDTH), 4), RoundUp(ilGetInteger(IL_IMAGE_HEIGHT), 4), 0); + + if (cairo_surface_status(in) != CAIRO_STATUS_SUCCESS) + { + cerr << "error loading" << endl; + return -1; + } + + //if (!iluScale(38, 44, 1)) + // cerr << "error scaling" << endl; + int originalHeight = cairo_image_surface_get_height(in), + originalWidth = cairo_image_surface_get_width(in); + + cairo_surface_t *grayScaleIn = ConvertToGrayscale(in); + + auto IterateCombinations = [&](int adjustedHeight0, int adjustedHeight1, int adjustedWidth0, int adjustedWidth1, auto callback) + { + for (int adjustedHeight = adjustedHeight0; adjustedHeight <= adjustedHeight1; ++adjustedHeight) + { + for (int adjustedWidth = adjustedWidth0; adjustedWidth <= adjustedWidth1; ++adjustedWidth) + { + for (int channels = 1; channels <= 4; channels += 3) + { + cairo_surface_t *modifiedIn = cairo_surface_create_similar_image(in, channels == 1 ? CAIRO_FORMAT_A8 : CAIRO_FORMAT_ARGB32, adjustedWidth, adjustedHeight); + if (cairo_surface_status(modifiedIn) != CAIRO_STATUS_SUCCESS) + { + cerr << "error creating surface" << endl; + continue; + } + cairo_t *ct = cairo_create(modifiedIn); + cairo_scale(ct, double(adjustedWidth) / originalWidth, double(adjustedHeight) / originalHeight); + // scale/convert to given size/format + if (channels == 1) + { + cairo_set_source_rgb(ct, 1, 1, 1); + cairo_mask_surface(ct, grayScaleIn, 0, 0); + } + else + { + cairo_set_source_surface(ct, in, 0, 0); + cairo_paint(ct); + } + cairo_destroy(ct); + + cairo_surface_t *out = cairo_surface_create_similar_image(modifiedIn, cairo_image_surface_get_format(modifiedIn), adjustedWidth, adjustedHeight); + + for (int FIRorIIR = 0; FIRorIIR < 2; ++FIRorIIR) + { + for (int inPlace = 0; inPlace < 2; ++inPlace) + { + if (inPlace) + CopySurface(out, modifiedIn); // restore overwritten input image + + callback(out, inPlace ? out : modifiedIn, modifiedIn, inPlace, FIRorIIR); + } + } + cairo_surface_destroy(modifiedIn); + cairo_surface_destroy(out); + } + } + } + }; +#define COMPARE +#ifdef COMPARE + auto CompareFunction = [&](cairo_surface_t *out, cairo_surface_t *in, cairo_surface_t *backupIn, bool inPlace, bool FIRorIIR) + { + // here, we assume input & output have the same format + int channels = cairo_image_surface_get_format(in) == CAIRO_FORMAT_ARGB32 ? 4 : 1; + + SimpleImage _in(cairo_image_surface_get_data(in), cairo_image_surface_get_stride(in)), + _out(cairo_image_surface_get_data(out), cairo_image_surface_get_stride(out)); + int width = cairo_image_surface_get_width(in), + height = cairo_image_surface_get_height(in); + + cairo_surface_t *refOut = cairo_surface_create_similar_image(in, channels == 1 ? 
CAIRO_FORMAT_A8 : CAIRO_FORMAT_ARGB32, width, height); + + SimpleImage _refOut(cairo_image_surface_get_data(refOut), cairo_image_surface_get_stride(refOut)); + bool originalSize = width == originalWidth && height == originalHeight; + + // test the correctness of different sigmas only for the original image size + // testing multiple sigmas for scaled image sizes is a waste + float DEFAULT_SIGMA = 5.0f; + + float sigmaX0 = originalSize ? 0.5f : DEFAULT_SIGMA, + sigmaX1 = originalSize ? (FIRorIIR ? 64 : 4) : DEFAULT_SIGMA; + for (float sigmaX = sigmaX0; sigmaX <= sigmaX1; sigmaX *= 2) + { + float sigmaY = 1.2f * sigmaX; + + cout << width << "x" << height + << setw(10) << (channels == 4 ? " RGBA" : " grayscale") + << (FIRorIIR ? " IIR" : " FIR") + << setw(15) << (inPlace ? " in-place" : " out-of-place") + << " sigmaX=" << setw(3) << sigmaX << " "; + double dt = 0; + + if (inPlace) + { + CopySurface(out, backupIn); + } + + + if (FIRorIIR) + { + if (channels == 1) + Convolve<1>(_out, _in, width, height, sigmaX, sigmaY); + else + Convolve<4>(_out, _in, width, height, sigmaX, sigmaY); + } + else + { + if (channels == 1) + ConvolveFIR<1>(_out, _in, width, height, sigmaX, sigmaY); + else + ConvolveFIR<4>(_out, _in, width, height, sigmaX, sigmaY); + } + + // ---------------------reference + cairo_surface_t *refIn; + if (inPlace) + { + refIn = refOut; + CopySurface(refIn, backupIn); + } + else + { + refIn = in; + } + + if (FIRorIIR) + RefFilterIIR(refOut, refIn, sigmaX, sigmaY); + else + RefFilterFIR(refOut, refIn, sigmaX, sigmaY); + + if (0)//FIRorIIR && width == 1466 && sigmaX == 0.5f) + { + cout << " dumping "; + cairo_surface_write_to_png(refOut, "filtered_ref.png"); + cairo_surface_write_to_png(out, "filtered_opt.png"); + exit(1); + } + + CompareImages(_refOut, _out, width, height); + } + cairo_surface_destroy(refOut); + }; + + const int SIMD_Y_BLOCK_SIZE = 2, // for checking SIMD remainder handling issues + SIMD_X_BLOCK_SIZE = 4; + int h0 = RoundDown(originalHeight, SIMD_Y_BLOCK_SIZE), + w0 = RoundDown(originalWidth, SIMD_X_BLOCK_SIZE); + IterateCombinations(h0, h0 + SIMD_Y_BLOCK_SIZE, w0, w0 + SIMD_X_BLOCK_SIZE, CompareFunction); + //IterateCombinations(height, height, width, width, CompareFunction); + +#else + // benchmark +#if defined(_WIN32) && defined(_DEBUG) + const int REPEATS = 1; +#else + const int REPEATS = 50 * max(1.0f, sqrtf(omp_get_max_threads())); // assume speedup is sqrt(#cores) +#endif + auto BenchmarkFunction = [&](cairo_surface_t *out, cairo_surface_t *in, cairo_surface_t *backupIn, bool inPlace, bool FIRorIIR) + { + // here, we assume input & output have the same format + int channels = cairo_image_surface_get_format(in) == CAIRO_FORMAT_ARGB32 ? 4 : 1; + + SimpleImage _in(cairo_image_surface_get_data(in), cairo_image_surface_get_stride(in)), + _out(cairo_image_surface_get_data(out), cairo_image_surface_get_stride(out)); + int width = cairo_image_surface_get_width(in), + height = cairo_image_surface_get_height(in); + const bool useRefCode = false; + + + for (float sigma = FIRorIIR ? 5 : 0.5f; sigma <= (FIRorIIR ? 5 : 4); sigma *= 2) + { + cout << width << "x" << height + << setw(10) << (channels == 4 ? " RGBA" : " luminance") + << (FIRorIIR ? " IIR" : " FIR") + << setw(15) << (inPlace ? 
" in-place" : " out-of-place") + << " sigma=" << setw(3) << sigma; + double dt = 0; + for (int i = 0; i < REPEATS; ++i) + { + if (inPlace) + CopySurface(out, backupIn); // copy backup to input/output + + double t0 = Clock(); + if (useRefCode) + { + if (FIRorIIR) + RefFilterIIR(out, in, sigma, sigma); + else + RefFilterFIR(out, in, sigma, sigma); + } + else + { + if (FIRorIIR) + { + if (channels == 1) + Convolve<1>(_out, _in, width, height, sigma, sigma); + else + Convolve<4>(_out, _in, width, height, sigma, sigma); + } + else + { + if (channels == 1) + ConvolveFIR<1>(_out, _in, width, height, sigma, sigma); + else + ConvolveFIR<4>(_out, _in, width, height, sigma, sigma); + } + } + dt += Clock() - t0; + } + cout << setw(9) << setprecision(3) << double(width * height * REPEATS) / (dt * 1000000.0) << " Mpix/s" << endl; + } + }; + int roundedWidth = RoundUp(originalWidth, 4), + roundedHeight = RoundUp(originalHeight, 4); + IterateCombinations(roundedWidth, roundedWidth, roundedHeight, roundedHeight, BenchmarkFunction); + +#endif + cairo_surface_destroy(in); + cairo_surface_destroy(grayScaleIn); + return 0; +} +#endif + void FilterGaussian::render_cairo(FilterSlot &slot) { cairo_surface_t *in = slot.getcairo(_input); @@ -645,22 +4469,81 @@ } cairo_surface_flush(downsampled); + SimpleImage im((uint8_t *)cairo_image_surface_get_data(downsampled), cairo_image_surface_get_stride(downsampled)); + + // 2D filter benefits + // 1. intermediate image may have higher precision than uint8 + // 2. reduced cache pollution from useless flushing of intermediate image to memory + if (scr_len_x > 0 && scr_len_y > 0 && use_IIR_x == use_IIR_y && fmt == CAIRO_FORMAT_ARGB32) + { + if (use_IIR_x) + { + Convolve<4>(im, // out + im, // in + cairo_image_surface_get_width(downsampled), + cairo_image_surface_get_height(downsampled), + deviation_x, deviation_y); + } + else + { + ConvolveFIR<4>(im, // out + im, // in + cairo_image_surface_get_width(downsampled), + cairo_image_surface_get_height(downsampled), + deviation_x, deviation_y); + } + } + else + { if (scr_len_x > 0) { if (use_IIR_x) { - gaussian_pass_IIR(Geom::X, deviation_x, downsampled, downsampled, tmpdata, threads); + if (fmt == CAIRO_FORMAT_ARGB32) + { + ConvolveHorizontal(im, // out + im, // in + cairo_image_surface_get_width(downsampled), + cairo_image_surface_get_height(downsampled), + deviation_x); + } + else { + ConvolveHorizontal(im, // out + im, // in + cairo_image_surface_get_width(downsampled), + cairo_image_surface_get_height(downsampled), + deviation_x); + //gaussian_pass_IIR(Geom::X, deviation_x, downsampled, downsampled, tmpdata, threads); + } } else { + // optimized 1D FIR filter can't work in-place gaussian_pass_FIR(Geom::X, deviation_x, downsampled, downsampled, threads); } } if (scr_len_y > 0) { if (use_IIR_y) { - gaussian_pass_IIR(Geom::Y, deviation_y, downsampled, downsampled, tmpdata, threads); + if (fmt == CAIRO_FORMAT_ARGB32) + { + ConvolveVertical(im, // out + im, // in + cairo_image_surface_get_width(downsampled) * 4, + cairo_image_surface_get_height(downsampled), + deviation_y); + } + else + { + ConvolveVertical(im, // out + im, // in + cairo_image_surface_get_width(downsampled), + cairo_image_surface_get_height(downsampled), + deviation_y); + //gaussian_pass_IIR(Geom::Y, deviation_y, downsampled, downsampled, tmpdata, threads); + } } else { + // optimized 1D FIR filter can't work in-place gaussian_pass_FIR(Geom::Y, deviation_y, downsampled, downsampled, threads); } } - + } // free the temporary data if ( use_IIR_x || use_IIR_y ) { 
for(int i = 0; i < threads; ++i) {