From d28caec8cc367b97bd2ed2e89e9f1ced2d591e77 Mon Sep 17 00:00:00 2001 From: Fabien Freling Date: Mon, 8 Sep 2014 22:32:35 +0200 Subject: [PATCH] Implement SIMD in two passes. In a first pass, we put the pixels values and the coefficients in two separate buffers. In a second pass, we computer the interpolation. --- Makefile | 2 +- rotation.cpp | 109 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/Makefile b/Makefile index 10c1a11..6f7c025 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ include Makefile.rules CXXFLAGS = -std=c++11 -W -Wall -O3 -ffast-math -g $(CXXFLAGS_PLATFORM) -DEFINES = #-DSIMD +DEFINES = -DSIMD BUILD_DIR = . SRC = rotation.cpp \ image.cpp \ diff --git a/rotation.cpp b/rotation.cpp index 6b232c4..d3441a3 100644 --- a/rotation.cpp +++ b/rotation.cpp @@ -325,6 +325,61 @@ uint16_t* generate_border_table_back(uint16_t const* front_padding, // Image rotation // +inline +void fill_row(Image const& src, + Point const& src_rotated_point, + uint8_t* row_buffer, unsigned int row_index, + uint8_t* row_coefs, + int q_pow) +{ + // Quantize on a 8x8 grid + int const q_inter_pow = 3; + int const q_inter = 8; // 1 << q_inter_pow; + int const mask = 0x07; + + int const src_x = src_rotated_point.x >> q_pow; + int const src_y = src_rotated_point.y >> q_pow; + + // Bilinear interpolation + unsigned int const src_index_1 = src_y * src.width + src_x; + unsigned int const src_index_3 = src_index_1 + src.width; + + row_buffer[4 * row_index] = src.buffer[src_index_1]; + row_buffer[4 * row_index + 1] = src.buffer[src_index_1 + 1]; + row_buffer[4 * row_index + 2] = src.buffer[src_index_3]; + row_buffer[4 * row_index + 3] = src.buffer[src_index_3 + 1]; + + unsigned int const x_delta = (src_rotated_point.x >> (q_pow - q_inter_pow)) & mask; + unsigned int const y_delta = (src_rotated_point.y >> (q_pow - q_inter_pow)) & mask; + unsigned int const inv_x = q_inter - x_delta; + unsigned int const inv_y = q_inter - y_delta; + + row_coefs[4 * row_index] = inv_x * inv_y; + row_coefs[4 * row_index + 1] = x_delta * inv_y; + row_coefs[4 * row_index + 2] = inv_x * y_delta; + row_coefs[4 * row_index + 3] = x_delta * y_delta; +} + +inline +void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs, + unsigned int row_index, + pvalue_t* rotate_buffer, unsigned int rot_index) +{ + __m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]); + __m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]); + + pixels = _mm_maddubs_epi16(pixels, coefs); // 2 bins per pixel, 4 pixels + + __m128i const zero = _mm_set1_epi16(0); + pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels + pixels = _mm_srli_epi16(pixels, 6); + + rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0); + rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1); + rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2); + rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3); +} + inline void rotate_pixel(Image const& src, Point const& src_rotated_point, @@ -333,7 +388,7 @@ void rotate_pixel(Image const& src, { // Quantize on a 8x8 grid int const q_inter_pow = 3; - int const q_inter = 1 << q_inter_pow; + int const q_inter = 8; // 1 << q_inter_pow; int const mask = 0x07; int const src_x = src_rotated_point.x >> q_pow; @@ -353,33 +408,9 @@ void rotate_pixel(Image const& src, unsigned int const inv_x = q_inter - x_delta; unsigned int const inv_y = q_inter - y_delta; -#ifndef SIMD - pvalue_t interpolated = ((src_tl * inv_x + src_tr * x_delta) * inv_y - + (src_bl * inv_x + src_br * x_delta) * y_delta) >> (q_inter_pow << 1); + + (src_bl * inv_x + src_br * x_delta) * y_delta) >> 6; rotate_buffer[rot_index] = interpolated; - -#else - - // X-axis - __m128i top = _mm_loadu_si128((__m128i*) &src.buffer[src_index_1]); - __m128i bottom = _mm_loadu_si128((__m128i*) &src.buffer[src_index_3]); - __m128i coef = _mm_set_epi16(x_delta, x_delta, x_delta, x_delta, inv_x, inv_x, inv_x, inv_x); - top = _mm_mullo_epi16(top, coef); - bottom = _mm_mullo_epi16(bottom, coef); - - // Y-axis - coef = _mm_set1_epi16(inv_y); - top = _mm_mullo_epi16(top, coef); - coef = _mm_set1_epi16(y_delta); - bottom = _mm_mullo_epi16(bottom, coef); - top = _mm_add_epi16(top, bottom); - - top = _mm_srli_epi16(top, 2 * q_pow); - - rotate_buffer[rot_index] = _mm_extract_epi16(top, 0) + _mm_extract_epi16(top, 4); - -#endif // ! SIMD } Image* rotate(Image const& src, double angle) @@ -428,6 +459,11 @@ Image* rotate(Image const& src, double angle) int previous_right_padding = 0; + // Row buffer + unique_ptr row_buffer(new uint8_t[width * 4]); + unique_ptr row_coefs(new uint8_t[width * 4]); + + for (int y = 0; y < height; ++y) { @@ -453,12 +489,33 @@ Image* rotate(Image const& src, double angle) src_rotated_origin.y + left_padding * qdx.y); // Body +#ifndef SIMD + for (int x = 0; x < core_pixels; ++x, ++buffer_index) { rotate_pixel(src, src_rotated_point, buffer, buffer_index, q_pos_pow); src_rotated_point += qdx; } +#else + + for (int x = 0; x < core_pixels; ++x) + { + fill_row(src, src_rotated_point, row_buffer.get(), x, row_coefs.get(), q_pos_pow); + src_rotated_point += qdx; + } + + // We process 4 pixels at a time + for (int x = 0; x < core_pixels / 4; ++x) + { + interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index); + buffer_index += 4; + } + buffer_index += core_pixels % 4; + +#endif // ! SIMD + + // // Border // for (int x = 0; x < right_border; ++x, ++buffer_index) // {