From d28caec8cc367b97bd2ed2e89e9f1ced2d591e77 Mon Sep 17 00:00:00 2001
From: Fabien Freling <public@ffreling.com>
Date: Mon, 8 Sep 2014 22:32:35 +0200
Subject: [PATCH] Implement SIMD in two passes.

In a first pass, we put the pixels values and the coefficients
in two separate buffers.

In a second pass, we computer the interpolation.
---
 Makefile     |   2 +-
 rotation.cpp | 109 +++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 84 insertions(+), 27 deletions(-)

diff --git a/Makefile b/Makefile
index 10c1a11..6f7c025 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 include Makefile.rules
 
 CXXFLAGS  = -std=c++11 -W -Wall -O3 -ffast-math -g $(CXXFLAGS_PLATFORM)
-DEFINES   = #-DSIMD
+DEFINES   = -DSIMD
 BUILD_DIR = .
 SRC       = rotation.cpp \
 	    image.cpp \
diff --git a/rotation.cpp b/rotation.cpp
index 6b232c4..d3441a3 100644
--- a/rotation.cpp
+++ b/rotation.cpp
@@ -325,6 +325,61 @@ uint16_t* generate_border_table_back(uint16_t const* front_padding,
 // Image rotation
 //
 
+inline
+void fill_row(Image const& src,
+              Point const& src_rotated_point,
+              uint8_t* row_buffer, unsigned int row_index,
+              uint8_t* row_coefs,
+              int q_pow)
+{
+  // Quantize on a 8x8 grid
+  int const q_inter_pow = 3;
+  int const q_inter = 8; // 1 << q_inter_pow;
+  int const mask = 0x07;
+
+  int const src_x = src_rotated_point.x >> q_pow;
+  int const src_y = src_rotated_point.y >> q_pow;
+
+  // Bilinear interpolation
+  unsigned int const src_index_1 = src_y * src.width + src_x;
+  unsigned int const src_index_3 = src_index_1 + src.width;
+
+  row_buffer[4 * row_index] = src.buffer[src_index_1];
+  row_buffer[4 * row_index + 1] = src.buffer[src_index_1 + 1];
+  row_buffer[4 * row_index + 2] = src.buffer[src_index_3];
+  row_buffer[4 * row_index + 3] = src.buffer[src_index_3 + 1];
+
+  unsigned int const x_delta = (src_rotated_point.x >> (q_pow - q_inter_pow)) & mask;
+  unsigned int const y_delta = (src_rotated_point.y >> (q_pow - q_inter_pow)) & mask;
+  unsigned int const inv_x = q_inter - x_delta;
+  unsigned int const inv_y = q_inter - y_delta;
+
+  row_coefs[4 * row_index] = inv_x * inv_y;
+  row_coefs[4 * row_index + 1] = x_delta * inv_y;
+  row_coefs[4 * row_index + 2] = inv_x * y_delta;
+  row_coefs[4 * row_index + 3] = x_delta * y_delta;
+}
+
+inline
+void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
+                     unsigned int row_index,
+                     pvalue_t* rotate_buffer, unsigned int rot_index)
+{
+  __m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]);
+  __m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]);
+
+  pixels = _mm_maddubs_epi16(pixels, coefs); // 2 bins per pixel, 4 pixels
+
+  __m128i const zero = _mm_set1_epi16(0);
+  pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
+  pixels = _mm_srli_epi16(pixels, 6);
+
+  rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0);
+  rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1);
+  rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2);
+  rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3);
+}
+
 inline
 void rotate_pixel(Image const& src,
                   Point const& src_rotated_point,
@@ -333,7 +388,7 @@ void rotate_pixel(Image const& src,
 {
   // Quantize on a 8x8 grid
   int const q_inter_pow = 3;
-  int const q_inter = 1 << q_inter_pow;
+  int const q_inter = 8; // 1 << q_inter_pow;
   int const mask = 0x07;
 
   int const src_x = src_rotated_point.x >> q_pow;
@@ -353,33 +408,9 @@ void rotate_pixel(Image const& src,
   unsigned int const inv_x = q_inter - x_delta;
   unsigned int const inv_y = q_inter - y_delta;
 
-#ifndef SIMD
-
   pvalue_t interpolated = ((src_tl * inv_x + src_tr * x_delta) * inv_y
-                         + (src_bl * inv_x + src_br * x_delta) * y_delta) >> (q_inter_pow << 1);
+                        + (src_bl * inv_x + src_br * x_delta) * y_delta) >> 6;
   rotate_buffer[rot_index] = interpolated;
-
-#else
-
-  // X-axis
-  __m128i top = _mm_loadu_si128((__m128i*) &src.buffer[src_index_1]);
-  __m128i bottom = _mm_loadu_si128((__m128i*) &src.buffer[src_index_3]);
-  __m128i coef = _mm_set_epi16(x_delta, x_delta, x_delta, x_delta, inv_x, inv_x, inv_x, inv_x);
-  top = _mm_mullo_epi16(top, coef);
-  bottom = _mm_mullo_epi16(bottom, coef);
-
-  // Y-axis
-  coef = _mm_set1_epi16(inv_y);
-  top = _mm_mullo_epi16(top, coef);
-  coef = _mm_set1_epi16(y_delta);
-  bottom = _mm_mullo_epi16(bottom, coef);
-  top = _mm_add_epi16(top, bottom);
-
-  top = _mm_srli_epi16(top, 2 * q_pow);
-
-  rotate_buffer[rot_index] = _mm_extract_epi16(top, 0) + _mm_extract_epi16(top, 4);
-
-#endif // ! SIMD
 }
 
 Image* rotate(Image const& src, double angle)
@@ -428,6 +459,11 @@ Image* rotate(Image const& src, double angle)
 
   int previous_right_padding = 0;
 
+  // Row buffer
+  unique_ptr<uint8_t[]> row_buffer(new uint8_t[width * 4]);
+  unique_ptr<uint8_t[]> row_coefs(new uint8_t[width * 4]);
+
+
   for (int y = 0; y < height; ++y)
   {
 
@@ -453,12 +489,33 @@ Image* rotate(Image const& src, double angle)
                             src_rotated_origin.y + left_padding * qdx.y);
 
     // Body
+#ifndef SIMD
+
     for (int x = 0; x < core_pixels; ++x, ++buffer_index)
     {
       rotate_pixel(src, src_rotated_point, buffer, buffer_index, q_pos_pow);
       src_rotated_point += qdx;
     }
 
+#else
+
+    for (int x = 0; x < core_pixels; ++x)
+    {
+      fill_row(src, src_rotated_point, row_buffer.get(), x, row_coefs.get(), q_pos_pow);
+      src_rotated_point += qdx;
+    }
+
+    // We process 4 pixels at a time
+    for (int x = 0; x < core_pixels / 4; ++x)
+    {
+      interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index);
+      buffer_index += 4;
+    }
+    buffer_index += core_pixels % 4;
+
+#endif // ! SIMD
+
+
 //    // Border
 //    for (int x = 0; x < right_border; ++x, ++buffer_index)
 //    {