Extract one 64-bit value for SSE.

Instead of extracting four 16-bit values one at a time, we directly extract a single 64-bit value (using the SSE4.1 `_mm_extract_epi64` intrinsic).
2014-09-08 22:53:54 +02:00 · 2014-09-08 22:53:54 +02:00 · f618b3fed1
commit f618b3fed1
parent d28caec8cc
2 changed files with 6 additions and 7 deletions
--- a/2
+++ b/2
@@ -4,7 +4,7 @@ case `uname -s` in

  Darwin)
    echo 'CXX = clang++' > Makefile.rules
-    echo 'CXXFLAGS_PLATFORM = ' >> Makefile.rules
+    echo 'CXXFLAGS_PLATFORM = -march=native -msse4.2' >> Makefile.rules
    echo 'LFLAGS = -flto' >> Makefile.rules
    ;;

--- a/rotation.cpp
+++ b/rotation.cpp
@@ -13,6 +13,7 @@
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <tmmintrin.h>
+#include <smmintrin.h>

 #include "image.h"

@@ -363,7 +364,7 @@ void fill_row(Image const& src,
 inline
 void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
                     unsigned int row_index,
-                     pvalue_t* rotate_buffer, unsigned int rot_index)
+                     uint64_t* rotate_buffer, unsigned int rot_index)
 {
  __m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]);
  __m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]);
@@ -374,10 +375,7 @@ void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
  pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
  pixels = _mm_srli_epi16(pixels, 6);

-  rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0);
-  rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1);
-  rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2);
-  rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3);
+  rotate_buffer[rot_index / 4] = _mm_extract_epi64(pixels, 0);
 }

 inline
@@ -443,6 +441,7 @@ Image* rotate(Image const& src, double angle)

  unsigned int buffer_index = 0;
  pvalue_t* buffer = rotated->buffer;
+  uint64_t* buffer64 = (uint64_t*) rotated->buffer;

  int const width = rotated->width;
  int const height = rotated->height;
@@ -508,7 +507,7 @@ Image* rotate(Image const& src, double angle)
    // We process 4 pixels at a time
    for (int x = 0; x < core_pixels / 4; ++x)
    {
-      interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index);
+      interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer64, buffer_index);
      buffer_index += 4;
    }
    buffer_index += core_pixels % 4;