diff --git a/configure b/configure index c8b0731..8d08ba1 100755 --- a/configure +++ b/configure @@ -4,7 +4,7 @@ case `uname -s` in Darwin) echo 'CXX = clang++' > Makefile.rules - echo 'CXXFLAGS_PLATFORM = ' >> Makefile.rules + echo 'CXXFLAGS_PLATFORM = -march=native -msse4.2' >> Makefile.rules echo 'LFLAGS = -flto' >> Makefile.rules ;; diff --git a/rotation.cpp b/rotation.cpp index d3441a3..94a13a4 100644 --- a/rotation.cpp +++ b/rotation.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include "image.h" @@ -363,7 +364,7 @@ void fill_row(Image const& src, inline void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs, unsigned int row_index, - pvalue_t* rotate_buffer, unsigned int rot_index) + uint64_t* rotate_buffer, unsigned int rot_index) { __m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]); __m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]); @@ -374,10 +375,7 @@ void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs, pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels pixels = _mm_srli_epi16(pixels, 6); - rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0); - rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1); - rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2); - rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3); + rotate_buffer[rot_index / 4] = _mm_extract_epi64(pixels, 0); } inline @@ -443,6 +441,7 @@ Image* rotate(Image const& src, double angle) unsigned int buffer_index = 0; pvalue_t* buffer = rotated->buffer; + uint64_t* buffer64 = (uint64_t*) rotated->buffer; int const width = rotated->width; int const height = rotated->height; @@ -508,7 +507,7 @@ Image* rotate(Image const& src, double angle) // We process 4 pixels at a time for (int x = 0; x < core_pixels / 4; ++x) { - interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index); + interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer64, buffer_index); buffer_index += 4; } buffer_index += core_pixels % 4;