Extract one 64-bit value for SSE.

Instead of extracting four 16-bit values, we directly extract a single
64-bit value.
This commit is contained in:
Fabien Freling 2014-09-08 22:53:54 +02:00
parent d28caec8cc
commit f618b3fed1
2 changed files with 6 additions and 7 deletions

2
configure vendored
View file

@ -4,7 +4,7 @@ case `uname -s` in
Darwin) Darwin)
echo 'CXX = clang++' > Makefile.rules echo 'CXX = clang++' > Makefile.rules
echo 'CXXFLAGS_PLATFORM = ' >> Makefile.rules echo 'CXXFLAGS_PLATFORM = -march=native -msse4.2' >> Makefile.rules
echo 'LFLAGS = -flto' >> Makefile.rules echo 'LFLAGS = -flto' >> Makefile.rules
;; ;;

View file

@ -13,6 +13,7 @@
#include <xmmintrin.h> #include <xmmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
#include <tmmintrin.h> #include <tmmintrin.h>
#include <smmintrin.h>
#include "image.h" #include "image.h"
@ -363,7 +364,7 @@ void fill_row(Image const& src,
inline inline
void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs, void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
unsigned int row_index, unsigned int row_index,
pvalue_t* rotate_buffer, unsigned int rot_index) uint64_t* rotate_buffer, unsigned int rot_index)
{ {
__m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]); __m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]);
__m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]); __m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]);
@ -374,10 +375,7 @@ void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
pixels = _mm_srli_epi16(pixels, 6); pixels = _mm_srli_epi16(pixels, 6);
rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0); rotate_buffer[rot_index / 4] = _mm_extract_epi64(pixels, 0);
rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1);
rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2);
rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3);
} }
inline inline
@ -443,6 +441,7 @@ Image* rotate(Image const& src, double angle)
unsigned int buffer_index = 0; unsigned int buffer_index = 0;
pvalue_t* buffer = rotated->buffer; pvalue_t* buffer = rotated->buffer;
uint64_t* buffer64 = (uint64_t*) rotated->buffer;
int const width = rotated->width; int const width = rotated->width;
int const height = rotated->height; int const height = rotated->height;
@ -508,7 +507,7 @@ Image* rotate(Image const& src, double angle)
// We process 4 pixels at a time // We process 4 pixels at a time
for (int x = 0; x < core_pixels / 4; ++x) for (int x = 0; x < core_pixels / 4; ++x)
{ {
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index); interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer64, buffer_index);
buffer_index += 4; buffer_index += 4;
} }
buffer_index += core_pixels % 4; buffer_index += core_pixels % 4;