Extract one 64-bit value for SSE.

Instead of extracting four 16-bit values, we directly extract a single
64-bit value.
This commit is contained in:
Fabien Freling 2014-09-08 22:53:54 +02:00
parent d28caec8cc
commit f618b3fed1
2 changed files with 6 additions and 7 deletions

2
configure vendored
View file

@ -4,7 +4,7 @@ case `uname -s` in
Darwin)
echo 'CXX = clang++' > Makefile.rules
echo 'CXXFLAGS_PLATFORM = ' >> Makefile.rules
echo 'CXXFLAGS_PLATFORM = -march=native -msse4.2' >> Makefile.rules
echo 'LFLAGS = -flto' >> Makefile.rules
;;

View file

@ -13,6 +13,7 @@
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include "image.h"
@ -363,7 +364,7 @@ void fill_row(Image const& src,
inline
void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
unsigned int row_index,
pvalue_t* rotate_buffer, unsigned int rot_index)
uint64_t* rotate_buffer, unsigned int rot_index)
{
__m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]);
__m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]);
@ -374,10 +375,7 @@ void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
pixels = _mm_srli_epi16(pixels, 6);
rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0);
rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1);
rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2);
rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3);
rotate_buffer[rot_index / 4] = _mm_extract_epi64(pixels, 0);
}
inline
@ -443,6 +441,7 @@ Image* rotate(Image const& src, double angle)
unsigned int buffer_index = 0;
pvalue_t* buffer = rotated->buffer;
uint64_t* buffer64 = (uint64_t*) rotated->buffer;
int const width = rotated->width;
int const height = rotated->height;
@ -508,7 +507,7 @@ Image* rotate(Image const& src, double angle)
// We process 4 pixels at a time
for (int x = 0; x < core_pixels / 4; ++x)
{
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index);
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer64, buffer_index);
buffer_index += 4;
}
buffer_index += core_pixels % 4;