Extract one 64-bit value for SSE.
Instead of extracting four 16-bit values, we directly extract a single 64-bit value.
This commit is contained in:
parent
d28caec8cc
commit
f618b3fed1
2
configure
vendored
2
configure
vendored
|
@ -4,7 +4,7 @@ case `uname -s` in
|
|||
|
||||
Darwin)
|
||||
echo 'CXX = clang++' > Makefile.rules
|
||||
echo 'CXXFLAGS_PLATFORM = ' >> Makefile.rules
|
||||
echo 'CXXFLAGS_PLATFORM = -march=native -msse4.2' >> Makefile.rules
|
||||
echo 'LFLAGS = -flto' >> Makefile.rules
|
||||
;;
|
||||
|
||||
|
|
11
rotation.cpp
11
rotation.cpp
|
@ -13,6 +13,7 @@
|
|||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include "image.h"
|
||||
|
||||
|
@ -363,7 +364,7 @@ void fill_row(Image const& src,
|
|||
inline
|
||||
void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
|
||||
unsigned int row_index,
|
||||
pvalue_t* rotate_buffer, unsigned int rot_index)
|
||||
uint64_t* rotate_buffer, unsigned int rot_index)
|
||||
{
|
||||
__m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]);
|
||||
__m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]);
|
||||
|
@ -374,10 +375,7 @@ void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
|
|||
pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
|
||||
pixels = _mm_srli_epi16(pixels, 6);
|
||||
|
||||
rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0);
|
||||
rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1);
|
||||
rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2);
|
||||
rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3);
|
||||
rotate_buffer[rot_index / 4] = _mm_extract_epi64(pixels, 0);
|
||||
}
|
||||
|
||||
inline
|
||||
|
@ -443,6 +441,7 @@ Image* rotate(Image const& src, double angle)
|
|||
|
||||
unsigned int buffer_index = 0;
|
||||
pvalue_t* buffer = rotated->buffer;
|
||||
uint64_t* buffer64 = (uint64_t*) rotated->buffer;
|
||||
|
||||
int const width = rotated->width;
|
||||
int const height = rotated->height;
|
||||
|
@ -508,7 +507,7 @@ Image* rotate(Image const& src, double angle)
|
|||
// We process 4 pixels at a time
|
||||
for (int x = 0; x < core_pixels / 4; ++x)
|
||||
{
|
||||
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index);
|
||||
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer64, buffer_index);
|
||||
buffer_index += 4;
|
||||
}
|
||||
buffer_index += core_pixels % 4;
|
||||
|
|
Loading…
Reference in a new issue