|
|
|
@ -13,6 +13,7 @@ |
|
|
|
|
#include <xmmintrin.h> |
|
|
|
|
#include <emmintrin.h> |
|
|
|
|
#include <tmmintrin.h> |
|
|
|
|
#include <smmintrin.h> |
|
|
|
|
|
|
|
|
|
#include "image.h" |
|
|
|
|
|
|
|
|
@ -363,7 +364,7 @@ void fill_row(Image const& src, |
|
|
|
|
inline |
|
|
|
|
void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs, |
|
|
|
|
unsigned int row_index, |
|
|
|
|
pvalue_t* rotate_buffer, unsigned int rot_index) |
|
|
|
|
uint64_t* rotate_buffer, unsigned int rot_index) |
|
|
|
|
{ |
|
|
|
|
__m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]); |
|
|
|
|
__m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]); |
|
|
|
@ -374,10 +375,7 @@ void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs, |
|
|
|
|
pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
|
|
|
|
|
pixels = _mm_srli_epi16(pixels, 6); |
|
|
|
|
|
|
|
|
|
rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0); |
|
|
|
|
rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1); |
|
|
|
|
rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2); |
|
|
|
|
rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3); |
|
|
|
|
rotate_buffer[rot_index / 4] = _mm_extract_epi64(pixels, 0); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
inline |
|
|
|
@ -443,6 +441,7 @@ Image* rotate(Image const& src, double angle) |
|
|
|
|
|
|
|
|
|
unsigned int buffer_index = 0; |
|
|
|
|
pvalue_t* buffer = rotated->buffer; |
|
|
|
|
uint64_t* buffer64 = (uint64_t*) rotated->buffer; |
|
|
|
|
|
|
|
|
|
int const width = rotated->width; |
|
|
|
|
int const height = rotated->height; |
|
|
|
@ -508,7 +507,7 @@ Image* rotate(Image const& src, double angle) |
|
|
|
|
// We process 4 pixels at a time
|
|
|
|
|
for (int x = 0; x < core_pixels / 4; ++x) |
|
|
|
|
{ |
|
|
|
|
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index); |
|
|
|
|
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer64, buffer_index); |
|
|
|
|
buffer_index += 4; |
|
|
|
|
} |
|
|
|
|
buffer_index += core_pixels % 4; |
|
|
|
|