Implement SIMD in two passes.
In a first pass, we put the pixels values and the coefficients in two separate buffers. In a second pass, we computer the interpolation.
This commit is contained in:
parent
2491796107
commit
d28caec8cc
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
include Makefile.rules
|
include Makefile.rules
|
||||||
|
|
||||||
CXXFLAGS = -std=c++11 -W -Wall -O3 -ffast-math -g $(CXXFLAGS_PLATFORM)
|
CXXFLAGS = -std=c++11 -W -Wall -O3 -ffast-math -g $(CXXFLAGS_PLATFORM)
|
||||||
DEFINES = #-DSIMD
|
DEFINES = -DSIMD
|
||||||
BUILD_DIR = .
|
BUILD_DIR = .
|
||||||
SRC = rotation.cpp \
|
SRC = rotation.cpp \
|
||||||
image.cpp \
|
image.cpp \
|
||||||
|
|
109
rotation.cpp
109
rotation.cpp
|
@ -325,6 +325,61 @@ uint16_t* generate_border_table_back(uint16_t const* front_padding,
|
||||||
// Image rotation
|
// Image rotation
|
||||||
//
|
//
|
||||||
|
|
||||||
|
inline
|
||||||
|
void fill_row(Image const& src,
|
||||||
|
Point const& src_rotated_point,
|
||||||
|
uint8_t* row_buffer, unsigned int row_index,
|
||||||
|
uint8_t* row_coefs,
|
||||||
|
int q_pow)
|
||||||
|
{
|
||||||
|
// Quantize on a 8x8 grid
|
||||||
|
int const q_inter_pow = 3;
|
||||||
|
int const q_inter = 8; // 1 << q_inter_pow;
|
||||||
|
int const mask = 0x07;
|
||||||
|
|
||||||
|
int const src_x = src_rotated_point.x >> q_pow;
|
||||||
|
int const src_y = src_rotated_point.y >> q_pow;
|
||||||
|
|
||||||
|
// Bilinear interpolation
|
||||||
|
unsigned int const src_index_1 = src_y * src.width + src_x;
|
||||||
|
unsigned int const src_index_3 = src_index_1 + src.width;
|
||||||
|
|
||||||
|
row_buffer[4 * row_index] = src.buffer[src_index_1];
|
||||||
|
row_buffer[4 * row_index + 1] = src.buffer[src_index_1 + 1];
|
||||||
|
row_buffer[4 * row_index + 2] = src.buffer[src_index_3];
|
||||||
|
row_buffer[4 * row_index + 3] = src.buffer[src_index_3 + 1];
|
||||||
|
|
||||||
|
unsigned int const x_delta = (src_rotated_point.x >> (q_pow - q_inter_pow)) & mask;
|
||||||
|
unsigned int const y_delta = (src_rotated_point.y >> (q_pow - q_inter_pow)) & mask;
|
||||||
|
unsigned int const inv_x = q_inter - x_delta;
|
||||||
|
unsigned int const inv_y = q_inter - y_delta;
|
||||||
|
|
||||||
|
row_coefs[4 * row_index] = inv_x * inv_y;
|
||||||
|
row_coefs[4 * row_index + 1] = x_delta * inv_y;
|
||||||
|
row_coefs[4 * row_index + 2] = inv_x * y_delta;
|
||||||
|
row_coefs[4 * row_index + 3] = x_delta * y_delta;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline
|
||||||
|
void interpolate_row(uint8_t* row_buffer, uint8_t* row_coefs,
|
||||||
|
unsigned int row_index,
|
||||||
|
pvalue_t* rotate_buffer, unsigned int rot_index)
|
||||||
|
{
|
||||||
|
__m128i pixels = _mm_loadu_si128((__m128i*) &row_buffer[row_index]);
|
||||||
|
__m128i coefs = _mm_loadu_si128((__m128i*) &row_coefs[row_index]);
|
||||||
|
|
||||||
|
pixels = _mm_maddubs_epi16(pixels, coefs); // 2 bins per pixel, 4 pixels
|
||||||
|
|
||||||
|
__m128i const zero = _mm_set1_epi16(0);
|
||||||
|
pixels = _mm_hadd_epi16(pixels, zero); // 1 bin per pixel, 4 pixels
|
||||||
|
pixels = _mm_srli_epi16(pixels, 6);
|
||||||
|
|
||||||
|
rotate_buffer[rot_index] = _mm_extract_epi16(pixels, 0);
|
||||||
|
rotate_buffer[rot_index + 1] = _mm_extract_epi16(pixels, 1);
|
||||||
|
rotate_buffer[rot_index + 2] = _mm_extract_epi16(pixels, 2);
|
||||||
|
rotate_buffer[rot_index + 3] = _mm_extract_epi16(pixels, 3);
|
||||||
|
}
|
||||||
|
|
||||||
inline
|
inline
|
||||||
void rotate_pixel(Image const& src,
|
void rotate_pixel(Image const& src,
|
||||||
Point const& src_rotated_point,
|
Point const& src_rotated_point,
|
||||||
|
@ -333,7 +388,7 @@ void rotate_pixel(Image const& src,
|
||||||
{
|
{
|
||||||
// Quantize on a 8x8 grid
|
// Quantize on a 8x8 grid
|
||||||
int const q_inter_pow = 3;
|
int const q_inter_pow = 3;
|
||||||
int const q_inter = 1 << q_inter_pow;
|
int const q_inter = 8; // 1 << q_inter_pow;
|
||||||
int const mask = 0x07;
|
int const mask = 0x07;
|
||||||
|
|
||||||
int const src_x = src_rotated_point.x >> q_pow;
|
int const src_x = src_rotated_point.x >> q_pow;
|
||||||
|
@ -353,33 +408,9 @@ void rotate_pixel(Image const& src,
|
||||||
unsigned int const inv_x = q_inter - x_delta;
|
unsigned int const inv_x = q_inter - x_delta;
|
||||||
unsigned int const inv_y = q_inter - y_delta;
|
unsigned int const inv_y = q_inter - y_delta;
|
||||||
|
|
||||||
#ifndef SIMD
|
|
||||||
|
|
||||||
pvalue_t interpolated = ((src_tl * inv_x + src_tr * x_delta) * inv_y
|
pvalue_t interpolated = ((src_tl * inv_x + src_tr * x_delta) * inv_y
|
||||||
+ (src_bl * inv_x + src_br * x_delta) * y_delta) >> (q_inter_pow << 1);
|
+ (src_bl * inv_x + src_br * x_delta) * y_delta) >> 6;
|
||||||
rotate_buffer[rot_index] = interpolated;
|
rotate_buffer[rot_index] = interpolated;
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// X-axis
|
|
||||||
__m128i top = _mm_loadu_si128((__m128i*) &src.buffer[src_index_1]);
|
|
||||||
__m128i bottom = _mm_loadu_si128((__m128i*) &src.buffer[src_index_3]);
|
|
||||||
__m128i coef = _mm_set_epi16(x_delta, x_delta, x_delta, x_delta, inv_x, inv_x, inv_x, inv_x);
|
|
||||||
top = _mm_mullo_epi16(top, coef);
|
|
||||||
bottom = _mm_mullo_epi16(bottom, coef);
|
|
||||||
|
|
||||||
// Y-axis
|
|
||||||
coef = _mm_set1_epi16(inv_y);
|
|
||||||
top = _mm_mullo_epi16(top, coef);
|
|
||||||
coef = _mm_set1_epi16(y_delta);
|
|
||||||
bottom = _mm_mullo_epi16(bottom, coef);
|
|
||||||
top = _mm_add_epi16(top, bottom);
|
|
||||||
|
|
||||||
top = _mm_srli_epi16(top, 2 * q_pow);
|
|
||||||
|
|
||||||
rotate_buffer[rot_index] = _mm_extract_epi16(top, 0) + _mm_extract_epi16(top, 4);
|
|
||||||
|
|
||||||
#endif // ! SIMD
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Image* rotate(Image const& src, double angle)
|
Image* rotate(Image const& src, double angle)
|
||||||
|
@ -428,6 +459,11 @@ Image* rotate(Image const& src, double angle)
|
||||||
|
|
||||||
int previous_right_padding = 0;
|
int previous_right_padding = 0;
|
||||||
|
|
||||||
|
// Row buffer
|
||||||
|
unique_ptr<uint8_t[]> row_buffer(new uint8_t[width * 4]);
|
||||||
|
unique_ptr<uint8_t[]> row_coefs(new uint8_t[width * 4]);
|
||||||
|
|
||||||
|
|
||||||
for (int y = 0; y < height; ++y)
|
for (int y = 0; y < height; ++y)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -453,12 +489,33 @@ Image* rotate(Image const& src, double angle)
|
||||||
src_rotated_origin.y + left_padding * qdx.y);
|
src_rotated_origin.y + left_padding * qdx.y);
|
||||||
|
|
||||||
// Body
|
// Body
|
||||||
|
#ifndef SIMD
|
||||||
|
|
||||||
for (int x = 0; x < core_pixels; ++x, ++buffer_index)
|
for (int x = 0; x < core_pixels; ++x, ++buffer_index)
|
||||||
{
|
{
|
||||||
rotate_pixel(src, src_rotated_point, buffer, buffer_index, q_pos_pow);
|
rotate_pixel(src, src_rotated_point, buffer, buffer_index, q_pos_pow);
|
||||||
src_rotated_point += qdx;
|
src_rotated_point += qdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for (int x = 0; x < core_pixels; ++x)
|
||||||
|
{
|
||||||
|
fill_row(src, src_rotated_point, row_buffer.get(), x, row_coefs.get(), q_pos_pow);
|
||||||
|
src_rotated_point += qdx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We process 4 pixels at a time
|
||||||
|
for (int x = 0; x < core_pixels / 4; ++x)
|
||||||
|
{
|
||||||
|
interpolate_row(row_buffer.get(), row_coefs.get(), x * 16, buffer, buffer_index);
|
||||||
|
buffer_index += 4;
|
||||||
|
}
|
||||||
|
buffer_index += core_pixels % 4;
|
||||||
|
|
||||||
|
#endif // ! SIMD
|
||||||
|
|
||||||
|
|
||||||
// // Border
|
// // Border
|
||||||
// for (int x = 0; x < right_border; ++x, ++buffer_index)
|
// for (int x = 0; x < right_border; ++x, ++buffer_index)
|
||||||
// {
|
// {
|
||||||
|
|
Loading…
Reference in a new issue