From bcf16680ae9875dc764af5e94d95791d9a64c9e9 Mon Sep 17 00:00:00 2001 From: Fabien Freling Date: Wed, 16 Jul 2014 02:08:32 +0200 Subject: [PATCH] Remove RGBX structure. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pixels are still packed as RGBX in memory but no structure is created, it’s just a contiguous buffer. Interpolation is now done with SIMD on integer values. - Add SIMD define. --- Makefile | 5 +-- TODO.md | 6 ++-- rotation.cpp | 98 +++++++++++++++++++++++++++++++--------------------- 3 files changed, 65 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index e6dfa57..523d247 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,11 @@ CXX = clang++ -CXXFLAGS = -std=c++11 -W -Wall -O3 -ffast-math -Werror -g +CXXFLAGS = -std=c++11 -W -Wall -O3 -ffast-math -g -Werror +DEFINES = -DSIMD BUILD_DIR=/tmp IMG=img/lena.ppm all: rotation.cpp - $(CXX) $(CXXFLAGS) $< -o $(BUILD_DIR)/rotation + $(CXX) $(CXXFLAGS) $(DEFINES) $< -o $(BUILD_DIR)/rotation clean: @rm -f *~ *.o .*.swp *.ppm cachegrind.out.* diff --git a/TODO.md b/TODO.md index bdb3740..b576775 100644 --- a/TODO.md +++ b/TODO.md @@ -18,8 +18,10 @@ ## Alignement [X] RGBX format (create pixel structure) on 8 bytes (can do computation in-place) -[ ] Load pixels in 64-bit variable -[ ] Align memory on 16 bytes +[X] Load pixels in 64-bit variable + [X] Directly load in SIMD 128-bit variable +[ ] Align memory on 16 bytes (would require padding) +[ ] RGBX tiles ## Layout [ ] Pack 4 neighbors in 16B structure (aligned) diff --git a/rotation.cpp b/rotation.cpp index 294932d..ce85872 100644 --- a/rotation.cpp +++ b/rotation.cpp @@ -65,18 +65,8 @@ uint8_t interpolate_packed(uint32_t pack, double x, double x_inv, double y, doub // Pixel // -typedef uint8_t pvalue_t; -struct pixel_t { - pvalue_t r; - pvalue_t g; - pvalue_t b; - pvalue_t x; // padding - - pixel_t() - : r(0), g(0), b(0), x(0) - {} - -}; +typedef uint16_t pvalue_t; // pixel value type +#define PIXEL_SIZE 4 @@ -88,7 +78,7 @@ struct pixel_t { struct Image { unsigned int width; unsigned int height; - pixel_t* buffer; + pvalue_t* buffer; Image() : width(0) @@ -105,7 +95,8 @@ struct Image { { this->width = w; this->height = h; - buffer = new pixel_t[width * height]; + buffer = new pvalue_t[width * height * PIXEL_SIZE]; + memset(buffer, 0, width * height * PIXEL_SIZE * sizeof (pvalue_t)); } Image(string const& path) @@ -217,15 +208,15 @@ struct Image { virtual bool read_body(std::ifstream& istr) { unsigned int const nb_pixels = width * height; - buffer = new pixel_t[nb_pixels]; + buffer = new pvalue_t[nb_pixels * PIXEL_SIZE]; - pixel_t* pixel = buffer; + pvalue_t* pixel = buffer; for (unsigned int i = 0; i < nb_pixels; ++i) { - pixel->r = istr.get(); - pixel->g = istr.get(); - pixel->b = istr.get(); - ++pixel; + *(pixel++) = istr.get(); + *(pixel++) = istr.get(); + *(pixel++) = istr.get(); + *(pixel++) = 0; // padding } return true; @@ -234,13 +225,13 @@ struct Image { virtual bool write_body(std::ofstream& ostr) const { unsigned int const nb_pixels = width * height; - pixel_t* pixel = buffer; + pvalue_t* pixel = buffer; for (unsigned int i = 0; i < nb_pixels; ++i) { - ostr << (char) pixel->r; - ostr << (char) pixel->g; - ostr << (char) pixel->b; - ++pixel; + ostr << (char) *(pixel++); + ostr << (char) *(pixel++); + ostr << (char) *(pixel++); + pixel++; // padding } return true; @@ -643,20 +634,19 @@ inline void rotate_pixel(Image const& src, Point const& src_rotated_point, unsigned int const src_limit, - pixel_t* rotate_buffer, unsigned int rot_index) + pvalue_t* rotate_buffer, unsigned int rot_index) { unsigned int const quantize = 8; int const src_x = src_rotated_point.x >> 3; int const src_y = src_rotated_point.y >> 3; - unsigned int src_index = src_y * src.width + src_x; + unsigned int src_index = (src_y * src.width + src_x) * PIXEL_SIZE; // Bilinear interpolation unsigned int src_index_1 = src_index; - unsigned int src_index_2 = src_index_1 + 1; - unsigned int src_index_3 = src_index_1 + 1 * src.width; - unsigned int src_index_4 = src_index_3 + 1; + unsigned int src_index_3 = src_index_1 + PIXEL_SIZE * src.width; + unsigned int src_index_4 = src_index_3 + PIXEL_SIZE; // Out-of-bounds check if (src_index_4 >= src_limit) @@ -667,13 +657,41 @@ void rotate_pixel(Image const& src, unsigned int const inv_x = quantize - x_delta; unsigned int const inv_y = quantize - y_delta; - // No SIMD - rotate_buffer[rot_index].r = ((src.buffer[src_index_1].r * inv_x + src.buffer[src_index_2].r * x_delta) * inv_y - + (src.buffer[src_index_3].r * inv_x + src.buffer[src_index_4].r * x_delta) * y_delta) >> 6; - rotate_buffer[rot_index].g = ((src.buffer[src_index_1].g * inv_x + src.buffer[src_index_2].g * x_delta) * inv_y - + (src.buffer[src_index_3].g * inv_x + src.buffer[src_index_4].g * x_delta) * y_delta) >> 6; - rotate_buffer[rot_index].b = ((src.buffer[src_index_1].b * inv_x + src.buffer[src_index_2].b * x_delta) * inv_y - + (src.buffer[src_index_3].b * inv_x + src.buffer[src_index_4].b * x_delta) * y_delta) >> 6; +#ifndef SIMD + + unsigned int src_index_2 = src_index_1 + PIXEL_SIZE; + + rotate_buffer[rot_index] = ((src.buffer[src_index_1] * inv_x + src.buffer[src_index_2] * x_delta) * inv_y + + (src.buffer[src_index_3] * inv_x + src.buffer[src_index_4] * x_delta) * y_delta) >> 6; + rotate_buffer[rot_index + 1] = ((src.buffer[src_index_1 + 1] * inv_x + src.buffer[src_index_2 + 1] * x_delta) * inv_y + + (src.buffer[src_index_3 + 1] * inv_x + src.buffer[src_index_4 + 1] * x_delta) * y_delta) >> 6; + rotate_buffer[rot_index + 2] = ((src.buffer[src_index_1 + 2] * inv_x + src.buffer[src_index_2 + 2] * x_delta) * inv_y + + (src.buffer[src_index_3 + 2] * inv_x + src.buffer[src_index_4 + 2] * x_delta) * y_delta) >> 6; + +#else + + // X-axis + __m128i top = _mm_loadu_si128((__m128i*) &src.buffer[src_index_1]); + __m128i bottom = _mm_loadu_si128((__m128i*) &src.buffer[src_index_3]); + __m128i coef = _mm_set_epi16(x_delta, x_delta, x_delta, x_delta, inv_x, inv_x, inv_x, inv_x); + top = _mm_mullo_epi16(top, coef); + bottom = _mm_mullo_epi16(bottom, coef); + + // Y-axis + coef = _mm_set1_epi16(inv_y); + top = _mm_mullo_epi16(top, coef); + coef = _mm_set1_epi16(y_delta); + bottom = _mm_mullo_epi16(bottom, coef); + top = _mm_add_epi16(top, bottom); + + top = _mm_srli_epi16(top, 6); + + rotate_buffer[rot_index] = _mm_extract_epi16(top, 0) + _mm_extract_epi16(top, 4); + rotate_buffer[rot_index + 1] = _mm_extract_epi16(top, 1) + _mm_extract_epi16(top, 5); + rotate_buffer[rot_index + 2] = _mm_extract_epi16(top, 2) + _mm_extract_epi16(top, 6); + + +#endif // ! SIMD } Image* rotate(Image const& src, double angle) @@ -706,13 +724,13 @@ Image* rotate(Image const& src, double angle) round_if_very_small(src_delta_y.x); round_if_very_small(src_delta_y.y); - unsigned int const src_limit = src.width * src.height * 3; + unsigned int const src_limit = src.width * src.height * PIXEL_SIZE; DPoint const rot_origin_in_src_grid = get_mapped_point(*rotated, Point(0, 0), -rotation); DPoint const rot_origin_in_src = convert_img_coord_precision(src, rot_origin_in_src_grid); unsigned int buffer_index = 0; - pixel_t* buffer = rotated->buffer; + pvalue_t* buffer = rotated->buffer; unsigned int const quantize = 8; int const& src_qwidth = src.width * quantize; @@ -736,7 +754,7 @@ Image* rotate(Image const& src, double angle) buffer, buffer_index); } - ++buffer_index; + buffer_index += PIXEL_SIZE; } }