Implement pixels as RGBX structure.

- Add ‘make debug’ target - Add links in README
2014-07-12 22:37:15 +02:00 · 2014-07-12 22:37:15 +02:00 · cce4d45ba6
parent 8175b8a06c
commit cce4d45ba6
4 changed files with 73 additions and 53 deletions
--- a/3
+++ b/3
@ -12,5 +12,8 @@ clean:
 run: all
 	$(BUILD_DIR)/rotation $(IMG)

+debug: all
+	lldb $(BUILD_DIR)/rotation $(IMG)
+
 cachegrind: all
 	valgrind --tool=cachegrind $(BUILD_DIR)/rotation $(IMG)
--- a/README.md
+++ b/README.md
@ -1,3 +1,11 @@
 # RotateMeFast

 This project aims to rotate bitmap images very quickly (around a millisecond).
+
+## Links
+
+* [What Every Programmer Should Know About Memory](http://www.akkadia.org/drepper/cpumemory.pdf)
+* [Best Practices for Using vImage](https://developer.apple.com/library/ios/documentation/Performance/Conceptual/vImage/BestPractices/BestPractices.html)
+  * [vImageRotate_ARGB8888](https://developer.apple.com/library/mac/documentation/Performance/Reference/vImage_geometric/Reference/reference.html#//apple_ref/c/func/vImageRotate_ARGB8888)
+* [Vectorising code to take advantage of modern CPUs](http://www.walkingrandomly.com/?p=3378)
+  * http://locklessinc.com/articles/vectorize/
--- a/TODO.md
+++ b/TODO.md
@ -1,25 +1,31 @@
-[-] Quaternions
-[X] Draw rotated pixels in src order
+[-] Draw rotated pixels in src order -> cache write miss
 [X] Use atan2 at beginning and end of line.
    Interpolation in-between values
 [X] Test pixel perfect 90
 [ ] Fix out-of-bounds pixel set

 [ ] Optimization for square images?
-[X] Fixed point computation?
-[-] -funroll-loops
+[X] Fixed point computation
+[-] -funroll-loops -> no gain
+[-] restrict qualifier -> unavailable in C++

 # Cache
 [-] Rotate per channel -> no gain
-[ ] Load pixels in 64-bit variable
 [X] Cut image in tiles
 [X] Overlap
 [-] Rotate in one temp tile then copy/move it
 [X] Align tiles in memory
-[-] Align memory -> no gain
-[ ] RGBX format
+
+## Alignment
+[X] RGBX format (create pixel structure) on 8 bytes (can do computation in-place)
+[ ] Load pixels in 64-bit variable
+[ ] Align memory on 16 bytes
+
+## Layout
+[ ] Pack 4 neighbors in 16B structure (aligned)
+    Each point is followed by the point below
 [ ] Spiral layout?

 # Quality
-[X] Interpolate using SIMD, SSE (no big gain)
+[X] Interpolate using SIMD, SSE (no big gain, alignment problem?)
 [ ] Image borders
--- a/rotation.cpp
+++ b/rotation.cpp
@ -60,6 +60,26 @@ uint8_t interpolate_packed(uint32_t pack, double x, double x_inv, double y, doub



+//
+//
+// Pixel
+//
+
+typedef uint8_t pvalue_t; // storage type of a single channel value
+struct pixel_t { // one image pixel: r, g, b channel values plus one padding member
+  pvalue_t r;
+  pvalue_t g;
+  pvalue_t b;
+  pvalue_t x; // padding
+
+  pixel_t() // default constructor: sets all four members to 0
+  : r(0), g(0), b(0), x(0)
+  {}
+
+};
+
+
+
 //
 //
 // Image
@ -68,7 +88,7 @@ uint8_t interpolate_packed(uint32_t pack, double x, double x_inv, double y, doub
 struct Image {
  unsigned int width;
  unsigned int height;
-  uint8_t* buffer;
+  pixel_t* buffer;

  Image()
  : width(0)
@ -85,8 +105,7 @@ struct Image {
  {
    this->width = w;
    this->height = h;
-    buffer = new uint8_t[width * height * 3];
-    memset(buffer, 0, width * height * 3 * sizeof (uint8_t));
+    buffer = new pixel_t[width * height];
  }

  Image(string const& path)
@ -128,26 +147,6 @@ struct Image {
    return true;
  }

-  void set_pixel(unsigned int x, unsigned int y, uint8_t r, uint8_t g, uint8_t b)
-  {
-    if (x >= width || y >= height)
-    {
-//        cerr << __LINE__ << " | Point (" << x << ", " << y << ") out of bounds" << endl;
-//        cerr << " Image dimensions: " << width << " x " << height << endl;
-//        assert(false);
-        return;
-    }
-    int index = (y * width + x) * 3;
-    buffer[index++] = r;
-    buffer[index++] = g;
-    buffer[index++] = b;
-  }
-
-  void set_pixel(Point const& p, uint8_t r, uint8_t g, uint8_t b)
-  {
-    this->set_pixel(p.x, p.y, r, g, b);
-  }
-

  protected:
    bool read_header(std::ifstream& istr)
@ -218,13 +217,15 @@ struct Image {
    virtual bool read_body(std::ifstream& istr)
    {
      unsigned int const nb_pixels = width * height;
-      buffer = new uint8_t[nb_pixels * 3];
+      buffer = new pixel_t[nb_pixels];

-      uint8_t* buf_index = buffer;
-      for (unsigned int i = 0; i < nb_pixels * 3; ++i)
+      pixel_t* pixel = buffer;
+      for (unsigned int i = 0; i < nb_pixels; ++i)
      {
-        *buf_index = istr.get();
-        ++buf_index;
+        pixel->r = istr.get();
+        pixel->g = istr.get();
+        pixel->b = istr.get();
+        ++pixel;
      }

      return true;
@ -233,11 +234,13 @@ struct Image {
    virtual bool write_body(std::ofstream& ostr) const
    {
      unsigned int const nb_pixels = width * height;
-      uint8_t* buf_index = buffer;
-      for (unsigned int i = 0; i < nb_pixels * 3; ++i)
+      pixel_t* pixel = buffer;
+      for (unsigned int i = 0; i < nb_pixels; ++i)
      {
-        ostr << (char) *buf_index;
-        ++buf_index;
+        ostr << (char) pixel->r;
+        ostr << (char) pixel->g;
+        ostr << (char) pixel->b;
+        ++pixel;
      }

      return true;
@ -640,20 +643,20 @@ inline
 void rotate_pixel(Image const& src,
                  Point const& src_rotated_point,
                  unsigned int const src_limit,
-                  uint8_t* rotate_buffer, unsigned int rot_index)
+                  pixel_t* rotate_buffer, unsigned int rot_index)
 {
  unsigned int const quantize = 8;

  int const src_x = src_rotated_point.x >> 3;
  int const src_y = src_rotated_point.y >> 3;

-  unsigned int src_index = (src_y * src.width + src_x) * 3;
+  unsigned int src_index = src_y * src.width + src_x;

  // Bilinear interpolation
  unsigned int src_index_1 = src_index;
-  unsigned int src_index_2 = src_index_1 + 3;
-  unsigned int src_index_3 = src_index_1 + 3 * src.width;
-  unsigned int src_index_4 = src_index_3 + 3;
+  unsigned int src_index_2 = src_index_1 + 1;
+  unsigned int src_index_3 = src_index_1 + 1 * src.width;
+  unsigned int src_index_4 = src_index_3 + 1;

  // Out-of-bounds check
  if (src_index_4 >= src_limit)
@ -665,12 +668,12 @@ void rotate_pixel(Image const& src,
  unsigned int const inv_y = quantize - y_delta;

  // No SIMD
-  rotate_buffer[rot_index] = ((src.buffer[src_index_1] * inv_x + src.buffer[src_index_2] * x_delta) * inv_y
-                           + (src.buffer[src_index_3] * inv_x + src.buffer[src_index_4] * x_delta) * y_delta) >> 6;
-  rotate_buffer[rot_index + 1] = ((src.buffer[src_index_1 + 1] * inv_x + src.buffer[src_index_2 + 1] * x_delta) * inv_y
-                               + (src.buffer[src_index_3 + 1] * inv_x + src.buffer[src_index_4 + 1] * x_delta) * y_delta) >> 6;
-  rotate_buffer[rot_index + 2] = ((src.buffer[src_index_1 + 2] * inv_x + src.buffer[src_index_2 + 2] * x_delta) * inv_y
-                               + (src.buffer[src_index_3 + 2] * inv_x + src.buffer[src_index_4 + 2] * x_delta) * y_delta) >> 6;
+  rotate_buffer[rot_index].r = ((src.buffer[src_index_1].r * inv_x + src.buffer[src_index_2].r * x_delta) * inv_y
+                              + (src.buffer[src_index_3].r * inv_x + src.buffer[src_index_4].r * x_delta) * y_delta) >> 6;
+  rotate_buffer[rot_index].g = ((src.buffer[src_index_1].g * inv_x + src.buffer[src_index_2].g * x_delta) * inv_y
+                              + (src.buffer[src_index_3].g * inv_x + src.buffer[src_index_4].g * x_delta) * y_delta) >> 6;
+  rotate_buffer[rot_index].b = ((src.buffer[src_index_1].b * inv_x + src.buffer[src_index_2].b * x_delta) * inv_y
+                              + (src.buffer[src_index_3].b * inv_x + src.buffer[src_index_4].b * x_delta) * y_delta) >> 6;
 }

 Image* rotate(Image const& src, double angle)
@ -709,7 +712,7 @@ Image* rotate(Image const& src, double angle)
  DPoint const rot_origin_in_src = convert_img_coord_precision(src, rot_origin_in_src_grid);

  unsigned int buffer_index = 0;
-  uint8_t* buffer = rotated->buffer;
+  pixel_t* buffer = rotated->buffer;

  unsigned int const quantize = 8;
  int const& src_qwidth = src.width * quantize;
@ -730,7 +733,7 @@ Image* rotate(Image const& src, double angle)
      {
        rotate_pixel(src, src_runner,
                     src_limit,
-                     buffer, buffer_index * 3);
+                     buffer, buffer_index);
      }

      ++buffer_index;