Big opimizations, almost 2x performance increase!

2022-07-08 10:47:55 +02:00
parent 560f6af3dc
commit b79e8d0df6
3 changed files with 27 additions and 329 deletions
--- a/main.cpp
+++ b/main.cpp
@@ -58,12 +58,12 @@
 /// - [x] Statistics based on profiler data
 /// - [x] Find cool profilers - ExtraSleepy, Vtune
 /// - [ ] Optimizations
-///   - [ ] Inline edge function
-///   - [ ] Expand edge functions to more optimized version
+///   - [x] Inline edge function
+///   - [x] Expand edge functions to more optimized version
 ///   - [ ] Test 4x2 bitmap layout?
 ///   - [ ] Edge function to integer
 ///   - [ ] Use integer bit operations to figure out if plus. (edge0|edge1|edge2)>=0
-///   - [ ] SIMD
+///   - [x] SIMD
 ///   - [ ] Multithreading
 ///
 /// - [x] Text rendering
@@ -76,18 +76,7 @@
 /// - [x] Gamma correct alpha blending for rectangles and bitmaps
 /// - [ ] Plotting of profile data
 ///    - [x] Simple scatter plot
-///
-///
-/// ### Urgent:
-///
-/// - [ ] Simplify the code, especially for the 2d routines
 /// - [x] Asset processor as second program
-///
-///
-#if 0
-#include "tracy/Tracy.hpp"
-#undef assert
-#endif

 // #include "obj_dump.cpp"
 #include "multimedia.cpp"
@@ -288,14 +277,12 @@ F32 edge_function(Vec4 vecp0, Vec4 vecp1, Vec4 p) {
  return result;
 }

-#define I(x,i) (((F32 *)&x)[i])
-#define Is(x,i) (((S32 *)&x)[i])
 #define F32x8 __m256
 #define S32x8 __m256i

 U64 filled_pixel_count;
 U64 filled_pixel_total_time;
-// #include "optimization_log.cpp"
+#include "optimization_log.cpp"

 function
 void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
@@ -439,13 +426,12 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
      F32x8 should_fill_term = _mm256_cmp_ps(depth, interpolated_w, _CMP_LT_OQ);
      should_fill = _mm256_and_ps(should_fill, should_fill_term);

-#if 0
      // If all pixels are not going to get drawn then opt out
-      // Seems to decrease perf
      F32x8 compare_with_zero = _mm256_cmpeq_epi32(should_fill, var0);
      int mask = _mm256_movemask_epi8(compare_with_zero);
-      if(mask == 1) continue;
-#endif
+      if(mask == 0xffffffff) {
+        continue;
+      }

      F32x8 invw0 = _mm256_mul_ps(w0, inv_p0w);
      F32x8 invw1 = _mm256_mul_ps(w1, inv_p1w);
@@ -484,18 +470,21 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
      S32x8 indices3 = _mm256_mullo_epi32(var_src_x_int, indices1);
      S32x8 indices  = _mm256_add_epi32(indices3, ui);

+
      //
      // Fetch and calculate texel values
      //
-      S32x8 pixel;
-      if(I(should_fill, 0)) Is(pixel, 0) = src->pixels[Is(indices, 0)];
-      if(I(should_fill, 1)) Is(pixel, 1) = src->pixels[Is(indices, 1)];
-      if(I(should_fill, 2)) Is(pixel, 2) = src->pixels[Is(indices, 2)];
-      if(I(should_fill, 3)) Is(pixel, 3) = src->pixels[Is(indices, 3)];
-      if(I(should_fill, 4)) Is(pixel, 4) = src->pixels[Is(indices, 4)];
-      if(I(should_fill, 5)) Is(pixel, 5) = src->pixels[Is(indices, 5)];
-      if(I(should_fill, 6)) Is(pixel, 6) = src->pixels[Is(indices, 6)];
-      if(I(should_fill, 7)) Is(pixel, 7) = src->pixels[Is(indices, 7)];
+      indices = _mm256_and_si256(indices, should_fill);
+      S32x8 pixel = _mm256_set_epi32(
+        src->pixels[_mm256_extract_epi32(indices, 7)],
+        src->pixels[_mm256_extract_epi32(indices, 6)],
+        src->pixels[_mm256_extract_epi32(indices, 5)],
+        src->pixels[_mm256_extract_epi32(indices, 4)],
+        src->pixels[_mm256_extract_epi32(indices, 3)],
+        src->pixels[_mm256_extract_epi32(indices, 2)],
+        src->pixels[_mm256_extract_epi32(indices, 1)],
+        src->pixels[_mm256_extract_epi32(indices, 0)]
+      );

      S32x8 texel_i_a = _mm256_and_si256(pixel, var_0xff000000);
      S32x8 texel_i_b = _mm256_and_si256(pixel, var_0x00ff0000);
@@ -505,11 +494,10 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
      // Alpha is done this way because signed integer shift is weird
      // When sign bit is set it sets all bits that we shift the sign through
      // So first we shift
-      texel_i_a = _mm256_srai_epi32(texel_i_a, 24);
-      texel_i_a = _mm256_and_si256(texel_i_a, var_0x000000ff);
-      texel_i_b = _mm256_srai_epi32(texel_i_b, 16);
-      texel_i_g = _mm256_srai_epi32(texel_i_g, 8 );
-      texel_i_r = _mm256_srai_epi32(texel_i_r, 0 );
+      texel_i_a = _mm256_srli_epi32(texel_i_a, 24);
+      texel_i_b = _mm256_srli_epi32(texel_i_b, 16);
+      texel_i_g = _mm256_srli_epi32(texel_i_g, 8 );
+      texel_i_r = _mm256_srli_epi32(texel_i_r, 0 );

      F32x8 texel_a0 = _mm256_cvtepi32_ps(texel_i_a);
      F32x8 texel_b0 = _mm256_cvtepi32_ps(texel_i_b);
@@ -1013,7 +1001,6 @@ main(int argc, char **argv) {

  array_free_all_nodes(&array);
  array_print(&array);
-  __debugbreak();


  f22 = load_obj_dump(os.perm_arena, "plane.bin"_s);