From 33f22effd426eab26ed42ea144477da8268ba587 Mon Sep 17 00:00:00 2001 From: Krzosa Karol Date: Sat, 9 Jul 2022 16:09:21 +0200 Subject: [PATCH] Test cases working, testing operator overloads vs no overloads --- build.bat | 2 +- main.cpp | 34 ++++++++++++++++++++++++++-------- optimization_log.cpp | 31 ++++++++++++++++++++----------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/build.bat b/build.bat index 780abfa..469a4c8 100644 --- a/build.bat +++ b/build.bat @@ -1,5 +1,5 @@ @echo off pushd %~dp0 -clang main.cpp -O2 -mfma -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -I".." -g -o main.exe -Wl,user32.lib +clang main.cpp -mfma -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -I".." -g -o main.exe -Wl,user32.lib popd \ No newline at end of file diff --git a/main.cpp b/main.cpp index 81a675b..48c9a9b 100644 --- a/main.cpp +++ b/main.cpp @@ -280,7 +280,8 @@ F32 edge_function(Vec4 vecp0, Vec4 vecp1, Vec4 p) { #define F32x8 __m256 #define S32x8 __m256i -S32 render_triangle_test_case_number; +S32 render_triangle_test_case_number = 3; +S32 render_triangle_test_case_angle = 1; U64 filled_pixel_count; U64 filled_pixel_cycles; U64 triangle_count; @@ -800,9 +801,11 @@ main(int argc, char **argv) { r.depth320 = (F32 *)arena_push_size(os.perm_arena, sizeof(F32) * screen_x * screen_y); String frame_data = {}; + String raster_details = {}; UISetup setup[] = { UI_SIGNAL("Change scene"_s, scene_callback), UI_LABEL(&frame_data), + UI_LABEL(&raster_details), UI_LABEL(&os.text), }; UI ui = ui_make(setup, buff_cap(setup)); @@ -879,14 +882,29 @@ main(int argc, char **argv) { } ui_end_frame(os.screen, &ui, &font); - frame_data = string_fmt(os.frame_arena, "FPS:%f dt:%f frame:%u camera_pos: %f %f %f camera_yaw: %f %f" - "\nCycle per pixel: %llu Cycles:%llu Pixels:%llu Triangles:%llu", - os.fps, os.delta_time*1000, os.frame, r.camera_pos.x, r.camera_pos.y, r.camera_pos.z, r.camera_yaw.x, r.camera_yaw.y, - filled_pixel_cycles/filled_pixel_count, filled_pixel_cycles, filled_pixel_count, triangle_count); + frame_data = string_fmt(os.frame_arena, "FPS:%f dt:%f frame:%u camera_pos: %f %f %f camera_yaw: %f %f", + os.fps, os.delta_time*1000, os.frame, r.camera_pos.x, r.camera_pos.y, r.camera_pos.z, r.camera_yaw.x, r.camera_yaw.y); + if(filled_pixel_count){ + raster_details = string_fmt(os.frame_arena, "\nAngle:%d Case:%d Cycle per pixel: %llu Cycles:%llu Pixels:%llu Triangles:%llu", + render_triangle_test_case_angle, render_triangle_test_case_number, filled_pixel_cycles/filled_pixel_count, filled_pixel_cycles, filled_pixel_count, triangle_count); - filled_pixel_count = 0; - filled_pixel_cycles = 0; - triangle_count = 0; + filled_pixel_count = 0; + filled_pixel_cycles = 0; + triangle_count = 0; + } + + if(os.frame % 4 == 0){ + render_triangle_test_case_number++; + if(render_triangle_test_case_number == 6){ + render_triangle_test_case_number = 0; + try_again: switch(render_triangle_test_case_angle){ + case 0: r.camera_pos = vec3(-228,94.5,-107); r.camera_yaw = vec2(-1.25, 0.21); break; + case 1: r.camera_pos = vec3(-356,89.5,168); r.camera_yaw = vec2(0.2, 0); break; + case 2: render_triangle_test_case_angle = 0; goto try_again; break; + } + render_triangle_test_case_angle += 1; + } + } } } diff --git a/optimization_log.cpp b/optimization_log.cpp index f01064e..a045c62 100644 --- a/optimization_log.cpp +++ b/optimization_log.cpp @@ -532,18 +532,27 @@ void draw_triangle_nearest_simd_with_overloads(Bitmap* dst, F32 *depth_buffer, B dst_b.simd = {_mm256_sqrt_ps(dst_b.simd)}; } - Vec8I result; - for(S64 i = 0; i < 8; i++){ - if (should_fill[i]){ - U8 red = (U8)(dst_r[i] * 255); - U8 green = (U8)(dst_g[i] * 255); - U8 blue = (U8)(dst_b[i] * 255); - U8 alpha = (U8)(dst_a[i] * 255); - result.e[i] = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0); - } - } + // Convert to integer format + dst_r = dst_r * var255; + dst_g = dst_g * var255; + dst_b = dst_b * var255; + dst_a = dst_a * var255; - _mm256_maskstore_epi32((int *)dst_memory, should_fill.simd, result.simd); + Vec8I dst_r_int = convert_vec8_to_vec8i(dst_r); + Vec8I dst_g_int = convert_vec8_to_vec8i(dst_g); + Vec8I dst_b_int = convert_vec8_to_vec8i(dst_b); + Vec8I dst_a_int = convert_vec8_to_vec8i(dst_a); + + Vec8I dst_int_a_shifted = {_mm256_slli_epi32(dst_a_int.simd, 24)}; + Vec8I dst_int_b_shifted = {_mm256_slli_epi32(dst_b_int.simd, 16)}; + Vec8I dst_int_g_shifted = {_mm256_slli_epi32(dst_g_int.simd, 8)}; + Vec8I dst_int_r_shifted = dst_r_int; + + Vec8I packed_abgr0 = {_mm256_or_si256(dst_int_a_shifted.simd, dst_int_b_shifted.simd)}; + Vec8I packed_abgr1 = {_mm256_or_si256(packed_abgr0.simd, dst_int_g_shifted.simd)}; + Vec8I packed_abgr2 = {_mm256_or_si256(packed_abgr1.simd, dst_int_r_shifted.simd)}; + + _mm256_maskstore_epi32((int *)dst_memory, should_fill.simd, packed_abgr2.simd); } Cy0 -= dx10;