From 1c3ff7e8cdfe61d64b601b3a550d0cc231cef480 Mon Sep 17 00:00:00 2001 From: Krzosa Karol Date: Wed, 6 Jul 2022 14:15:59 +0200 Subject: [PATCH] SIMD write to texture! --- build.bat | 2 +- main.cpp | 34 ++++++++++++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/build.bat b/build.bat index ecc9c28..e5932f0 100644 --- a/build.bat +++ b/build.bat @@ -6,4 +6,4 @@ rem assets.exe rem tracy/TracyClient.cpp -DTRACY_ENABLE -clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib +clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib diff --git a/main.cpp b/main.cpp index a853444..70b112f 100644 --- a/main.cpp +++ b/main.cpp @@ -371,6 +371,8 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig F32x8 one_over_p1w = _mm256_set1_ps(1.f / p1.w); F32x8 one_over_p2w = _mm256_set1_ps(1.f / p2.w); + + U32 *destination = dst->pixels + dst->x*min_y; F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y); F32x8 area8 = _mm256_set1_ps(area); @@ -423,7 +425,6 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig F32 *depth_pointer = (depth_buffer + (x8 + y * dst->x)); F32x8 depth = _mm256_loadu_ps((float *)depth_pointer); - // F32x8 should_fill_term = _mm256_cmp_ps(depth, interpolated_w, _CMP_LT_OQ); should_fill = _mm256_and_ps(should_fill, should_fill_term); @@ -550,18 +551,27 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig dst_b = _mm256_sqrt_ps(dst_b); } - S32x8 result; - for(S64 i = 0; i < 8; i++){ - if (I(should_fill, i)){ - U8 red = (U8)(dst_r[i] * 255); - U8 green = (U8)(dst_g[i] * 255); - U8 blue = (U8)(dst_b[i] * 255); - U8 alpha = (U8)(dst_a[i] * 255); - Is(result, i) = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0); - } - } + // Convert to integer format + dst_r = _mm256_mul_ps(dst_r, var255); + dst_g = _mm256_mul_ps(dst_g, var255); + dst_b = _mm256_mul_ps(dst_b, var255); + dst_a = _mm256_mul_ps(dst_a, var255); - _mm256_maskstore_epi32((int *)dst_memory, should_fill, result); + S32x8 dst_r_int = _mm256_cvtps_epi32(dst_r); + S32x8 dst_g_int = _mm256_cvtps_epi32(dst_g); + S32x8 dst_b_int = _mm256_cvtps_epi32(dst_b); + S32x8 dst_a_int = _mm256_cvtps_epi32(dst_a); + + S32x8 dst_int_a_shifted = _mm256_slli_epi32(dst_a_int, 24); + S32x8 dst_int_b_shifted = _mm256_slli_epi32(dst_b_int, 16); + S32x8 dst_int_g_shifted = _mm256_slli_epi32(dst_g_int, 8); + S32x8 dst_int_r_shifted = dst_r_int; + + S32x8 packed_abgr0 = _mm256_or_si256(dst_int_a_shifted, dst_int_b_shifted); + S32x8 packed_abgr1 = _mm256_or_si256(packed_abgr0, dst_int_g_shifted); + S32x8 packed_abgr2 = _mm256_or_si256(packed_abgr1, dst_int_r_shifted); + + _mm256_maskstore_epi32((int *)dst_memory, should_fill, packed_abgr2); } Cy0 -= dx10;