SIMD write to texture!
This commit is contained in:
@@ -6,4 +6,4 @@ rem assets.exe
|
|||||||
rem tracy/TracyClient.cpp -DTRACY_ENABLE
|
rem tracy/TracyClient.cpp -DTRACY_ENABLE
|
||||||
|
|
||||||
|
|
||||||
clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
|
clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
|
||||||
|
|||||||
34
main.cpp
34
main.cpp
@@ -371,6 +371,8 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
|
|||||||
F32x8 one_over_p1w = _mm256_set1_ps(1.f / p1.w);
|
F32x8 one_over_p1w = _mm256_set1_ps(1.f / p1.w);
|
||||||
F32x8 one_over_p2w = _mm256_set1_ps(1.f / p2.w);
|
F32x8 one_over_p2w = _mm256_set1_ps(1.f / p2.w);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
U32 *destination = dst->pixels + dst->x*min_y;
|
U32 *destination = dst->pixels + dst->x*min_y;
|
||||||
F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y);
|
F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y);
|
||||||
F32x8 area8 = _mm256_set1_ps(area);
|
F32x8 area8 = _mm256_set1_ps(area);
|
||||||
@@ -423,7 +425,6 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
|
|||||||
F32 *depth_pointer = (depth_buffer + (x8 + y * dst->x));
|
F32 *depth_pointer = (depth_buffer + (x8 + y * dst->x));
|
||||||
F32x8 depth = _mm256_loadu_ps((float *)depth_pointer);
|
F32x8 depth = _mm256_loadu_ps((float *)depth_pointer);
|
||||||
|
|
||||||
//
|
|
||||||
F32x8 should_fill_term = _mm256_cmp_ps(depth, interpolated_w, _CMP_LT_OQ);
|
F32x8 should_fill_term = _mm256_cmp_ps(depth, interpolated_w, _CMP_LT_OQ);
|
||||||
should_fill = _mm256_and_ps(should_fill, should_fill_term);
|
should_fill = _mm256_and_ps(should_fill, should_fill_term);
|
||||||
|
|
||||||
@@ -550,18 +551,27 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
|
|||||||
dst_b = _mm256_sqrt_ps(dst_b);
|
dst_b = _mm256_sqrt_ps(dst_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
S32x8 result;
|
// Convert to integer format
|
||||||
for(S64 i = 0; i < 8; i++){
|
dst_r = _mm256_mul_ps(dst_r, var255);
|
||||||
if (I(should_fill, i)){
|
dst_g = _mm256_mul_ps(dst_g, var255);
|
||||||
U8 red = (U8)(dst_r[i] * 255);
|
dst_b = _mm256_mul_ps(dst_b, var255);
|
||||||
U8 green = (U8)(dst_g[i] * 255);
|
dst_a = _mm256_mul_ps(dst_a, var255);
|
||||||
U8 blue = (U8)(dst_b[i] * 255);
|
|
||||||
U8 alpha = (U8)(dst_a[i] * 255);
|
|
||||||
Is(result, i) = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
_mm256_maskstore_epi32((int *)dst_memory, should_fill, result);
|
S32x8 dst_r_int = _mm256_cvtps_epi32(dst_r);
|
||||||
|
S32x8 dst_g_int = _mm256_cvtps_epi32(dst_g);
|
||||||
|
S32x8 dst_b_int = _mm256_cvtps_epi32(dst_b);
|
||||||
|
S32x8 dst_a_int = _mm256_cvtps_epi32(dst_a);
|
||||||
|
|
||||||
|
S32x8 dst_int_a_shifted = _mm256_slli_epi32(dst_a_int, 24);
|
||||||
|
S32x8 dst_int_b_shifted = _mm256_slli_epi32(dst_b_int, 16);
|
||||||
|
S32x8 dst_int_g_shifted = _mm256_slli_epi32(dst_g_int, 8);
|
||||||
|
S32x8 dst_int_r_shifted = dst_r_int;
|
||||||
|
|
||||||
|
S32x8 packed_abgr0 = _mm256_or_si256(dst_int_a_shifted, dst_int_b_shifted);
|
||||||
|
S32x8 packed_abgr1 = _mm256_or_si256(packed_abgr0, dst_int_g_shifted);
|
||||||
|
S32x8 packed_abgr2 = _mm256_or_si256(packed_abgr1, dst_int_r_shifted);
|
||||||
|
|
||||||
|
_mm256_maskstore_epi32((int *)dst_memory, should_fill, packed_abgr2);
|
||||||
|
|
||||||
}
|
}
|
||||||
Cy0 -= dx10;
|
Cy0 -= dx10;
|
||||||
|
|||||||
Reference in New Issue
Block a user