diff --git a/build.bat b/build.bat index 1b95a77..c63f53d 100644 --- a/build.bat +++ b/build.bat @@ -6,4 +6,4 @@ rem assets.exe rem tracy/TracyClient.cpp -DTRACY_ENABLE -clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib \ No newline at end of file +clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib \ No newline at end of file diff --git a/main.cpp b/main.cpp index 3170604..79ac272 100644 --- a/main.cpp +++ b/main.cpp @@ -335,11 +335,12 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig Vec8 Dy10 = vec8(dy10) * var07; Vec8 Dy21 = vec8(dy21) * var07; Vec8 Dy02 = vec8(dy02) * var07; - Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w; + Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w, should_fill; Vec8I ui, vi; U32 *destination = dst->pixels + dst->x*min_y; F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y); + Vec8 area8 = vec8(area); for (S64 y = min_y; y < max_y; y++) { Cx0 = vec8(Cy0); Cx1 = vec8(Cy1); @@ -353,16 +354,20 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig Vec8I x = vec8i(x8) + var07i; - for(S64 i = 0; i < 8; i++){ - PROFILE_SCOPE(fill_triangle_inner); - if (Cx0[i] >= 0 && Cx1[i] >= 0 && Cx2[i] >= 0) { - w0[i] = Cx1[i] / area; - w1[i] = Cx2[i] / area; - w2[i] = Cx0[i] / area; + should_fill = Cx0 >= vec8(0) & Cx1 >= vec8(0) & Cx2 >= vec8(0); + w0 = Cx1 / area8; + w1 = Cx2 / area8; + w2 = Cx0 / area8; + + // @Note: We could do: interpolated_w = 1.f / interpolated_w to get proper depth + // but why waste an instruction, the smaller the depth value the farther the object + interpolated_w = vec8(1.f / p0.w) * w0 + vec8(1.f / p1.w) * w1 + vec8(1.f / p2.w) * w2; + // Vec8 *depth = (Vec8 *)(depth_buffer + (x8 + y * dst->x)); + // should_fill = should_fill & (*depth < interpolated_w); + + for(S64 i = 0; i < 8; i++){ + if (should_fill[i]){ - // @Note: We could do: interpolated_w = 1.f / interpolated_w to get proper depth - // but why waste an instruction, the smaller the depth value the farther the object - interpolated_w[i] = (1.f / p0.w) * w0[i] + (1.f / p1.w) * w1[i] + (1.f / p2.w) * w2[i]; F32* depth = depth_buffer + (x[i] + y * dst->x); if (*depth < interpolated_w[i]) { PROFILE_SCOPE(fill_triangle_after_depth_test); @@ -371,7 +376,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig invw1[i] = (w1[i] / p1.w); invw2[i] = (w2[i] / p2.w); - Vec3 norm = (norm0 * invw0[i] + norm1 * invw1[i] + norm2 * invw2[i]) / interpolated_w[i]; + // Vec3 norm = (norm0 * invw0[i] + norm1 * invw1[i] + norm2 * invw2[i]) / interpolated_w[i]; u[i] = tex0.x * invw0[i] + tex1.x * invw1[i] + tex2.x * invw2[i]; v[i] = tex0.y * invw0[i] + tex1.y * invw1[i] + tex2.y * invw2[i]; { diff --git a/vec.cpp b/vec.cpp index b823e43..2070664 100644 --- a/vec.cpp +++ b/vec.cpp @@ -15,6 +15,11 @@ Vec8 operator*(Vec8 a, Vec8 b){ return {_mm256_mul_ps(a.simd, b.simd)}; } Vec8 operator/(Vec8 a, Vec8 b){ return {_mm256_div_ps(a.simd, b.simd)}; } Vec8 operator+=(Vec8 &a, Vec8 b){ a = a + b; return a; } Vec8 operator-=(Vec8 &a, Vec8 b){ a = a - b; return a; } +Vec8 operator>=(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_GE_OQ)}; } +Vec8 operator<=(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_LE_OQ)}; } +// Vec8 operator<(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_LE)}; } +// Vec8 operator>(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_GE)}; } +Vec8 operator&(Vec8 a, Vec8 b){ return {_mm256_and_ps(a.simd, b.simd)}; } union Vec8I{ __m256i simd;