diff --git a/build.bat b/build.bat index e5932f0..ecc9c28 100644 --- a/build.bat +++ b/build.bat @@ -6,4 +6,4 @@ rem assets.exe rem tracy/TracyClient.cpp -DTRACY_ENABLE -clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib +clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib diff --git a/main.cpp b/main.cpp index 95f515d..1a2678c 100644 --- a/main.cpp +++ b/main.cpp @@ -331,7 +331,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig Vec8 Dy10 = vec8(dy10) * var07; Vec8 Dy21 = vec8(dy21) * var07; Vec8 Dy02 = vec8(dy02) * var07; - Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w, should_fill; + Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w; Vec8I ui, vi; U32 *destination = dst->pixels + dst->x*min_y; @@ -349,7 +349,15 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig Cx2 = vec8(Cx2[7]) + Dy02; - should_fill = Cx0 >= vec8(0) & Cx1 >= vec8(0) & Cx2 >= vec8(0); + + Vec8 should_fill; + { + Vec8 a = (vec8(x8) + var07); + Vec8 b = vec8(max_x); + should_fill = a < b; + should_fill = should_fill & (Cx0 >= vec8(0) & Cx1 >= vec8(0) & Cx2 >= vec8(0)); + } + w0 = Cx1 / area8; w1 = Cx2 / area8; w2 = Cx0 / area8; @@ -362,6 +370,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig Vec8 depth = loadu8(depth_pointer); should_fill = should_fill & (depth < interpolated_w); + invw0 = (w0 / vec8(p0.w)); invw1 = (w1 / vec8(p1.w)); invw2 = (w2 / vec8(p2.w)); @@ -378,17 +387,26 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig vi = convert_vec8_to_vec8i(v); // Origin UV (0,0) is in bottom left - U32 *dst_pixel = destination + x8; + _mm256_maskstore_epi32((int *)depth_pointer, should_fill.simd, interpolated_w.simd); + Vec8I indices = ui + ((vec8i(src->y) - vec8i(1) - vi) * vec8i(src->x)); + U32 *pixel[8] = { + src->pixels + indices.e[0], + src->pixels + indices.e[1], + src->pixels + indices.e[2], + src->pixels + indices.e[3], + src->pixels + indices.e[4], + src->pixels + indices.e[5], + src->pixels + indices.e[6], + src->pixels + indices.e[7], + }; + U32 *dst_pixel = destination + x8; for(S64 i = 0; i < 8; i++){ if (should_fill[i]){ PROFILE_SCOPE(fill_triangle_after_depth_test); - depth_pointer[i] = interpolated_w[i]; - - U32 *pixel = src->pixels + (ui[i] + (src->y - 1ll - vi[i]) * src->x); Vec4 result_color; { - U32 c = *pixel; + U32 c = *pixel[i]; F32 a = ((c & 0xff000000) >> 24) / 255.f; F32 b = ((c & 0x00ff0000) >> 16) / 255.f; F32 g = ((c & 0x0000ff00) >> 8) / 255.f; @@ -652,7 +670,7 @@ main(int argc, char **argv) { os.window_size.y = 720; os.window_resizable = 1; assert(os_init()); - Font font = os_load_font(os.perm_arena, 16, "Arial", 0); + Font font = os_load_font(os.perm_arena, 12*os.dpi_scale, "Arial", 0); f22 = load_obj_dump(os.perm_arena, "plane.bin"_s); sponza = load_obj_dump(os.perm_arena, "sponza.bin"_s); diff --git a/vec.cpp b/vec.cpp index 5fa477b..75d9dcb 100644 --- a/vec.cpp +++ b/vec.cpp @@ -36,10 +36,15 @@ union Vec8I{ Vec8I vec8i(S32 x){return {_mm256_set1_epi32(x)}; } Vec8I vec8i(S32 a, S32 b, S32 c, S32 d, S32 e, S32 f, S32 g, S32 h){ return {_mm256_set_epi32(h, g, f, e, d, c, b, a)}; } +Vec8I operator>(Vec8I a, Vec8I b){ + return {_mm256_cmpgt_epi32(a.simd, b.simd)}; +} Vec8I operator+(Vec8I a, Vec8I b){ return {_mm256_add_epi32(a.simd, b.simd)}; } Vec8I operator-(Vec8I a, Vec8I b){ return {_mm256_sub_epi32(a.simd, b.simd)}; } -Vec8I operator*(Vec8I a, Vec8I b){ return {_mm256_mul_epi32(a.simd, b.simd)}; } +Vec8I operator*(Vec8I a, Vec8I b){ + return {_mm256_mullo_epi32(a.simd, b.simd)}; //_mm256_mul_epi32 +} // Vec8I operator/(Vec8I a, Vec8I b){ return {_mm256_div_epi32(a.simd, b.simd)}; } Vec8I operator+=(Vec8I &a, Vec8I b){ return a + b; } -Vec8I convert_vec8_to_vec8i(Vec8 v){ return Vec8I{_mm256_cvtps_epi32(v.simd)}; } +Vec8I convert_vec8_to_vec8i(Vec8 v){ return Vec8I{_mm256_cvtps_epi32(v.simd)}; } \ No newline at end of file