Setting up a test case to test different iterations of rasterization

This commit is contained in:
Krzosa Karol
2022-07-09 15:45:11 +02:00
parent 0ff19b6ab4
commit a6ec33ca4f
3 changed files with 48 additions and 504 deletions

View File

@@ -1,5 +1,5 @@
@echo off @echo off
pushd %~dp0 pushd %~dp0
clang main.cpp -mfma -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -I".." -g -o main.exe -Wl,user32.lib clang main.cpp -O2 -mfma -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -I".." -g -o main.exe -Wl,user32.lib
popd popd

View File

@@ -280,6 +280,7 @@ F32 edge_function(Vec4 vecp0, Vec4 vecp1, Vec4 p) {
#define F32x8 __m256 #define F32x8 __m256
#define S32x8 __m256i #define S32x8 __m256i
S32 render_triangle_test_case_number;
U64 filled_pixel_count; U64 filled_pixel_count;
U64 filled_pixel_cycles; U64 filled_pixel_cycles;
U64 triangle_count; U64 triangle_count;
@@ -706,12 +707,30 @@ void draw_mesh(Render *r, String scene_name, Obj_Material *materials, Obj_Mesh *
in[j].pos.y += r->screen320.y / 2; in[j].pos.y += r->screen320.y / 2;
} }
triangle_count++; triangle_count++;
draw_triangle_nearest_b(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm); if (in_count > 3) triangle_count++;
if (in_count > 3) {
triangle_count++; switch(render_triangle_test_case_number){
draw_triangle_nearest_b(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm); case 0: break;
case 1:
draw_triangle_nearest_a(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm);
if (in_count > 3) draw_triangle_nearest_a(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm);
case 2:
draw_triangle_nearest_b(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm);
if (in_count > 3) draw_triangle_nearest_b(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm);
break;
case 3:
draw_triangle_nearest_simd_with_overloads(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm);
if (in_count > 3) draw_triangle_nearest_simd_with_overloads(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm);
break;
case 4:
draw_triangle_nearest_simd_without_overloads(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm);
if (in_count > 3) draw_triangle_nearest_simd_without_overloads(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm);
break;
case 5:
draw_triangle_nearest(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm);
if (in_count > 3) draw_triangle_nearest(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm);
break;
} }
} }
} }

View File

@@ -6,6 +6,9 @@ void draw_triangle_nearest_a(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l
Vec4 p0, Vec4 p1, Vec4 p2, Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) { Vec3 norm0, Vec3 norm1, Vec3 norm2) {
if(src->pixels == 0) return;
U64 fill_pixels_begin = __rdtsc();
F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x)));
F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y)));
F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x))); F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x)));
@@ -15,6 +18,11 @@ void draw_triangle_nearest_a(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l
S64 max_x = (S64)min((F32)dst->x, ceil(max_x1)); S64 max_x = (S64)min((F32)dst->x, ceil(max_x1));
S64 max_y = (S64)min((F32)dst->y, ceil(max_y1)); S64 max_y = (S64)min((F32)dst->y, ceil(max_y1));
if (min_y >= max_y) return;
if (min_x >= max_x) return;
U32 *destination = dst->pixels + dst->x*min_y; U32 *destination = dst->pixels + dst->x*min_y;
F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y); F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y);
for (S64 y = min_y; y < max_y; y++) { for (S64 y = min_y; y < max_y; y++) {
@@ -98,9 +106,10 @@ void draw_triangle_nearest_a(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l
} }
destination += dst->x; destination += dst->x;
} }
filled_pixel_cycles += __rdtsc() - fill_pixels_begin;
filled_pixel_count += (max_x - min_x)*(max_y - min_y);
} }
function function
void draw_triangle_nearest_b(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction, void draw_triangle_nearest_b(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
Vec4 p0, Vec4 p1, Vec4 p2, Vec4 p0, Vec4 p1, Vec4 p2,
@@ -226,9 +235,7 @@ void draw_triangle_nearest_b(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l
Cy2 -= dx02; Cy2 -= dx02;
destination += dst->x; destination += dst->x;
} }
U64 end_time = __rdtsc(); filled_pixel_cycles += __rdtsc() - fill_pixels_begin;
filled_pixel_cycles += end_time - fill_pixels_begin;
filled_pixel_count += (max_x - min_x)*(max_y - min_y); filled_pixel_count += (max_x - min_x)*(max_y - min_y);
} }
@@ -238,6 +245,8 @@ void draw_triangle_bilinear(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 li
Vec4 p0, Vec4 p1, Vec4 p2, Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) { Vec3 norm0, Vec3 norm1, Vec3 norm2) {
if(src->pixels == 0) return;
U64 fill_pixels_begin = __rdtsc();
F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x)));
F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y)));
F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x))); F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x)));
@@ -248,6 +257,10 @@ void draw_triangle_bilinear(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 li
S64 max_x = (S64)clamp_top((F32)dst->x, ceil(max_x1)); S64 max_x = (S64)clamp_top((F32)dst->x, ceil(max_x1));
S64 max_y = (S64)clamp_top((F32)dst->y, ceil(max_y1)); S64 max_y = (S64)clamp_top((F32)dst->y, ceil(max_y1));
if (min_y >= max_y) return;
if (min_x >= max_x) return;
F32 area = edge_function(p0, p1, p2); F32 area = edge_function(p0, p1, p2);
for (S64 y = min_y; y < max_y; y++) { for (S64 y = min_y; y < max_y; y++) {
for (S64 x = min_x; x < max_x; x++) { for (S64 x = min_x; x < max_x; x++) {
@@ -324,504 +337,18 @@ void draw_triangle_bilinear(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 li
} }
} }
} }
filled_pixel_cycles += __rdtsc() - fill_pixels_begin;
filled_pixel_count += (max_x - min_x)*(max_y - min_y);
} }
function function
void draw_triangle_nearest_c(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction, void draw_triangle_nearest_simd_with_overloads(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) {
if(src->pixels == 0) return;
PROFILE_SCOPE(draw_triangle);
F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x)));
F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y)));
F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x)));
F32 max_y1 = (F32)(max(p0.y, max(p1.y, p2.y)));
S64 min_x = (S64)max(0.f, floor(min_x1));
S64 min_y = (S64)max(0.f, floor(min_y1));
S64 max_x = (S64)min((F32)dst->x, ceil(max_x1));
S64 max_y = (S64)min((F32)dst->y, ceil(max_y1));
if (min_y >= max_y) return;
if (min_x >= max_x) return;
F32 dy10 = (p1.y - p0.y);
F32 dy21 = (p2.y - p1.y);
F32 dy02 = (p0.y - p2.y);
F32 dx10 = (p1.x - p0.x);
F32 dx21 = (p2.x - p1.x);
F32 dx02 = (p0.x - p2.x);
F32 C0 = dy10 * (p0.x) - dx10 * (p0.y);
F32 C1 = dy21 * (p1.x) - dx21 * (p1.y);
F32 C2 = dy02 * (p2.x) - dx02 * (p2.y);
F32 Cy0 = dy10 * min_x - dx10 * min_y - C0;
F32 Cy1 = dy21 * min_x - dx21 * min_y - C1;
F32 Cy2 = dy02 * min_x - dx02 * min_y - C2;
Vec8 Cx0;
Vec8 Cx1;
Vec8 Cx2;
Vec8I var07i = vec8i(0,1,2,3,4,5,6,7);
Vec8 var07 = vec8(0,1,2,3,4,5,6,7);
Vec8 Dy10 = vec8(dy10) * var07;
Vec8 Dy21 = vec8(dy21) * var07;
Vec8 Dy02 = vec8(dy02) * var07;
U32 *destination = dst->pixels + dst->x*min_y;
F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y);
for (S64 y = min_y; y < max_y; y++) {
Cx0 = vec8(Cy0);
Cx1 = vec8(Cy1);
Cx2 = vec8(Cy2);
for (S64 x8 = min_x; x8 < max_x; x8+=8) {
PROFILE_SCOPE(fill_triangle_outer);
Cx0 += Dy10;
Cx1 += Dy21;
Cx2 += Dy02;
Vec8I x = vec8i(x8) + var07i;
for(S64 i = 0; i < 8; i++){
// S64 x = x8+i;
if (Cx0[0] >= 0 && Cx1[0] >= 0 && Cx2[0] >= 0) {
PROFILE_SCOPE(fill_triangle_inner);
F32 w1 = Cx1[0] / area;
F32 w2 = Cx2[0] / area;
F32 w3 = Cx0[0] / area;
// @Note: We could do: interpolated_w = 1.f / interpolated_w to get proper depth
// but why waste an instruction, the smaller the depth value the farther the object
F32 interpolated_w = (1.f / p0.w) * w1 + (1.f / p1.w) * w2 + (1.f / p2.w) * w3;
F32* depth = depth_buffer + (x[i] + y * dst->x);
if (*depth < interpolated_w) {
PROFILE_SCOPE(fill_triangle_after_depth_test);
*depth = interpolated_w;
F32 invw0 = (w1 / p0.w);
F32 invw1 = (w2 / p1.w);
F32 invw2 = (w3 / p2.w);
// Vec3 norm = (norm0 * invw0 + norm1 * invw1 + norm2 * invw2) / interpolated_w;
F32 u = tex0.x * invw0 + tex1.x * invw1 + tex2.x * invw2;
F32 v = tex0.y * invw0 + tex1.y * invw1 + tex2.y * invw2;
{
u /= interpolated_w;
v /= interpolated_w;
u = u - floor(u);
v = v - floor(v);
u = u * (src->x - 1);
v = v * (src->y - 1);
}
S64 ui = (S64)(u);
S64 vi = (S64)(v);
//F32 udiff = u - (F32)ui;
//F32 vdiff = v - (F32)vi;
// Origin UV (0,0) is in bottom left
U32 *dst_pixel = destination + x[i];
U32 *pixel = src->pixels + (ui + (src->y - 1ll - vi) * src->x);
Vec4 result_color; {
U32 c = *pixel;
F32 a = ((c & 0xff000000) >> 24) / 255.f;
F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
F32 g = ((c & 0x0000ff00) >> 8) / 255.f;
F32 r = ((c & 0x000000ff) >> 0) / 255.f;
r*=r;
g*=g;
b*=b;
result_color = { r,g,b,a };
}
Vec4 dst_color; {
U32 c = *dst_pixel;
F32 a = ((c & 0xff000000) >> 24) / 255.f;
F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
F32 g = ((c & 0x0000ff00) >> 8) / 255.f;
F32 r = ((c & 0x000000ff) >> 0) / 255.f;
r*=r; g*=g; b*=b;
dst_color = { r,g,b,a };
}
#if 0
Vec3 light_color = vec3(0.8,0.8,1);
constexpr F32 ambient_strength = 0.1f; {
Vec3 ambient = ambient_strength * light_color;
Vec3 diffuse = clamp_bot(0.f, -dot(norm, light_direction)) * light_color;
result_color.rgb *= (ambient+diffuse);
}
#endif
result_color = premultiplied_alpha(dst_color, result_color);
result_color = almost_linear_to_srgb(result_color);
U32 color32 = vec4_to_u32abgr(result_color);
*dst_pixel = color32;
}
}
Cx0[0] += dy10;
Cx1[0] += dy21;
Cx2[0] += dy02;
}
}
Cy0 -= dx10;
Cy1 -= dx21;
Cy2 -= dx02;
destination += dst->x;
}
}
function
void draw_triangle_nearest_d_bugged(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) {
if(src->pixels == 0) return;
PROFILE_SCOPE(draw_triangle);
F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x)));
F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y)));
F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x)));
F32 max_y1 = (F32)(max(p0.y, max(p1.y, p2.y)));
S64 min_x = (S64)max(0.f, floor(min_x1));
S64 min_y = (S64)max(0.f, floor(min_y1));
S64 max_x = (S64)min((F32)dst->x, ceil(max_x1));
S64 max_y = (S64)min((F32)dst->y, ceil(max_y1));
if (min_y >= max_y) return;
if (min_x >= max_x) return;
F32 dy10 = (p1.y - p0.y);
F32 dy21 = (p2.y - p1.y);
F32 dy02 = (p0.y - p2.y);
F32 dx10 = (p1.x - p0.x);
F32 dx21 = (p2.x - p1.x);
F32 dx02 = (p0.x - p2.x);
F32 C0 = dy10 * (p0.x) - dx10 * (p0.y);
F32 C1 = dy21 * (p1.x) - dx21 * (p1.y);
F32 C2 = dy02 * (p2.x) - dx02 * (p2.y);
F32 Cy0 = dy10 * min_x - dx10 * min_y - C0;
F32 Cy1 = dy21 * min_x - dx21 * min_y - C1;
F32 Cy2 = dy02 * min_x - dx02 * min_y - C2;
Vec8 zero8 = vec8(0);
// Vec8I var07i = vec8i(0,1,2,3,4,5,6,7);
Vec8 var07 = vec8(0,1,2,3,4,5,6,7);
Vec8 var1_8 = vec8(1,2,3,4,5,6,7,8);
Vec8 Dy10 = vec8(dy10) * var1_8;
Vec8 Dy21 = vec8(dy21) * var1_8;
Vec8 Dy02 = vec8(dy02) * var1_8;
Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w;
Vec8I ui, vi;
U32 *destination = dst->pixels + dst->x*min_y;
F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y);
Vec8 area8 = vec8(area);
for (S64 y = min_y; y < max_y; y++) {
Vec8 Cx0 = vec8(Cy0);
Vec8 Cx1 = vec8(Cy1);
Vec8 Cx2 = vec8(Cy2);
for (S64 x8 = min_x; x8 < max_x; x8+=8) {
PROFILE_SCOPE(fill_triangle_outer);
Cx0 = vec8(Cx0[7]) + Dy10;
Cx1 = vec8(Cx1[7]) + Dy21;
Cx2 = vec8(Cx2[7]) + Dy02;
Vec8 should_fill;
{
Vec8 a = (vec8(x8) + var07);
Vec8 b = vec8(max_x);
should_fill = a < b;
should_fill = should_fill & (Cx0 >= zero8 & Cx1 >= zero8 & Cx2 >= zero8);
}
w0 = Cx1 / area8;
w1 = Cx2 / area8;
w2 = Cx0 / area8;
// @Note: We could do: interpolated_w = 1.f / interpolated_w to get proper depth
// but why waste an instruction, the smaller the depth value the farther the object
interpolated_w = vec8(1.f / p0.w) * w0 + vec8(1.f / p1.w) * w1 + vec8(1.f / p2.w) * w2;
F32 *depth_pointer = (depth_buffer + (x8 + y * dst->x));
Vec8 depth = loadu8(depth_pointer);
should_fill = should_fill & (depth < interpolated_w);
invw0 = (w0 / vec8(p0.w));
invw1 = (w1 / vec8(p1.w));
invw2 = (w2 / vec8(p2.w));
u = vec8(tex0.x) * invw0 + vec8(tex1.x) * invw1 + vec8(tex2.x) * invw2;
v = vec8(tex0.y) * invw0 + vec8(tex1.y) * invw1 + vec8(tex2.y) * invw2;
u /= interpolated_w;
v /= interpolated_w;
u = u - floor8(u);
v = v - floor8(v);
u = u * vec8(src->x - 1);
v = v * vec8(src->y - 1);
ui = convert_vec8_to_vec8i(u);
vi = convert_vec8_to_vec8i(v);
// Origin UV (0,0) is in bottom left
_mm256_maskstore_epi32((int *)depth_pointer, should_fill.simd, interpolated_w.simd);
Vec8I indices = ui + ((vec8i(src->y) - vec8i(1) - vi) * vec8i(src->x));
U32 *pixel[8] = {
src->pixels + indices.e[0],
src->pixels + indices.e[1],
src->pixels + indices.e[2],
src->pixels + indices.e[3],
src->pixels + indices.e[4],
src->pixels + indices.e[5],
src->pixels + indices.e[6],
src->pixels + indices.e[7],
};
U32 *dst_pixel = destination + x8;
for(S64 i = 0; i < 8; i++){
if (should_fill[i]){
PROFILE_SCOPE(fill_triangle_after_depth_test);
Vec4 result_color; {
U32 c = *pixel[i];
F32 a = ((c & 0xff000000) >> 24) / 255.f;
F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
F32 g = ((c & 0x0000ff00) >> 8) / 255.f;
F32 r = ((c & 0x000000ff) >> 0) / 255.f;
r*=r;
g*=g;
b*=b;
result_color = { r,g,b,a };
}
Vec4 dst_color; {
U32 c = dst_pixel[i];
F32 a = ((c & 0xff000000) >> 24) / 255.f;
F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
F32 g = ((c & 0x0000ff00) >> 8) / 255.f;
F32 r = ((c & 0x000000ff) >> 0) / 255.f;
r*=r; g*=g; b*=b;
dst_color = { r,g,b,a };
}
#if 0
Vec3 light_color = vec3(0.8,0.8,1);
constexpr F32 ambient_strength = 0.1f; {
Vec3 ambient = ambient_strength * light_color;
Vec3 diffuse = clamp_bot(0.f, -dot(norm, light_direction)) * light_color;
result_color.rgb *= (ambient+diffuse);
}
#endif
// Premultiplied alpha
{
result_color.r = result_color.r + ((1-result_color.a) * dst_color.r);
result_color.g = result_color.g + ((1-result_color.a) * dst_color.g);
result_color.b = result_color.b + ((1-result_color.a) * dst_color.b);
result_color.a = result_color.a + dst_color.a - result_color.a*dst_color.a;
}
// Almost linear to srgb
{
result_color.r = sqrtf(result_color.r);
result_color.g = sqrtf(result_color.g);
result_color.b = sqrtf(result_color.b);
}
U32 color32;
{
U8 red = (U8)(result_color.r * 255);
U8 green = (U8)(result_color.g * 255);
U8 blue = (U8)(result_color.b * 255);
U8 alpha = (U8)(result_color.a * 255);
color32 = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0);
}
dst_pixel[i] = color32;
}
}
}
Cy0 -= dx10;
Cy1 -= dx21;
Cy2 -= dx02;
destination += dst->x;
}
}
function
void draw_triangle_nearest_e(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) {
if(src->pixels == 0) return;
PROFILE_SCOPE(draw_triangle);
F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x)));
F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y)));
F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x)));
F32 max_y1 = (F32)(max(p0.y, max(p1.y, p2.y)));
S64 min_x = (S64)max(0.f, floor(min_x1));
S64 min_y = (S64)max(0.f, floor(min_y1));
S64 max_x = (S64)min((F32)dst->x, ceil(max_x1));
S64 max_y = (S64)min((F32)dst->y, ceil(max_y1));
if (min_y >= max_y) return;
if (min_x >= max_x) return;
F32 dy10 = (p1.y - p0.y);
F32 dy21 = (p2.y - p1.y);
F32 dy02 = (p0.y - p2.y);
F32 dx10 = (p1.x - p0.x);
F32 dx21 = (p2.x - p1.x);
F32 dx02 = (p0.x - p2.x);
F32 C0 = dy10 * (p0.x) - dx10 * (p0.y);
F32 C1 = dy21 * (p1.x) - dx21 * (p1.y);
F32 C2 = dy02 * (p2.x) - dx02 * (p2.y);
F32 Cy0 = dy10 * min_x - dx10 * min_y - C0;
F32 Cy1 = dy21 * min_x - dx21 * min_y - C1;
F32 Cy2 = dy02 * min_x - dx02 * min_y - C2;
Vec8I var07i = vec8i(0,1,2,3,4,5,6,7);
Vec8 var07 = vec8(0,1,2,3,4,5,6,7);
Vec8 var1_8 = vec8(1,2,3,4,5,6,7,8);
Vec8 Dy10 = vec8(dy10) * var07;
Vec8 Dy21 = vec8(dy21) * var07;
Vec8 Dy02 = vec8(dy02) * var07;
U32 *destination = dst->pixels + dst->x*min_y;
F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y);
for (S64 y = min_y; y < max_y; y++) {
Vec8 Cx0 = vec8(Cy0);
Vec8 Cx1 = vec8(Cy1);
Vec8 Cx2 = vec8(Cy2);
for (S64 x8 = min_x; x8 < max_x; x8+=8) {
PROFILE_SCOPE(fill_triangle_outer);
Cx0 = vec8(Cx0[7]) + Dy10*var1_8;
Cx1 = vec8(Cx1[7]) + Dy21*var1_8;
Cx2 = vec8(Cx2[7]) + Dy02*var1_8;
Vec8I x = vec8i(x8) + var07i;
for(S64 i = 0; i < 8; i++){
// S64 x = x8+i;
if (Cx0[i] >= 0 && Cx1[i] >= 0 && Cx2[i] >= 0) {
PROFILE_SCOPE(fill_triangle_inner);
F32 w1 = Cx1[i] / area;
F32 w2 = Cx2[i] / area;
F32 w3 = Cx0[i] / area;
// @Note: We could do: interpolated_w = 1.f / interpolated_w to get proper depth
// but why waste an instruction, the smaller the depth value the farther the object
F32 interpolated_w = (1.f / p0.w) * w1 + (1.f / p1.w) * w2 + (1.f / p2.w) * w3;
F32* depth = depth_buffer + (x[i] + y * dst->x);
if (*depth < interpolated_w) {
PROFILE_SCOPE(fill_triangle_after_depth_test);
*depth = interpolated_w;
F32 invw0 = (w1 / p0.w);
F32 invw1 = (w2 / p1.w);
F32 invw2 = (w3 / p2.w);
// Vec3 norm = (norm0 * invw0 + norm1 * invw1 + norm2 * invw2) / interpolated_w;
F32 u = tex0.x * invw0 + tex1.x * invw1 + tex2.x * invw2;
F32 v = tex0.y * invw0 + tex1.y * invw1 + tex2.y * invw2;
{
u /= interpolated_w;
v /= interpolated_w;
u = u - floor(u);
v = v - floor(v);
u = u * (src->x - 1);
v = v * (src->y - 1);
}
S64 ui = (S64)(u);
S64 vi = (S64)(v);
//F32 udiff = u - (F32)ui;
//F32 vdiff = v - (F32)vi;
// Origin UV (0,0) is in bottom left
U32 *dst_pixel = destination + x[i];
U32 *pixel = src->pixels + (ui + (src->y - 1ll - vi) * src->x);
Vec4 result_color; {
U32 c = *pixel;
F32 a = ((c & 0xff000000) >> 24) / 255.f;
F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
F32 g = ((c & 0x0000ff00) >> 8) / 255.f;
F32 r = ((c & 0x000000ff) >> 0) / 255.f;
r*=r;
g*=g;
b*=b;
result_color = { r,g,b,a };
}
Vec4 dst_color; {
U32 c = *dst_pixel;
F32 a = ((c & 0xff000000) >> 24) / 255.f;
F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
F32 g = ((c & 0x0000ff00) >> 8) / 255.f;
F32 r = ((c & 0x000000ff) >> 0) / 255.f;
r*=r; g*=g; b*=b;
dst_color = { r,g,b,a };
}
#if 0
Vec3 light_color = vec3(0.8,0.8,1);
constexpr F32 ambient_strength = 0.1f; {
Vec3 ambient = ambient_strength * light_color;
Vec3 diffuse = clamp_bot(0.f, -dot(norm, light_direction)) * light_color;
result_color.rgb *= (ambient+diffuse);
}
#endif
result_color = premultiplied_alpha(dst_color, result_color);
result_color = almost_linear_to_srgb(result_color);
U32 color32 = vec4_to_u32abgr(result_color);
*dst_pixel = color32;
}
}
}
}
Cy0 -= dx10;
Cy1 -= dx21;
Cy2 -= dx02;
destination += dst->x;
}
}
function
void draw_triangle_nearest_f(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
Vec4 p0, Vec4 p1, Vec4 p2, Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) { Vec3 norm0, Vec3 norm1, Vec3 norm2) {
if(src->pixels == 0) return; if(src->pixels == 0) return;
U64 fill_pixels_begin = __rdtsc(); U64 fill_pixels_begin = __rdtsc();
PROFILE_SCOPE(draw_triangle);
F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x)));
F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y)));
F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x))); F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x)));
@@ -1024,14 +551,12 @@ void draw_triangle_nearest_f(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l
Cy2 -= dx02; Cy2 -= dx02;
destination += dst->x; destination += dst->x;
} }
U64 end_time = __rdtsc(); filled_pixel_cycles += __rdtsc() - fill_pixels_begin;
filled_pixel_count += (max_x - min_x)*(max_y - min_y);
filled_pixel_cycles += end_time - fill_pixels_begin;
filled_pixel_count += (max_x - min_x)*(max_y - min_y);
} }
function function
void draw_triangle_nearest_g(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction, void draw_triangle_nearest_simd_without_overloads(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
Vec4 p0, Vec4 p1, Vec4 p2, Vec4 p0, Vec4 p1, Vec4 p2,
Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec2 tex0, Vec2 tex1, Vec2 tex2,
Vec3 norm0, Vec3 norm1, Vec3 norm2) { Vec3 norm0, Vec3 norm1, Vec3 norm2) {