More code inlined, that pesky epi32 multiply!

This commit is contained in:
Krzosa Karol
2022-07-05 21:00:23 +02:00
parent 5b4412f80a
commit 2d2615b6c9

View File

@@ -293,6 +293,7 @@ U64 filled_pixel_total_time;
// #include "optimization_log.cpp" // #include "optimization_log.cpp"
#define I(x,i) (((F32 *)&x)[i]) #define I(x,i) (((F32 *)&x)[i])
#define Is(x,i) (((S32 *)&x)[i])
typedef __m256 F32x8; typedef __m256 F32x8;
typedef __m256i S32x8; typedef __m256i S32x8;
function function
@@ -347,6 +348,9 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
Vec8 Dy21 = vec8(dy21) * var1_8; Vec8 Dy21 = vec8(dy21) * var1_8;
Vec8 Dy02 = vec8(dy02) * var1_8; Vec8 Dy02 = vec8(dy02) * var1_8;
F32x8 var_src_x_minus_one = _mm256_set1_ps(src->x-1);
F32x8 var_src_y_minus_one = _mm256_set1_ps(src->y-1);
F32x8 var_tex0x = _mm256_set1_ps(tex0.x); F32x8 var_tex0x = _mm256_set1_ps(tex0.x);
F32x8 var_tex1x = _mm256_set1_ps(tex1.x); F32x8 var_tex1x = _mm256_set1_ps(tex1.x);
F32x8 var_tex2x = _mm256_set1_ps(tex2.x); F32x8 var_tex2x = _mm256_set1_ps(tex2.x);
@@ -417,44 +421,61 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
F32x8 depth = _mm256_loadu_ps((float *)depth_pointer); F32x8 depth = _mm256_loadu_ps((float *)depth_pointer);
// //
F32x8 i19 = _mm256_cmp_ps(depth, interpolated_w, _CMP_LT_OQ); F32x8 should_fill_term = _mm256_cmp_ps(depth, interpolated_w, _CMP_LT_OQ);
should_fill = _mm256_and_ps(should_fill, i19); should_fill = _mm256_and_ps(should_fill, should_fill_term);
F32x8 invw0 = _mm256_div_ps(w0, var_p0w); F32x8 invw0 = _mm256_div_ps(w0, var_p0w);
F32x8 invw1 = _mm256_div_ps(w1, var_p1w); F32x8 invw1 = _mm256_div_ps(w1, var_p1w);
F32x8 invw2 = _mm256_div_ps(w2, var_p2w); F32x8 invw2 = _mm256_div_ps(w2, var_p2w);
Vec8 u = vec8(tex0.x) * Vec8{invw0} + vec8(tex1.x) * Vec8{invw1} + vec8(tex2.x) * Vec8{invw2}; F32x8 u_term0 = _mm256_mul_ps(var_tex0x, invw0);
Vec8 v = vec8(tex0.y) * Vec8{invw0} + vec8(tex1.y) * Vec8{invw1} + vec8(tex2.y) * Vec8{invw2}; F32x8 u_term1 = _mm256_mul_ps(var_tex1x, invw1);
F32x8 u_term2 = _mm256_mul_ps(var_tex2x, invw2);
F32x8 u_term3 = _mm256_add_ps(u_term0, u_term1);
F32x8 u0 = _mm256_add_ps(u_term2, u_term3);
u.simd = _mm256_div_ps(u.simd, interpolated_w); F32x8 v_term0 = _mm256_mul_ps(var_tex0y, invw0);
v.simd = _mm256_div_ps(v.simd, interpolated_w); F32x8 v_term1 = _mm256_mul_ps(var_tex1y, invw1);
u = u - floor8(u); F32x8 v_term2 = _mm256_mul_ps(var_tex2y, invw2);
v = v - floor8(v); F32x8 v_term3 = _mm256_add_ps(v_term0, v_term1);
u = u * vec8(src->x - 1); F32x8 v0 = _mm256_add_ps(v_term2, v_term3);
v = v * vec8(src->y - 1);
Vec8I ui = convert_vec8_to_vec8i(u); F32x8 u1 = _mm256_div_ps(u0, interpolated_w);
Vec8I vi = convert_vec8_to_vec8i(v); F32x8 v1 = _mm256_div_ps(v0, interpolated_w);
F32x8 u_floored = _mm256_floor_ps(u1);
F32x8 v_floored = _mm256_floor_ps(v1);
F32x8 u2 = _mm256_sub_ps(u1, u_floored);
F32x8 v2 = _mm256_sub_ps(v1, v_floored);
F32x8 u3 = _mm256_mul_ps(u2, var_src_x_minus_one);
F32x8 v3 = _mm256_mul_ps(v2, var_src_y_minus_one);
F32x8 ui = _mm256_cvtps_epi32(u3);
F32x8 vi = _mm256_cvtps_epi32(v3);
// Origin UV (0,0) is in bottom left // Origin UV (0,0) is in bottom left
_mm256_maskstore_epi32((int *)depth_pointer, should_fill, interpolated_w); _mm256_maskstore_epi32((int *)depth_pointer, should_fill, interpolated_w);
Vec8I indices = ui + ((vec8i(src->y) - var1i - vi) * vec8i(src->x));
S32 size = src->x * src->y; S32x8 indices0 = _mm256_set1_epi32(src->y - 1);
indices.simd = _mm256_min_epi32(_mm256_set1_ps(size), indices.simd); S32x8 indices1 = _mm256_sub_epi32(indices0, vi);
indices.simd = _mm256_max_epi32(var0i.simd, indices.simd); S32x8 indices3 = _mm256_mullo_epi32(_mm256_set1_epi32(src->x), indices1);
S32x8 indices = _mm256_add_epi32(indices3, ui);
// //
// Fetch and calculate texel values // Fetch and calculate texel values
// //
Vec8I pixel; Vec8I pixel;
if(I(should_fill, 0)) pixel.e[0] = src->pixels[indices.e[0]]; if(I(should_fill, 0)) pixel.e[0] = src->pixels[Is(indices, 0)];
if(I(should_fill, 1)) pixel.e[1] = src->pixels[indices.e[1]]; if(I(should_fill, 1)) pixel.e[1] = src->pixels[Is(indices, 1)];
if(I(should_fill, 2)) pixel.e[2] = src->pixels[indices.e[2]]; if(I(should_fill, 2)) pixel.e[2] = src->pixels[Is(indices, 2)];
if(I(should_fill, 3)) pixel.e[3] = src->pixels[indices.e[3]]; if(I(should_fill, 3)) pixel.e[3] = src->pixels[Is(indices, 3)];
if(I(should_fill, 4)) pixel.e[4] = src->pixels[indices.e[4]]; if(I(should_fill, 4)) pixel.e[4] = src->pixels[Is(indices, 4)];
if(I(should_fill, 5)) pixel.e[5] = src->pixels[indices.e[5]]; if(I(should_fill, 5)) pixel.e[5] = src->pixels[Is(indices, 5)];
if(I(should_fill, 6)) pixel.e[6] = src->pixels[indices.e[6]]; if(I(should_fill, 6)) pixel.e[6] = src->pixels[Is(indices, 6)];
if(I(should_fill, 7)) pixel.e[7] = src->pixels[indices.e[7]]; if(I(should_fill, 7)) pixel.e[7] = src->pixels[Is(indices, 7)];
Vec8I texel_i_a = pixel & vec8i(0xff000000); Vec8I texel_i_a = pixel & vec8i(0xff000000);
Vec8I texel_i_b = pixel & vec8i(0x00ff0000); Vec8I texel_i_b = pixel & vec8i(0x00ff0000);