Even more vectorized and even slower

This commit is contained in:
Krzosa Karol
2022-07-01 13:55:28 +02:00
parent d6caf62ced
commit 88652d40d6
3 changed files with 21 additions and 20 deletions

View File

@@ -6,4 +6,4 @@ rem assets.exe
rem tracy/TracyClient.cpp -DTRACY_ENABLE rem tracy/TracyClient.cpp -DTRACY_ENABLE
clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib

View File

@@ -373,26 +373,20 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
u = vec8(tex0.x) * invw0 + vec8(tex1.x) * invw1 + vec8(tex2.x) * invw2; u = vec8(tex0.x) * invw0 + vec8(tex1.x) * invw1 + vec8(tex2.x) * invw2;
v = vec8(tex0.y) * invw0 + vec8(tex1.y) * invw1 + vec8(tex2.y) * invw2; v = vec8(tex0.y) * invw0 + vec8(tex1.y) * invw1 + vec8(tex2.y) * invw2;
u /= interpolated_w;
v /= interpolated_w;
u = u - floor8(u);
v = v - floor8(v);
u = u * vec8(src->x - 1);
v = v * vec8(src->y - 1);
ui = convert_vec8_to_vec8i(u);
vi = convert_vec8_to_vec8i(v);
for(S64 i = 0; i < 8; i++){ for(S64 i = 0; i < 8; i++){
if (should_fill[i]){ if (should_fill[i]){
PROFILE_SCOPE(fill_triangle_after_depth_test); PROFILE_SCOPE(fill_triangle_after_depth_test);
depth_pointer[i] = interpolated_w[i]; depth_pointer[i] = interpolated_w[i];
// Vec3 norm = (norm0 * invw0[i] + norm1 * invw1[i] + norm2 * invw2[i]) / interpolated_w[i];
{
u[i] /= interpolated_w[i];
v[i] /= interpolated_w[i];
u[i] = u[i] - floor(u[i]);
v[i] = v[i] - floor(v[i]);
u[i] = u[i] * (src->x - 1);
v[i] = v[i] * (src->y - 1);
}
ui[i] = (S64)(u[i]);
vi[i] = (S64)(v[i]);
//F32 udiff = u - (F32)ui;
//F32 vdiff = v - (F32)vi;
// Origin UV (0,0) is in bottom left // Origin UV (0,0) is in bottom left
U32 *dst_pixel = destination + x[i]; U32 *dst_pixel = destination + x[i];
U32 *pixel = src->pixels + (ui[i] + (src->y - 1ll - vi[i]) * src->x); U32 *pixel = src->pixels + (ui[i] + (src->y - 1ll - vi[i]) * src->x);

17
vec.cpp
View File

@@ -7,21 +7,26 @@ union Vec8{
force_inline F32 &operator[](S64 i){ return e[i]; } force_inline F32 &operator[](S64 i){ return e[i]; }
}; };
Vec8 loadu8(void *m){ return {_mm256_loadu_ps((const float *)m)}; } force_inline Vec8 floor8(Vec8 v){ return {_mm256_floor_ps(v.simd)}; }
Vec8 vec8(F32 x){return {_mm256_set1_ps(x)}; } force_inline Vec8 loadu8(void *m){ return {_mm256_loadu_ps((const float *)m)}; }
Vec8 vec8(F32 a, F32 b, F32 c, F32 d, F32 e, F32 f, F32 g, F32 h){ return {_mm256_set_ps(h, g, f, e, d, c, b, a)}; } force_inline Vec8 vec8(F32 x){return {_mm256_set1_ps(x)}; }
// Builds a Vec8 from eight scalars; `a` lands in lane 0 and `h` in lane 7.
// _mm256_setr_ps takes its arguments in memory (lane) order, so they are
// passed in the same order as the parameters.
force_inline Vec8 vec8(F32 a, F32 b, F32 c, F32 d, F32 e, F32 f, F32 g, F32 h){
  return {_mm256_setr_ps(a, b, c, d, e, f, g, h)};
}
Vec8 operator+(Vec8 a, Vec8 b){ return {_mm256_add_ps(a.simd, b.simd)}; } Vec8 operator+(Vec8 a, Vec8 b){ return {_mm256_add_ps(a.simd, b.simd)}; }
Vec8 operator-(Vec8 a, Vec8 b){ return {_mm256_sub_ps(a.simd, b.simd)}; } Vec8 operator-(Vec8 a, Vec8 b){ return {_mm256_sub_ps(a.simd, b.simd)}; }
Vec8 operator*(Vec8 a, Vec8 b){ return {_mm256_mul_ps(a.simd, b.simd)}; } Vec8 operator*(Vec8 a, Vec8 b){ return {_mm256_mul_ps(a.simd, b.simd)}; }
Vec8 operator/(Vec8 a, Vec8 b){ return {_mm256_div_ps(a.simd, b.simd)}; } Vec8 operator/(Vec8 a, Vec8 b){ return {_mm256_div_ps(a.simd, b.simd)}; }
Vec8 operator+=(Vec8 &a, Vec8 b){ a = a + b; return a; }
Vec8 operator-=(Vec8 &a, Vec8 b){ a = a - b; return a; }
Vec8 operator>=(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_GE_OQ)}; } Vec8 operator>=(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_GE_OQ)}; }
Vec8 operator<=(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_LE_OQ)}; } Vec8 operator<=(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_LE_OQ)}; }
Vec8 operator<(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_LT_OQ)}; } Vec8 operator<(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_LT_OQ)}; }
Vec8 operator>(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_GT_OQ)}; } Vec8 operator>(Vec8 a, Vec8 b){ return {_mm256_cmp_ps(a.simd, b.simd, _CMP_GT_OQ)}; }
Vec8 operator&(Vec8 a, Vec8 b){ return {_mm256_and_ps(a.simd, b.simd)}; } Vec8 operator&(Vec8 a, Vec8 b){ return {_mm256_and_ps(a.simd, b.simd)}; }
// In-place lane-wise addition: stores a + b into `a` and returns the updated
// value by copy.
Vec8 operator+=(Vec8 &a, Vec8 b){
  a.simd = _mm256_add_ps(a.simd, b.simd);
  return a;
}
// In-place lane-wise subtraction: stores a - b into `a` and returns the
// updated value by copy.
Vec8 operator-=(Vec8 &a, Vec8 b){
  a.simd = _mm256_sub_ps(a.simd, b.simd);
  return a;
}
// In-place lane-wise multiplication: stores a * b into `a` and returns the
// updated value by copy.
Vec8 operator*=(Vec8 &a, Vec8 b){
  a.simd = _mm256_mul_ps(a.simd, b.simd);
  return a;
}
// In-place lane-wise division: stores a / b into `a` and returns the updated
// value by copy.
Vec8 operator/=(Vec8 &a, Vec8 b){
  a.simd = _mm256_div_ps(a.simd, b.simd);
  return a;
}
union Vec8I{ union Vec8I{
__m256i simd; __m256i simd;
S32 e[8]; S32 e[8];
@@ -36,3 +41,5 @@ Vec8I operator-(Vec8I a, Vec8I b){ return {_mm256_sub_epi32(a.simd, b.simd)}; }
// Lane-wise multiplication of eight signed 32-bit integers, keeping the low
// 32 bits of each product.
// _mm256_mullo_epi32 is used rather than _mm256_mul_epi32: the latter only
// multiplies the low 32-bit half of each 64-bit element and produces four
// 64-bit results, which does not match Vec8I's eight S32 lanes.
Vec8I operator*(Vec8I a, Vec8I b){ return {_mm256_mullo_epi32(a.simd, b.simd)}; }
// Vec8I operator/(Vec8I a, Vec8I b){ return {_mm256_div_epi32(a.simd, b.simd)}; } // Vec8I operator/(Vec8I a, Vec8I b){ return {_mm256_div_epi32(a.simd, b.simd)}; }
// In-place lane-wise addition: stores a + b into `a` and returns the updated
// value by copy. The sum must be assigned back to `a`; returning `a + b`
// alone would leave the left-hand operand unchanged.
Vec8I operator+=(Vec8I &a, Vec8I b){ a = a + b; return a; }
// Converts each of the eight float lanes to a signed 32-bit integer,
// truncating toward zero the way a scalar float-to-integer cast does.
// _mm256_cvttps_epi32 (note the extra 't') is used instead of
// _mm256_cvtps_epi32, which rounds using the current MXCSR rounding mode
// (round-to-nearest by default) and would give different results from a
// scalar cast for fractional parts of 0.5 and above.
Vec8I convert_vec8_to_vec8i(Vec8 v){ return Vec8I{_mm256_cvttps_epi32(v.simd)}; }