diff --git a/build.bat b/build.bat
index e5932f0..ecc9c28 100644
--- a/build.bat
+++ b/build.bat
@@ -6,4 +6,4 @@ rem assets.exe
 rem tracy/TracyClient.cpp -DTRACY_ENABLE
 
 
-clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
+clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
diff --git a/main.cpp b/main.cpp
index f7f7959..a853444 100644
--- a/main.cpp
+++ b/main.cpp
@@ -296,6 +296,7 @@ U64 filled_pixel_total_time;
 #define Is(x,i) (((S32 *)&x)[i])
 typedef __m256  F32x8;
 typedef __m256i S32x8;
+
 function
 void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 light_direction,
                            Vec4 p0,   Vec4 p1,   Vec4 p2,
@@ -336,14 +337,14 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
 
   F32x8 var255 = _mm256_set1_ps(255);
   F32x8 var0 = _mm256_set1_ps(0);
+  F32x8 var1 = _mm256_set1_ps(1);
   F32x8 var_max_x = _mm256_set1_ps(max_x);
   F32x8 var07 = _mm256_set_ps(7,6,5,4,3,2,1,0);
 
-  Vec8 var1 = vec8(1);
-  Vec8 var1_8 = vec8(1,2,3,4,5,6,7,8);
-  Vec8 Dy10 = vec8(dy10) * var1_8;
-  Vec8 Dy21 = vec8(dy21) * var1_8;
-  Vec8 Dy02 = vec8(dy02) * var1_8;
+  F32x8 var_1_8 = _mm256_set_ps(8,7,6,5,4,3,2,1);
+  F32x8 Dy10 = _mm256_mul_ps(_mm256_set1_ps(dy10), var_1_8);
+  F32x8 Dy21 = _mm256_mul_ps(_mm256_set1_ps(dy21), var_1_8);
+  F32x8 Dy02 = _mm256_mul_ps(_mm256_set1_ps(dy02), var_1_8);
 
   F32x8 var_src_x_minus_one = _mm256_set1_ps(src->x-1);
   F32x8 var_src_y_minus_one = _mm256_set1_ps(src->y-1);
@@ -355,6 +356,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
   S32x8 var_0x0000ff00 = _mm256_set1_epi32(0x0000ff00);
   S32x8 var_0x000000ff = _mm256_set1_epi32(0x000000ff);
 
+  F32x8 var_255 = _mm256_set1_ps(255);
   F32x8 var_tex0x = _mm256_set1_ps(tex0.x);
   F32x8 var_tex1x = _mm256_set1_ps(tex1.x);
   F32x8 var_tex2x = _mm256_set1_ps(tex2.x);
@@ -382,16 +384,13 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
     for (S64 x8 = min_x; x8 < max_x; x8+=8) {
       {
         F32x8 i0 = _mm256_set1_ps(I(Cx0, 7));
-        F32x8 i1 = _mm256_add_ps(i0, Dy10.simd);
-        Cx0 = {i1};
+        Cx0 = _mm256_add_ps(i0, Dy10);
 
         F32x8 i2 = _mm256_set1_ps(I(Cx1, 7));
-        F32x8 i3 = _mm256_add_ps(i2, Dy21.simd);
-        Cx1 = {i3};
+        Cx1 = _mm256_add_ps(i2, Dy21);
 
         F32x8 i4 = _mm256_set1_ps(I(Cx2, 7));
-        F32x8 i5 = _mm256_add_ps(i4, Dy02.simd);
-        Cx2 = {i5};
+        Cx2 = _mm256_add_ps(i4, Dy02);
       }
 
 
@@ -477,92 +476,92 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
       if(I(should_fill, 6)) Is(pixel, 6) = src->pixels[Is(indices, 6)];
       if(I(should_fill, 7)) Is(pixel, 7) = src->pixels[Is(indices, 7)];
 
-      Vec8I texel_i_a = {_mm256_and_si256(pixel, var_0xff000000)};
-      Vec8I texel_i_b = {_mm256_and_si256(pixel, var_0x00ff0000)};
-      Vec8I texel_i_g = {_mm256_and_si256(pixel, var_0x0000ff00)};
-      Vec8I texel_i_r = {_mm256_and_si256(pixel, var_0x000000ff)};
+      S32x8 texel_i_a = _mm256_and_si256(pixel, var_0xff000000);
+      S32x8 texel_i_b = _mm256_and_si256(pixel, var_0x00ff0000);
+      S32x8 texel_i_g = _mm256_and_si256(pixel, var_0x0000ff00);
+      S32x8 texel_i_r = _mm256_and_si256(pixel, var_0x000000ff);
 
       // Alpha is done this way because signed integer shift is weird
       // When sign bit is set it sets all bits that we shift the sign through
       // So first we shift
-      texel_i_a = (texel_i_a >> 24);
-      texel_i_a = texel_i_a & vec8i(0x000000ff);
-      texel_i_b = (texel_i_b >> 16);
-      texel_i_g = (texel_i_g >> 8 );
-      texel_i_r = (texel_i_r >> 0 );
+      texel_i_a = _mm256_srai_epi32(texel_i_a, 24);
+      texel_i_a = _mm256_and_si256(texel_i_a, var_0x000000ff);
+      texel_i_b = _mm256_srai_epi32(texel_i_b, 16);
+      texel_i_g = _mm256_srai_epi32(texel_i_g, 8 );
+      texel_i_r = _mm256_srai_epi32(texel_i_r, 0 );
 
-      Vec8 texel_a = convert_vec8i_to_vec8(texel_i_a);
-      Vec8 texel_b = convert_vec8i_to_vec8(texel_i_b);
-      Vec8 texel_g = convert_vec8i_to_vec8(texel_i_g);
-      Vec8 texel_r = convert_vec8i_to_vec8(texel_i_r);
+      F32x8 texel_a0 = _mm256_cvtepi32_ps(texel_i_a);
+      F32x8 texel_b0 = _mm256_cvtepi32_ps(texel_i_b);
+      F32x8 texel_g0 = _mm256_cvtepi32_ps(texel_i_g);
+      F32x8 texel_r0 = _mm256_cvtepi32_ps(texel_i_r);
 
-      Vec8 v255 = vec8(255.f);
-      texel_a = texel_a / v255;
-      texel_b = texel_b / v255;
-      texel_g = texel_g / v255;
-      texel_r = texel_r / v255;
+      F32x8 texel_a1 = _mm256_div_ps(texel_a0, var_255);
+      F32x8 texel_b1 = _mm256_div_ps(texel_b0, var_255);
+      F32x8 texel_g1 = _mm256_div_ps(texel_g0, var_255);
+      F32x8 texel_r1 = _mm256_div_ps(texel_r0, var_255);
 
-      texel_r = texel_r * texel_r;
-      texel_g = texel_g * texel_g;
-      texel_b = texel_b * texel_b;
+      texel_r1 = _mm256_mul_ps(texel_r1, texel_r1);
+      texel_g1 = _mm256_mul_ps(texel_g1, texel_g1);
+      texel_b1 = _mm256_mul_ps(texel_b1, texel_b1);
 
       //
       // Fetch and calculate dst pixels
       //
       U32 *dst_memory = destination + x8;
-      Vec8I dst_pixel = {_mm256_maskload_epi32((const int *)dst_memory, should_fill)};
+      S32x8 dst_pixel = _mm256_maskload_epi32((const int *)dst_memory, should_fill);
 
-      Vec8I dst_i_a = dst_pixel & vec8i(0xff000000);
-      Vec8I dst_i_b = dst_pixel & vec8i(0x00ff0000);
-      Vec8I dst_i_g = dst_pixel & vec8i(0x0000ff00);
-      Vec8I dst_i_r = dst_pixel & vec8i(0x000000ff);
+      S32x8 dst_i_a0 = _mm256_and_si256(dst_pixel, var_0xff000000);
+      S32x8 dst_i_b0 = _mm256_and_si256(dst_pixel, var_0x00ff0000);
+      S32x8 dst_i_g0 = _mm256_and_si256(dst_pixel, var_0x0000ff00);
+      S32x8 dst_i_r0 = _mm256_and_si256(dst_pixel, var_0x000000ff);
 
-      dst_i_a = dst_i_a >> 24;
-      dst_i_a = dst_i_a &  vec8i(0x000000ff);
-      dst_i_b = dst_i_b >> 16 ;
-      dst_i_g = dst_i_g >> 8;
+      S32x8 dst_i_a1 = _mm256_srai_epi32(dst_i_a0, 24);
+      dst_i_a1 = _mm256_and_si256(dst_i_a1, var_0x000000ff);
+      S32x8 dst_i_b1 = _mm256_srai_epi32(dst_i_b0, 16);
+      S32x8 dst_i_g1 = _mm256_srai_epi32(dst_i_g0, 8);
+      S32x8 dst_i_r1 = dst_i_r0;
 
-      Vec8 dst_a = convert_vec8i_to_vec8(dst_i_a);
-      Vec8 dst_b = convert_vec8i_to_vec8(dst_i_b);
-      Vec8 dst_g = convert_vec8i_to_vec8(dst_i_g);
-      Vec8 dst_r = convert_vec8i_to_vec8(dst_i_r);
+      F32x8 dst_a = _mm256_cvtepi32_ps(dst_i_a1);
+      F32x8 dst_b = _mm256_cvtepi32_ps(dst_i_b1);
+      F32x8 dst_g = _mm256_cvtepi32_ps(dst_i_g1);
+      F32x8 dst_r = _mm256_cvtepi32_ps(dst_i_r1);
 
-      dst_a.simd = _mm256_div_ps(dst_a.simd, var255);
-      dst_b.simd = _mm256_div_ps(dst_b.simd, var255);
-      dst_g.simd = _mm256_div_ps(dst_g.simd, var255);
-      dst_r.simd = _mm256_div_ps(dst_r.simd, var255);
+      dst_a = _mm256_div_ps(dst_a, var255);
+      dst_b = _mm256_div_ps(dst_b, var255);
+      dst_g = _mm256_div_ps(dst_g, var255);
+      dst_r = _mm256_div_ps(dst_r, var255);
 
-      dst_r *= dst_r;
-      dst_g *= dst_g;
-      dst_b *= dst_b;
+      dst_r = _mm256_mul_ps(dst_r, dst_r);
+      dst_g = _mm256_mul_ps(dst_g, dst_g);
+      dst_b = _mm256_mul_ps(dst_b, dst_b);
 
       // Premultiplied alpha
       {
-        dst_r = texel_r + ((var1-texel_a) * dst_r);
-        dst_g = texel_g + ((var1-texel_a) * dst_g);
-        dst_b = texel_b + ((var1-texel_a) * dst_b);
-        dst_a = texel_a + dst_a - texel_a*dst_a;
+        dst_r = _mm256_add_ps(texel_r1, _mm256_mul_ps(_mm256_sub_ps(var1,texel_a1), dst_r));
+        dst_g = _mm256_add_ps(texel_g1, _mm256_mul_ps(_mm256_sub_ps(var1,texel_a1), dst_g));
+        dst_b = _mm256_add_ps(texel_b1, _mm256_mul_ps(_mm256_sub_ps(var1,texel_a1), dst_b));
+        dst_a = _mm256_sub_ps(_mm256_add_ps(texel_a1, dst_a), _mm256_mul_ps(texel_a1,dst_a));
       }
 
       // Almost linear to srgb
       {
-        dst_r.simd = {_mm256_sqrt_ps(dst_r.simd)};
-        dst_g.simd = {_mm256_sqrt_ps(dst_g.simd)};
-        dst_b.simd = {_mm256_sqrt_ps(dst_b.simd)};
+        dst_r = _mm256_sqrt_ps(dst_r);
+        dst_g = _mm256_sqrt_ps(dst_g);
+        dst_b = _mm256_sqrt_ps(dst_b);
       }
 
-      Vec8I result;
+      S32x8 result;
       for(S64 i = 0; i < 8; i++){
         if (I(should_fill, i)){
             U8 red     = (U8)(dst_r[i] * 255);
             U8 green   = (U8)(dst_g[i] * 255);
             U8 blue    = (U8)(dst_b[i] * 255);
             U8 alpha   = (U8)(dst_a[i] * 255);
-            result.e[i] = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0);
+            Is(result, i) = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0);
         }
       }
 
-      _mm256_maskstore_epi32((int *)dst_memory, should_fill, result.simd);
+      _mm256_maskstore_epi32((int *)dst_memory, should_fill, result);
 
     }
     Cy0 -= dx10;