diff --git a/build.bat b/build.bat
index e5932f0..ecc9c28 100644
--- a/build.bat
+++ b/build.bat
@@ -6,4 +6,4 @@ rem assets.exe
 rem tracy/TracyClient.cpp -DTRACY_ENABLE
 
 
-clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
+clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
diff --git a/main.cpp b/main.cpp
index feca0f8..a7ba6fd 100644
--- a/main.cpp
+++ b/main.cpp
@@ -396,56 +396,77 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
       indices.simd = _mm256_min_epi32(_mm256_set1_ps(size), indices.simd);
       indices.simd = _mm256_max_epi32(_mm256_set1_ps(0), indices.simd);
 
-      U32 pixel[8] = {};
-      if(should_fill[0]) pixel[0] = src->pixels[indices.e[0]];
-      if(should_fill[1]) pixel[1] = src->pixels[indices.e[1]];
-      if(should_fill[2]) pixel[2] = src->pixels[indices.e[2]];
-      if(should_fill[3]) pixel[3] = src->pixels[indices.e[3]];
-      if(should_fill[4]) pixel[4] = src->pixels[indices.e[4]];
-      if(should_fill[5]) pixel[5] = src->pixels[indices.e[5]];
-      if(should_fill[6]) pixel[6] = src->pixels[indices.e[6]];
-      if(should_fill[7]) pixel[7] = src->pixels[indices.e[7]];
-      // Vec8I *pixelv = (Vec8I *)pixel;
+      //
+      // Fetch and calculate texel values
+      //
+      Vec8I pixel;
+      if(should_fill[0]) pixel.e[0] = src->pixels[indices.e[0]];
+      if(should_fill[1]) pixel.e[1] = src->pixels[indices.e[1]];
+      if(should_fill[2]) pixel.e[2] = src->pixels[indices.e[2]];
+      if(should_fill[3]) pixel.e[3] = src->pixels[indices.e[3]];
+      if(should_fill[4]) pixel.e[4] = src->pixels[indices.e[4]];
+      if(should_fill[5]) pixel.e[5] = src->pixels[indices.e[5]];
+      if(should_fill[6]) pixel.e[6] = src->pixels[indices.e[6]];
+      if(should_fill[7]) pixel.e[7] = src->pixels[indices.e[7]];
 
-      // Vec8I texel_i_a = *pixelv & vec8i(0xff000000);
-      // Vec8I texel_i_b = *pixelv & vec8i(0x00ff0000);
-      // Vec8I texel_i_g = *pixelv & vec8i(0x0000ff00);
-      // Vec8I texel_i_r = *pixelv & vec8i(0x000000ff);
+      Vec8I texel_i_a = pixel & vec8i(0xff000000);
+      Vec8I texel_i_b = pixel & vec8i(0x00ff0000);
+      Vec8I texel_i_g = pixel & vec8i(0x0000ff00);
+      Vec8I texel_i_r = pixel & vec8i(0x000000ff);
 
-      // Vec8 texel_a = convert_vec8i_to_vec8(texel_i_a >> 24) / vec8(255.f);
-      // Vec8 texel_b = convert_vec8i_to_vec8(texel_i_b >> 16) / vec8(255.f);
-      // Vec8 texel_g = convert_vec8i_to_vec8(texel_i_g >> 8 ) / vec8(255.f);
-      // Vec8 texel_r = convert_vec8i_to_vec8(texel_i_r >> 0 ) / vec8(255.f);
+      // Alpha needs an extra step because >> on signed integer lanes is an arithmetic shift:
+      // when the sign bit is set (alpha >= 0x80) it is copied into every bit it is shifted through.
+      // So we shift first, then mask off everything except the low 8 bits.
+      texel_i_a = (texel_i_a >> 24);
+      texel_i_a = texel_i_a & vec8i(0x000000ff);
+      texel_i_b = (texel_i_b >> 16);
+      texel_i_g = (texel_i_g >> 8 );
+      texel_i_r = (texel_i_r >> 0 );
 
-      // texel_r = texel_r * texel_r;
-      // texel_g = texel_g * texel_g;
-      // texel_b = texel_b * texel_b;
+      Vec8 texel_a = convert_vec8i_to_vec8(texel_i_a);
+      Vec8 texel_b = convert_vec8i_to_vec8(texel_i_b);
+      Vec8 texel_g = convert_vec8i_to_vec8(texel_i_g);
+      Vec8 texel_r = convert_vec8i_to_vec8(texel_i_r);
+
+      Vec8 v255 = vec8(255.f);
+      texel_a = texel_a / v255;
+      texel_b = texel_b / v255;
+      texel_g = texel_g / v255;
+      texel_r = texel_r / v255;
+
+      texel_r = texel_r * texel_r;
+      texel_g = texel_g * texel_g;
+      texel_b = texel_b * texel_b;
+
+      //
+      // Fetch and calculate dst pixels
+      //
+      U32 *dst_memory = destination + x8;
+      Vec8I dst_pixel = {_mm256_maskload_epi32((const int *)dst_memory, should_fill.simd)};
+
+      Vec8I dst_i_a = dst_pixel & vec8i(0xff000000);
+      Vec8I dst_i_b = dst_pixel & vec8i(0x00ff0000);
+      Vec8I dst_i_g = dst_pixel & vec8i(0x0000ff00);
+      Vec8I dst_i_r = dst_pixel & vec8i(0x000000ff);
+
+      dst_i_a = dst_i_a >> 24;
+      dst_i_a = dst_i_a &  vec8i(0x000000ff);
+      dst_i_b = dst_i_b >> 16 ;
+      dst_i_g = dst_i_g >> 8;
+
+      Vec8 dst_a = convert_vec8i_to_vec8(dst_i_a) / vec8(255);
+      Vec8 dst_b = convert_vec8i_to_vec8(dst_i_b) / vec8(255);
+      Vec8 dst_g = convert_vec8i_to_vec8(dst_i_g) / vec8(255);
+      Vec8 dst_r = convert_vec8i_to_vec8(dst_i_r) / vec8(255);
+
+      dst_r *= dst_r;
+      dst_g *= dst_g;
+      dst_b *= dst_b;
 
-      U32 *dst_pixel = destination + x8;
       for(S64 i = 0; i < 8; i++){
         if (should_fill[i]){
-          Vec4 result_color;// = {texel_r[i], texel_g[i], texel_b[i], texel_a[i]};
-           {
-            U32 c = pixel[i];
-            F32 a = ((c & 0xff000000) >> 24) / 255.f;
-            F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
-            F32 g = ((c & 0x0000ff00) >> 8)  / 255.f;
-            F32 r = ((c & 0x000000ff) >> 0)  / 255.f;
-            r*=r;
-            g*=g;
-            b*=b;
-            result_color = { r,g,b,a };
-          }
-
-          Vec4 dst_color; {
-            U32 c = dst_pixel[i];
-            F32 a = ((c & 0xff000000) >> 24) / 255.f;
-            F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
-            F32 g = ((c & 0x0000ff00) >> 8)  / 255.f;
-            F32 r = ((c & 0x000000ff) >> 0)  / 255.f;
-            r*=r; g*=g; b*=b;
-            dst_color = { r,g,b,a };
-          }
+          Vec4 result_color = {texel_r[i], texel_g[i], texel_b[i], texel_a[i]};
+          Vec4 dst_color = {dst_r[i], dst_g[i], dst_b[i], dst_a[i]};
 
 #if 0
           Vec3 light_color = vec3(0.8,0.8,1);
@@ -480,7 +501,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
             color32 = (U32)(alpha << 24 | blue << 16 | green << 8 | red << 0);
           }
 
-          dst_pixel[i] = color32;
+          dst_memory[i] = color32;
         }
       }