diff --git a/build.bat b/build.bat
index e5932f0..ecc9c28 100644
--- a/build.bat
+++ b/build.bat
@@ -6,4 +6,4 @@ rem assets.exe
 rem tracy/TracyClient.cpp -DTRACY_ENABLE
 
 
-clang main.cpp -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
+clang main.cpp -O2 -mavx2 -Wall -Wno-unused-function -Wno-missing-braces -fno-exceptions -fdiagnostics-absolute-paths -g -I".." -o main.exe -Wl,user32.lib -Wl,optick\lib\x64\release\OptickCore.lib
diff --git a/main.cpp b/main.cpp
index 95f515d..1a2678c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -331,7 +331,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
   Vec8 Dy10 = vec8(dy10) * var07;
   Vec8 Dy21 = vec8(dy21) * var07;
   Vec8 Dy02 = vec8(dy02) * var07;
-  Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w, should_fill;
+  Vec8 w0, w1, w2, invw0, invw1, invw2, u, v, interpolated_w;
   Vec8I ui, vi;
 
   U32 *destination = dst->pixels + dst->x*min_y;
@@ -349,7 +349,15 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
       Cx2 = vec8(Cx2[7]) + Dy02;
 
 
-      should_fill = Cx0 >= vec8(0) & Cx1 >= vec8(0) & Cx2 >= vec8(0);
+
+      Vec8 should_fill;
+      {
+        Vec8 a = (vec8(x8) + var07);
+        Vec8 b = vec8(max_x);
+        should_fill = a < b;
+        should_fill = should_fill & (Cx0 >= vec8(0) & Cx1 >= vec8(0) & Cx2 >= vec8(0));
+      }
+
       w0 = Cx1 / area8;
       w1 = Cx2 / area8;
       w2 = Cx0 / area8;
@@ -362,6 +370,7 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
       Vec8 depth = loadu8(depth_pointer);
       should_fill = should_fill & (depth < interpolated_w);
 
+
       invw0 = (w0 / vec8(p0.w));
       invw1 = (w1 / vec8(p1.w));
       invw2 = (w2 / vec8(p2.w));
@@ -378,17 +387,26 @@ void draw_triangle_nearest(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 lig
       vi = convert_vec8_to_vec8i(v);
 
       // Origin UV (0,0) is in bottom left
-      U32 *dst_pixel = destination + x8;
+      _mm256_maskstore_epi32((int *)depth_pointer, should_fill.simd, interpolated_w.simd);
+      Vec8I indices = ui + ((vec8i(src->y) - vec8i(1) - vi) * vec8i(src->x));
+      U32 *pixel[8] = {
+        src->pixels + indices.e[0],
+        src->pixels + indices.e[1],
+        src->pixels + indices.e[2],
+        src->pixels + indices.e[3],
+        src->pixels + indices.e[4],
+        src->pixels + indices.e[5],
+        src->pixels + indices.e[6],
+        src->pixels + indices.e[7],
+      };
 
+      U32 *dst_pixel = destination + x8;
       for(S64 i = 0; i < 8; i++){
         if (should_fill[i]){
           PROFILE_SCOPE(fill_triangle_after_depth_test);
-          depth_pointer[i] = interpolated_w[i];
-
-          U32 *pixel = src->pixels + (ui[i] + (src->y - 1ll - vi[i]) * src->x);
 
           Vec4 result_color; {
-            U32 c = *pixel;
+            U32 c = *pixel[i];
             F32 a = ((c & 0xff000000) >> 24) / 255.f;
             F32 b = ((c & 0x00ff0000) >> 16) / 255.f;
             F32 g = ((c & 0x0000ff00) >> 8)  / 255.f;
@@ -652,7 +670,7 @@ main(int argc, char **argv) {
   os.window_size.y = 720;
   os.window_resizable = 1;
   assert(os_init());
-  Font font = os_load_font(os.perm_arena, 16, "Arial", 0);
+  Font font = os_load_font(os.perm_arena, 12*os.dpi_scale, "Arial", 0);
 
   f22 = load_obj_dump(os.perm_arena, "plane.bin"_s);
   sponza = load_obj_dump(os.perm_arena, "sponza.bin"_s);
diff --git a/vec.cpp b/vec.cpp
index 5fa477b..75d9dcb 100644
--- a/vec.cpp
+++ b/vec.cpp
@@ -36,10 +36,15 @@ union Vec8I{
 
 Vec8I vec8i(S32 x){return {_mm256_set1_epi32(x)}; }
 Vec8I vec8i(S32 a, S32 b, S32 c, S32 d, S32 e, S32 f, S32 g, S32 h){ return {_mm256_set_epi32(h, g, f, e, d, c, b, a)}; }
+Vec8I operator>(Vec8I a, Vec8I b){
+  return {_mm256_cmpgt_epi32(a.simd, b.simd)}; // signed per-lane compare: a lane is all ones where a > b, otherwise zero
+}
 Vec8I operator+(Vec8I a, Vec8I b){ return {_mm256_add_epi32(a.simd, b.simd)}; }
 Vec8I operator-(Vec8I a, Vec8I b){ return {_mm256_sub_epi32(a.simd, b.simd)}; }
-Vec8I operator*(Vec8I a, Vec8I b){ return {_mm256_mul_epi32(a.simd, b.simd)}; }
+Vec8I operator*(Vec8I a, Vec8I b){
+  return {_mm256_mullo_epi32(a.simd, b.simd)};  // mullo keeps the low 32 bits of each of the 8 lane products; _mm256_mul_epi32 multiplies only the even 32-bit lanes into four 64-bit results
+}
 // Vec8I operator/(Vec8I a, Vec8I b){ return {_mm256_div_epi32(a.simd, b.simd)}; }
 Vec8I operator+=(Vec8I &a, Vec8I b){ return a + b; }
 
-Vec8I convert_vec8_to_vec8i(Vec8 v){ return Vec8I{_mm256_cvtps_epi32(v.simd)}; }
+Vec8I convert_vec8_to_vec8i(Vec8 v){ return Vec8I{_mm256_cvtps_epi32(v.simd)}; }
\ No newline at end of file