diff --git a/.gitignore b/.gitignore index ffd0c4f..b5f3698 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,6 @@ data.txt *.bin *.4c asset.log.txt -perfclocks.txt \ No newline at end of file +perfclocks* +*.ipynb +zmiany diff --git a/README.md b/README.md index 02d1a50..32cd2c4 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,27 @@ +# Realtime Software Renderer + +![screenshot1](assets/Screenshot1.png) +![screenshot2](assets/Screenshot2.png) + + +## Clipping + +There are 3 clipping stages, 2 clipping stages in 3D space against zfar and znear and 1 clipping +stage in 2D against left, bottom, right, top(2D image bounds). + +First the triangles get clipped against the zfar plane, +if a triangle has even one vertex outside the clipping region, the entire triangle gets cut. +So far I didn't have problems with that. It simplifies the computations and splitting triangles +on zfar seems like a waste of power. + +The second clipping stage is znear plane. Triangles get fully and nicely clipped against znear. +Every time a triangle gets partially outside the clipping region it gets cut to the znear and +either one or two new triangles get derived from the old one. + +Last clipping stage is performed in the 2D image space. Every triangle has a corresponding AABB +box. In this box every pixel gets tested to see if it's in the triangle. In this clipping stage +the box is clipped to the image metrics - 0, 0, width, height. ### Things to do: @@ -43,7 +66,7 @@ - [ ] Outlines - [ ] Lightning - [ ] Proper normal interpolation - * `https://hero.handmade.network/episode/code/day101/#105 + * https://hero.handmade.network/episode/code/day101/#105 - [ ] Phong - [x] diffuse - [x] ambient @@ -57,14 +80,15 @@ - [x] Simple profiling tooling - [x] Statistics based on profiler data - [x] Find cool profilers - ExtraSleepy, Vtune -- [ ] Optimizations - - [ ] Inline edge function - - [ ] Expand edge functions to more optimized version - - [ ] Test 4x2 bitmap layout? 
- - [ ] Edge function to integer - - [ ] Use integer bit operations to figure out if plus. (edge0|edge1|edge2)>=0 - - [ ] SIMD - - [ ] Multithreading +- [x] Optimizations + - [x] Inline edge function + - [x] Expand edge functions to more optimized version + - [-] Test 4x2 bitmap layout? + - [-] Edge function to integer + - [-] Use integer bit operations to figure out if plus. (edge0|edge1|edge2)>=0 + - [x] SIMD + - [x] Optimized SIMD + - [x] Multithreading - [x] Text rendering - [ ] UI @@ -76,32 +100,9 @@ - [x] Gamma correct alpha blending for rectangles and bitmaps - [ ] Plotting of profile data - [x] Simple scatter plot - - -### Urgent: - -- [ ] Simplify the code, especially for the 2d routines - [x] Asset processor as second program -## Clipping - -There are 3 clipping stages, 2 clipping stages in 3D space against zfar and znear and 1 clipping -stage in 2D against left, bottom, right, top(2D image bounds). - -First the triangles get clipped against the zfar plane, -if a triangle has even one vertex outside the clipping region, the entire triangle gets cut. -So far I didn't have problems with that. It simplifies the computations and splitting triangles -on zfar seems like a waste of power. - -The second clipping stage is znear plane. Triangles get fully and nicely clipped against znear. -Every time a triangle gets partially outside the clipping region it gets cut to the znear and -either one or two new triangles get derived from the old one. - -Last clipping stage is performed in the 2D image space. Every triangle has a corresponding AABB -box. In this box every pixel gets tested to see if it's in the triangle. In this clipping stage -the box is clipped to the image metrics - 0, 0, width, height. 
- ### Resources that helped me build the rasterizer (Might be helpful to you too): diff --git a/assets/Screenshot1.png b/assets/Screenshot1.png new file mode 100644 index 0000000..f105d7d Binary files /dev/null and b/assets/Screenshot1.png differ diff --git a/assets/Screenshot2.png b/assets/Screenshot2.png new file mode 100644 index 0000000..ac3dbf0 Binary files /dev/null and b/assets/Screenshot2.png differ diff --git a/main.cpp b/main.cpp index 7481238..835e5ae 100644 --- a/main.cpp +++ b/main.cpp @@ -78,12 +78,13 @@ /// - [x] Simple scatter plot /// - [x] Asset processor as second program -// #include "obj_dump.cpp" -#include "multimedia.cpp" -#include "obj.cpp" +#include "obj_dump.cpp" +// #include "multimedia.cpp" +// #include "obj.cpp" #include "vec.cpp" #include "work_queue.cpp" #define PROFILE_SCOPE(x) +#define MULTITHREADING 1 struct Vertex { Vec3 pos; @@ -296,7 +297,7 @@ F32 edge_function(Vec4 vecp0, Vec4 vecp1, Vec4 p) { #define S32x8 __m256i S32 render_triangle_test_case_number = 5; -S32 render_triangle_test_case_angle = 1; +S32 render_triangle_test_case_angle = -1; U64 filled_pixel_count; U64 filled_pixel_cycles; U64 triangle_count; @@ -635,7 +636,7 @@ void draw_mesh(Render *r, String scene_name, Obj_Material *materials, Obj_Mesh * Vec3 p0_to_p1 = vert[1].pos - vert[0].pos; Vec3 p0_to_p2 = vert[2].pos - vert[0].pos; Vec3 normal = normalize(cross(p0_to_p1, p0_to_p2)); - // Vec3 light_direction = mat4_rotation_x(light_rotation) * vec3(0, 0, 1); + Vec3 light_direction = mat4_rotation_x(light_rotation) * vec3(0, 0, 1); if (dot(normal, p0_to_camera) > 0) { //@Note: Backface culling /// ## Clipping @@ -731,6 +732,7 @@ void draw_mesh(Render *r, String scene_name, Obj_Material *materials, Obj_Mesh * triangle_count++; if (in_count > 3) triangle_count++; +#if MULTITHREADING Render_Command *command = array_alloc(os.perm_arena, &r->commands); command->src = image; command->p0 = in[0].pos; @@ -750,12 +752,12 @@ void draw_mesh(Render *r, String scene_name, 
Obj_Material *materials, Obj_Mesh * command->tex2 = in[3].tex; } -#if 0 +#else switch(render_triangle_test_case_number){ - case 0: break; case 1: draw_triangle_nearest_a(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm); if (in_count > 3) draw_triangle_nearest_a(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm); + break; case 2: draw_triangle_nearest_b(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[1].pos, in[2].pos, in[0].tex, in[1].tex, in[2].tex, in[0].norm, in[1].norm, in[2].norm); if (in_count > 3) draw_triangle_nearest_b(&r->screen320, r->depth320, image, light_direction, in[0].pos, in[2].pos, in[3].pos, in[0].tex, in[2].tex, in[3].tex, in[0].norm, in[2].norm, in[3].norm); @@ -797,8 +799,7 @@ UI_SIGNAL_CALLBACK(scene_callback) { } break; case Scene_Sponza: { speed = 100; - r.camera_pos = vec3(-228,94.5,-107); - r.camera_yaw = vec2(-1.25, 0.21); + r.camera_pos = vec3(-1020, 687, -85); r.camera_yaw = vec2(-1.3, -0.44); obj = sponza; } break; case Scene_Count: @@ -810,36 +811,49 @@ UI_SIGNAL_CALLBACK(scene_callback) { FILE *global_file; function void windows_log(Log_Kind kind, String string, char *file, int line){ - // fprintf(global_file, "%s", string.str); + /* global_file is NULL when fopen failed; skip logging instead of crashing */ if(global_file) fprintf(global_file, "%s", string.str); // OutputDebugStringA((const char *)string.str); } +function void +next_test_case(B32 first_time){ + render_triangle_test_case_number += 1; + if(first_time || render_triangle_test_case_number == 6){ + render_triangle_test_case_angle += 1; + render_triangle_test_case_number = 1; + try_again: switch(render_triangle_test_case_angle){ + case 0: r.camera_pos = vec3(-1020, 687, -85); r.camera_yaw = vec2(-1.3, -0.44); break; + case 1: r.camera_pos = vec3(-356,89.5,168); r.camera_yaw = vec2(0.2, 0); break; + case 2: render_triangle_test_case_angle = 0; 
goto try_again; break; + } + } +} + int main(int argc, char **argv) { global_file = fopen("perfclocks.txt", "a"); thread_ctx.log_proc = windows_log; - fprintf(global_file, "\n---------------------"); os.window_size.x = 1920; os.window_size.y = 1080; os.window_resizable = 1; assert(os_init()); Font font = os_load_font(os.perm_arena, 12*os.dpi_scale, "Arial", 0); - test_array_list(); + // test_array_list(); - f22 = load_obj_dump(os.perm_arena, "plane.bin"_s); - sponza = load_obj_dump(os.perm_arena, "sponza.bin"_s); - // Obj sponza_obj = load_obj(&os_process_heap, "assets/sponza/sponza.obj"_s); - // sponza = &sponza_obj; + // f22 = load_obj_dump(os.perm_arena, "plane.bin"_s); + // sponza = load_obj_dump(os.perm_arena, "sponza.bin"_s); + Obj sponza_obj = load_obj(&os_process_heap, "assets/sponza/sponza.obj"_s); + sponza = &sponza_obj; scene_callback(); + next_test_case(true); int screen_x = os.window_size.x; int screen_y = os.window_size.y; - r.camera_pos = vec3(-228,94.5,-107); - r.camera_yaw = vec2(-1.25, 0.21); r.screen320 = {(U32 *)arena_push_size(os.perm_arena, screen_x*screen_y*sizeof(U32)), screen_x, screen_y}; r.depth320 = (F32 *)arena_push_size(os.perm_arena, sizeof(F32) * screen_x * screen_y); + r.commands.block_size = 1024*1024; ThreadStartupInfo thread_infos[16] = {}; init_work_queue(&r.work_queue, buff_cap(thread_infos), thread_infos); @@ -911,8 +925,8 @@ main(int argc, char **argv) { draw_mesh(&r, obj->name, obj->materials.data, mesh+i, vertices, tex_coords, normals); } - - Render_Tile_Job_Data tile_job_data[16]; +#if MULTITHREADING + Render_Tile_Job_Data tile_job_data[32]; S32 x_tiles = 1; S32 y_tiles = 16; F32 block_size_x = r.screen320.x / x_tiles; @@ -935,7 +949,7 @@ main(int argc, char **argv) { wait_until_completion(&r.work_queue); array_free_all_nodes(&r.commands); - +#endif // @Note: Draw 320screen to OS screen U32* ptr = os.screen->pixels; @@ -952,30 +966,23 @@ main(int argc, char **argv) { ui_end_frame(os.screen, &ui, &font); frame_data = 
string_fmt(os.frame_arena, "FPS:%f dt:%f frame:%u camera_pos: %f %f %f camera_yaw: %f %f", os.fps, os.delta_time*1000, os.frame, r.camera_pos.x, r.camera_pos.y, r.camera_pos.z, r.camera_yaw.x, r.camera_yaw.y); - if(filled_pixel_count){ - raster_details = string_fmt(os.frame_arena, "\nAngle:%d Case:%d Cycle per pixel: %llu Cycles:%llu Pixels:%llu Triangles:%llu", - render_triangle_test_case_angle, render_triangle_test_case_number, filled_pixel_cycles/filled_pixel_count, filled_pixel_cycles, filled_pixel_count, triangle_count); +#if MULTITHREADING + if(os.frame == 1) log_info("Angle;Frame_Time\n"); + log_info("%d;%f\n", render_triangle_test_case_angle, os.delta_time*1000); +#else + if(os.frame == 1) log_info("Angle;Algorithm;Frame_Time;Cycles_Per_Pixel;Cycles_To_Process_Triangles;Pixels_Processed;Triangles\n"); + log_info("%d;%d;%f;%llu;%llu;%llu;%llu\n", render_triangle_test_case_angle, render_triangle_test_case_number, + os.delta_time*1000, /* report 0 cycles per pixel when no pixel was filled this frame instead of dividing by zero */ (filled_pixel_count ? filled_pixel_cycles/filled_pixel_count : 0), filled_pixel_cycles, filled_pixel_count, triangle_count); +#endif - filled_pixel_count = 0; - filled_pixel_cycles = 0; - triangle_count = 0; - } + filled_pixel_count = 0; + filled_pixel_cycles = 0; + triangle_count = 0; // @Todo I think there is bug with test_case_number, after doing full round it // skips a phase - if(os.frame % 60 == 0){ - continue; - render_triangle_test_case_number++; - if(render_triangle_test_case_number == 6){ - render_triangle_test_case_number = 0; - try_again: switch(render_triangle_test_case_angle){ - case 0: r.camera_pos = vec3(-228,94.5,-107); r.camera_yaw = vec2(-1.25, 0.21); break; - case 1: r.camera_pos = vec3(-356,89.5,168); r.camera_yaw = vec2(0.2, 0); break; - case 2: r.camera_pos = vec3(-1020, 687, -85); r.camera_yaw = vec2(-1.3, -0.44); break; - case 3: render_triangle_test_case_angle = 0; goto try_again; break; - } - render_triangle_test_case_angle += 1; - } + if(os.frame % 15 == 0){ + next_test_case(false); } } diff --git a/optimization_log.cpp 
b/optimization_log.cpp index 8590b1c..123f469 100644 --- a/optimization_log.cpp +++ b/optimization_log.cpp @@ -7,7 +7,6 @@ void draw_triangle_nearest_a(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec3 norm0, Vec3 norm1, Vec3 norm2) { if(src->pixels == 0) return; - U64 fill_pixels_begin = __rdtsc(); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); @@ -22,9 +21,10 @@ void draw_triangle_nearest_a(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l if (min_y >= max_y) return; if (min_x >= max_x) return; + U64 fill_pixels_begin = __rdtsc(); U32 *destination = dst->pixels + dst->x*min_y; - F32 area = (p1.y - p0.y) * (p2.x - p0.x) - (p1.x - p0.x) * (p2.y - p0.y); + F32 area = edge_function(p0, p1, p2); for (S64 y = min_y; y < max_y; y++) { for (S64 x = min_x; x < max_x; x++) { F32 Cx0 = edge_function(p0, p1, { (F32)x,(F32)y }); @@ -116,7 +116,6 @@ void draw_triangle_nearest_b(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec3 norm0, Vec3 norm1, Vec3 norm2) { if(src->pixels == 0) return; - U64 fill_pixels_begin = __rdtsc(); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); @@ -129,6 +128,7 @@ void draw_triangle_nearest_b(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 l if (min_y >= max_y) return; if (min_x >= max_x) return; + U64 fill_pixels_begin = __rdtsc(); F32 dy10 = (p1.y - p0.y); F32 dy21 = (p2.y - p1.y); @@ -246,7 +246,6 @@ void draw_triangle_bilinear(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 li Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec3 norm0, Vec3 norm1, Vec3 norm2) { if(src->pixels == 0) return; - U64 fill_pixels_begin = __rdtsc(); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); F32 max_x1 = (F32)(max(p0.x, max(p1.x, p2.x))); @@ -260,6 +259,7 @@ void draw_triangle_bilinear(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Vec3 li if (min_y >= 
max_y) return; if (min_x >= max_x) return; + U64 fill_pixels_begin = __rdtsc(); F32 area = edge_function(p0, p1, p2); for (S64 y = min_y; y < max_y; y++) { @@ -347,7 +347,6 @@ void draw_triangle_nearest_simd_with_overloads(Bitmap* dst, F32 *depth_buffer, B Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec3 norm0, Vec3 norm1, Vec3 norm2) { if(src->pixels == 0) return; - U64 fill_pixels_begin = __rdtsc(); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); @@ -362,6 +361,7 @@ void draw_triangle_nearest_simd_with_overloads(Bitmap* dst, F32 *depth_buffer, B if (min_y >= max_y) return; if (min_x >= max_x) return; + U64 fill_pixels_begin = __rdtsc(); F32 dy10 = (p1.y - p0.y); F32 dy21 = (p2.y - p1.y); F32 dy02 = (p0.y - p2.y); @@ -570,9 +570,6 @@ void draw_triangle_nearest_simd_without_overloads(Bitmap* dst, F32 *depth_buffer Vec2 tex0, Vec2 tex1, Vec2 tex2, Vec3 norm0, Vec3 norm1, Vec3 norm2) { if(src->pixels == 0) return; - U64 fill_pixels_begin = __rdtsc(); - - PROFILE_SCOPE(draw_triangle); F32 min_x1 = (F32)(min(p0.x, min(p1.x, p2.x))); F32 min_y1 = (F32)(min(p0.y, min(p1.y, p2.y))); @@ -587,6 +584,7 @@ void draw_triangle_nearest_simd_without_overloads(Bitmap* dst, F32 *depth_buffer if (min_y >= max_y) return; if (min_x >= max_x) return; + U64 fill_pixels_begin = __rdtsc(); F32 dy10 = (p1.y - p0.y); F32 dy21 = (p2.y - p1.y); F32 dy02 = (p0.y - p2.y); @@ -858,7 +856,6 @@ void draw_triangle_nearest_final(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Ve Vec3 norm0, Vec3 norm1, Vec3 norm2) { if(src->pixels == 0) return; - U64 fill_pixels_begin = __rdtsc(); F32 region_min_x = 0; F32 region_min_y = 0; @@ -878,6 +875,8 @@ void draw_triangle_nearest_final(Bitmap* dst, F32 *depth_buffer, Bitmap *src, Ve if (min_y >= max_y) return; if (min_x >= max_x) return; + U64 fill_pixels_begin = __rdtsc(); + F32 dy10 = (p1.y - p0.y); F32 dy21 = (p2.y - p1.y); F32 dy02 = (p0.y - p2.y);