/*
 * Monochrome ("MONO") ASCII rendering benchmark body.
 *
 * NOTE(review): this span is a fragment. The function header, closing braces,
 * the timer-start assignments and the render calls inside the timed loops are
 * on lines that are not visible here; the comments below describe only what
 * the visible statements do.
 *
 * Visible flow: compute the pixel count, allocate scratch buffers and a test
 * image, fill them with a noisy gradient, time each renderer variant over
 * adaptive_iterations runs, then pick the fastest variant.
 */
336 simd_benchmark_t result = {0};
// width * height is computed as size_t with an explicit overflow check.
340 if (checked_size_mul((
size_t)width, (
size_t)height, &pixel_count) != ASCIICHAT_OK) {
341 log_error(
"Image dimensions %d x %d too large (overflow)", width, height);
346 rgb_pixel_t *test_pixels;
// One rgb_pixel_t per pixel for the generated input; output_buffer is
// requested with pixel_count as its size argument.
348 test_pixels = SAFE_CALLOC_SIMD(pixel_count,
sizeof(rgb_pixel_t), rgb_pixel_t *);
349 output_buffer = SAFE_MALLOC(pixel_count,
char *);
352 image_t *test_image =
image_new(width, height);
// Presumably the image_new() failure path (the guarding condition is on a
// hidden line): both scratch buffers are released.
354 SAFE_FREE(test_pixels);
355 SAFE_FREE(output_buffer);
360 printf(
"Using synthetic gradient data for consistent benchmarking\n");
362 for (
size_t i = 0; i < pixel_count; i++) {
// Gradient base: red scales with x, green with y, blue with x + y (capped
// at 127). x and y are derived from i on lines not visible here.
366 int base_r = (x * 255) / width;
367 int base_g = (y * 255) / height;
368 int base_b = ((x + y) * 127) / (width + height);
// Add rand() noise in the range [-16, 15] to each channel.
371 int temp_r = base_r + (rand() % 32 - 16);
372 int temp_g = base_g + (rand() % 32 - 16);
373 int temp_b = base_b + (rand() % 32 - 16);
// clamp_rgb() presumably limits each value to the 0..255 channel range.
375 test_pixels[i].r = clamp_rgb(temp_r);
376 test_pixels[i].g = clamp_rgb(temp_g);
377 test_pixels[i].b = clamp_rgb(temp_b);
// Copy the generated pixels into the image that the renderers consume.
381 memcpy(test_image->pixels, test_pixels, pixel_count *
sizeof(rgb_pixel_t));
// The iteration count is derived from pixel_count and the constant 10.0
// (its meaning is defined by calculate_adaptive_iterations, not shown);
// the message states that the caller-supplied iteration count is ignored.
384 int adaptive_iterations = calculate_adaptive_iterations(pixel_count, 10.0);
385 printf(
"Benchmarking MONO %dx%d (%zu pixels) using %d adaptive iterations (ignoring passed iterations)...\n", width,
386 height, pixel_count, adaptive_iterations);
389 ensure_default_palette_ready();
// Each timed loop below frees the string produced in that iteration (the
// render call is on a hidden line) and then stores the mean seconds per
// iteration for that variant.
391 for (
int i = 0; i < adaptive_iterations; i++) {
394 SAFE_FREE(result_str);
396 result.scalar_time = (time_ns_to_s(
time_get_ns()) - start_mono) / adaptive_iterations;
402 for (
int i = 0; i < adaptive_iterations; i++) {
405 SAFE_FREE(result_str);
407 result.sse2_time = (time_ns_to_s(
time_get_ns()) - start_sse2) / adaptive_iterations;
410#if SIMD_SUPPORT_SSSE3
414 for (
int i = 0; i < adaptive_iterations; i++) {
417 SAFE_FREE(result_str);
419 result.ssse3_time = (time_ns_to_s(
time_get_ns()) - start_ssse3) / adaptive_iterations;
426 for (
int i = 0; i < adaptive_iterations; i++) {
429 SAFE_FREE(result_str);
431 result.avx2_time = (time_ns_to_s(
time_get_ns()) - start_avx2) / adaptive_iterations;
439 for (
int i = 0; i < adaptive_iterations; i++) {
442 SAFE_FREE(result_str);
444 result.neon_time = (time_ns_to_s(
time_get_ns()) - start_neon) / adaptive_iterations;
// No SVE timing loop is visible in this function; the field is set to 0.0.
449 result.sve_time = 0.0;
// Best-variant selection: start from scalar and switch to any variant that
// has a non-zero time strictly below the current best.
453 double best_time = result.scalar_time;
454 result.best_method =
"scalar";
457 if (result.sse2_time > 0 && result.sse2_time < best_time) {
458 best_time = result.sse2_time;
459 result.best_method =
"SSE2";
463#if SIMD_SUPPORT_SSSE3
464 if (result.ssse3_time > 0 && result.ssse3_time < best_time) {
465 best_time = result.ssse3_time;
466 result.best_method =
"SSSE3";
471 if (result.avx2_time > 0 && result.avx2_time < best_time) {
472 best_time = result.avx2_time;
473 result.best_method =
"AVX2";
478 if (result.neon_time > 0 && result.neon_time < best_time) {
479 best_time = result.neon_time;
480 result.best_method =
"NEON";
// NOTE(review): speedup_best is computed before the SVE comparison that
// follows, so an SVE win would change best_method without updating
// speedup_best. With sve_time fixed at 0.0 above that branch cannot fire in
// the visible code, but the ordering is fragile.
484 result.speedup_best = result.scalar_time / best_time;
487 if (result.sve_time > 0 && result.sve_time < best_time) {
488 best_time = result.sve_time;
489 result.best_method =
"SVE";
// Release the scratch buffers; whether test_image is destroyed is decided on
// lines not visible here.
495 SAFE_FREE(test_pixels);
496 SAFE_FREE(output_buffer);
/*
 * Color ASCII rendering benchmark body using synthetic gradient data.
 *
 * NOTE(review): this span is a fragment. The function header, closing braces,
 * the timer-start assignments and the scalar render call are on lines that
 * are not visible here; the comments below describe only the visible
 * statements.
 *
 * Visible flow: compute the pixel count, allocate buffers and a frame, fill
 * the pixel buffer with a noisy gradient, point the frame at that buffer,
 * time each renderer variant over `iterations` runs, then pick the fastest.
 */
502 simd_benchmark_t result = {0};
// width * height is computed as size_t with an explicit overflow check.
506 if (checked_size_mul((
size_t)width, (
size_t)height, &pixel_count) != ASCIICHAT_OK) {
507 log_error(
"Image dimensions %d x %d too large (overflow)", width, height);
// Output size estimate: 30 bytes per pixel plus 10 bytes per column of width.
// The rationale for these constants is not visible here (presumably
// worst-case escape-sequence length - TODO confirm).
// NOTE(review): unlike width * height above, this multiplication and
// addition are not overflow-checked.
513 size_t output_buffer_size = pixel_count * 30 + (size_t)width * 10;
516 rgb_pixel_t *test_pixels;
518 test_pixels = SAFE_CALLOC_SIMD(pixel_count,
sizeof(rgb_pixel_t), rgb_pixel_t *);
519 output_buffer = SAFE_MALLOC(output_buffer_size,
char *);
522 image_t *frame =
image_new(width, height);
// Presumably the image_new() failure path (the guarding condition is on a
// hidden line): both scratch buffers are released.
524 SAFE_FREE(test_pixels);
525 SAFE_FREE(output_buffer);
530 printf(
"Using coherent gradient data for realistic color testing\n");
532 for (
size_t i = 0; i < pixel_count; i++) {
// Gradient base: red scales with x, green with y, blue with x + y (capped
// at 127). x and y are derived from i on lines not visible here.
536 int base_r = (x * 255) / width;
537 int base_g = (y * 255) / height;
538 int base_b = ((x + y) * 127) / (width + height);
// Add rand() noise in the range [-16, 15] to each channel.
541 int temp_r = base_r + (rand() % 32 - 16);
542 int temp_g = base_g + (rand() % 32 - 16);
543 int temp_b = base_b + (rand() % 32 - 16);
// clamp_rgb() presumably limits each value to the 0..255 channel range.
545 test_pixels[i].r = clamp_rgb(temp_r);
546 test_pixels[i].g = clamp_rgb(temp_g);
547 test_pixels[i].b = clamp_rgb(temp_b);
// The frame borrows test_pixels instead of copying into its own buffer.
// NOTE(review): this overwrites the pointer image_new() stored in
// frame->pixels. Unless a hidden line releases that buffer first, it becomes
// unreachable here (leak) - verify against the full function.
551 frame->pixels = test_pixels;
553 const char *mode_str = background_mode ?
"background" :
"foreground";
554 printf(
"Benchmarking COLOR %s %dx%d (%zu pixels) x %d iterations...\n", mode_str, width, height, pixel_count,
// Timings in this function are total elapsed seconds over all iterations
// (not divided by the count). Every variant runs the same `iterations`
// count, so the ratio stored in speedup_best is unaffected.
559 for (
int i = 0; i < iterations; i++) {
562 SAFE_FREE(result_str);
564 result.scalar_time = time_ns_to_s(
time_get_ns()) - start;
// The SIMD variants below pass a hard-coded `true` as the third argument and
// free the returned string after every call.
569 for (
int i = 0; i < iterations; i++) {
570 char *ascii_output = render_ascii_sse2_unified_optimized(frame, background_mode,
true,
DEFAULT_ASCII_PALETTE);
572 SAFE_FREE(ascii_output);
574 result.sse2_time = time_ns_to_s(
time_get_ns()) - start;
577#if SIMD_SUPPORT_SSSE3
580 for (
int i = 0; i < iterations; i++) {
581 char *ascii_output = render_ascii_ssse3_unified_optimized(frame, background_mode,
true,
DEFAULT_ASCII_PALETTE);
583 SAFE_FREE(ascii_output);
585 result.ssse3_time = time_ns_to_s(
time_get_ns()) - start;
591 for (
int i = 0; i < iterations; i++) {
592 char *ascii_output = render_ascii_avx2_unified_optimized(frame, background_mode,
true,
DEFAULT_ASCII_PALETTE);
594 SAFE_FREE(ascii_output);
596 result.avx2_time = time_ns_to_s(
time_get_ns()) - start;
602 for (
int i = 0; i < iterations; i++) {
// The NEON path builds a stack image_t view over test_pixels on every
// iteration instead of passing `frame`.
604 image_t temp_image = {.pixels = test_pixels, .w = width, .h = height, .alloc_method = IMAGE_ALLOC_SIMD};
605 char *ascii_output = render_ascii_neon_unified_optimized(&temp_image, background_mode,
true,
DEFAULT_ASCII_PALETTE);
607 SAFE_FREE(ascii_output);
609 result.neon_time = time_ns_to_s(
time_get_ns()) - start;
// Best-variant selection: start from scalar and switch to any variant that
// has a non-zero time strictly below the current best.
613 double best_time = result.scalar_time;
614 result.best_method =
"scalar";
617 if (result.sse2_time > 0 && result.sse2_time < best_time) {
618 best_time = result.sse2_time;
619 result.best_method =
"SSE2";
623#if SIMD_SUPPORT_SSSE3
624 if (result.ssse3_time > 0 && result.ssse3_time < best_time) {
625 best_time = result.ssse3_time;
626 result.best_method =
"SSSE3";
631 if (result.avx2_time > 0 && result.avx2_time < best_time) {
632 best_time = result.avx2_time;
633 result.best_method =
"AVX2";
638 if (result.neon_time > 0 && result.neon_time < best_time) {
639 best_time = result.neon_time;
640 result.best_method =
"NEON";
644 result.speedup_best = result.scalar_time / best_time;
// Detach the borrowed buffer from the frame before test_pixels is freed
// below, presumably so that destroying the frame (not visible here) does not
// free the same memory twice.
647 frame->pixels = NULL;
649 SAFE_FREE(test_pixels);
650 SAFE_FREE(output_buffer);
657 const image_t *source_image,
bool use_256color) {
/*
 * Renderer benchmark that can take its input from a caller-supplied image.
 *
 * NOTE(review): this span is a fragment. The start of the signature, closing
 * braces, several timer-start assignments and some render calls are on lines
 * that are not visible here; the comments below describe only the visible
 * statements.
 *
 * Visible flow: compute the pixel count, allocate buffers, fill test_pixels
 * from source_image (direct copy or nearest-neighbour resample) or from a
 * synthetic gradient, copy into a new frame, time each renderer variant,
 * then pick the fastest.
 */
658 simd_benchmark_t result = {0};
// NOTE(review): background_mode is marked unused here but is passed to the
// SIMD renderers further down; the cast only matters in a build where none
// of those calls are compiled in.
659 (void)background_mode;
// width * height is computed as size_t with an explicit overflow check.
664 if (checked_size_mul((
size_t)width, (size_t)height, &pixel_count) != ASCIICHAT_OK) {
665 log_error(
"Image dimensions %d x %d too large (overflow)", width, height);
670 rgb_pixel_t *test_pixels;
// 16 bytes of output per pixel; the rationale for the constant is not visible
// here. NOTE(review): this multiplication is not overflow-checked.
672 const size_t output_buffer_size = pixel_count * 16;
673 test_pixels = SAFE_CALLOC_SIMD(pixel_count,
sizeof(rgb_pixel_t), rgb_pixel_t *);
674 output_buffer = SAFE_MALLOC(output_buffer_size,
char *);
// Use the caller's image only when both the struct and its pixel buffer are
// non-NULL.
676 if (source_image && source_image->pixels) {
677 printf(
"Using provided image data (%dx%d) for testing\n", source_image->w, source_image->h);
// Same dimensions: copy channel by channel.
680 if (source_image->w == width && source_image->h == height) {
682 for (
size_t i = 0; i < pixel_count; i++) {
683 test_pixels[i].r = source_image->pixels[i].r;
684 test_pixels[i].g = source_image->pixels[i].g;
685 test_pixels[i].b = source_image->pixels[i].b;
// Different dimensions: nearest-neighbour resample using integer scaling of
// the destination coordinates into source coordinates.
689 for (
int y = 0; y < height; y++) {
690 for (
int x = 0; x < width; x++) {
691 int src_x = (x * source_image->w) / width;
692 int src_y = (y * source_image->h) / height;
694 size_t src_idx = (size_t)src_y * (
size_t)source_image->w + (size_t)src_x;
695 size_t dst_idx = (size_t)y * (
size_t)width + (size_t)x;
// Bounds guard on the source index; a destination pixel that fails it is
// simply not written.
697 if (src_idx < (
size_t)source_image->w * (size_t)source_image->h) {
698 test_pixels[dst_idx].r = source_image->pixels[src_idx].r;
699 test_pixels[dst_idx].g = source_image->pixels[src_idx].g;
700 test_pixels[dst_idx].b = source_image->pixels[src_idx].b;
704 printf(
"Resized image data from %dx%d to %dx%d\n", source_image->w, source_image->h, width, height);
708 printf(
"No source image provided, using synthetic gradient data\n");
710 for (
size_t i = 0; i < pixel_count; i++) {
// Gradient base as in the synthetic benchmarks; x and y are derived from i
// on lines not visible here. Noise here is in the range [-8, 7].
713 int base_r = (x * 255 / width);
714 int base_g = (y * 255 / height);
715 int base_b = ((x + y) * 127 / (width + height));
717 int temp_r = base_r + (rand() % 16 - 8);
718 int temp_g = base_g + (rand() % 16 - 8);
719 int temp_b = base_b + (rand() % 16 - 8);
721 test_pixels[i].r = clamp_rgb(temp_r);
722 test_pixels[i].g = clamp_rgb(temp_g);
723 test_pixels[i].b = clamp_rgb(temp_b);
// NOTE(review): adaptive_iterations is computed and reported (the message
// says the passed iteration count is ignored), but every timed loop visible
// below iterates and averages over `iterations`. Either the message or the
// loops look wrong - confirm which is intended.
728 int adaptive_iterations = calculate_adaptive_iterations(pixel_count, 10.0);
729 printf(
"Benchmarking %dx%d (%zu pixels) using %d adaptive iterations (ignoring passed iterations)...\n", width,
730 height, pixel_count, adaptive_iterations);
// NOTE(review): the image_new() result is dereferenced by the memcpy on the
// very next statement; there is no NULL check between the two.
733 image_t *frame =
image_new(width, height);
734 memcpy(frame->pixels, test_pixels, pixel_count *
sizeof(rgb_pixel_t));
737 ensure_default_palette_ready();
// Each timed section below stores the mean seconds per iteration and frees
// the string produced in every iteration.
739 for (
int i = 0; i < iterations; i++) {
742 SAFE_FREE(result_str);
744 result.scalar_time = (time_ns_to_s(
time_get_ns()) - start_scalar) / iterations;
749 ensure_default_palette_ready();
750 double start_sse2_color = time_ns_to_s(
time_get_ns());
751 for (
int i = 0; i < iterations; i++) {
752 char *result_str = render_ascii_sse2_unified_optimized(frame, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
754 SAFE_FREE(result_str);
756 result.sse2_time = (time_ns_to_s(
time_get_ns()) - start_sse2_color) / iterations;
759#if SIMD_SUPPORT_SSSE3
762 ensure_default_palette_ready();
763 double start_ssse3_color = time_ns_to_s(
time_get_ns());
764 for (
int i = 0; i < iterations; i++) {
768 SAFE_FREE(result_str);
770 result.ssse3_time = (time_ns_to_s(
time_get_ns()) - start_ssse3_color) / iterations;
776 ensure_default_palette_ready();
777 double start_avx2_color = time_ns_to_s(
time_get_ns());
778 for (
int i = 0; i < iterations; i++) {
779 char *result_str = render_ascii_avx2_unified_optimized(frame, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
781 SAFE_FREE(result_str);
783 result.avx2_time = (time_ns_to_s(
time_get_ns()) - start_avx2_color) / iterations;
789 ensure_default_palette_ready();
790 double start_neon_color = time_ns_to_s(
time_get_ns());
791 for (
int i = 0; i < iterations; i++) {
792 char *result_str = render_ascii_neon_unified_optimized(frame, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
794 SAFE_FREE(result_str);
796 result.neon_time = (time_ns_to_s(
time_get_ns()) - start_neon_color) / iterations;
802 ensure_default_palette_ready();
803 double start_sve_color = time_ns_to_s(
time_get_ns());
804 for (
int i = 0; i < iterations; i++) {
805 char *result_str = render_ascii_sve_unified_optimized(frame, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
807 SAFE_FREE(result_str);
809 result.sve_time = (time_ns_to_s(
time_get_ns()) - start_sve_color) / iterations;
// Best-variant selection: start from scalar and switch to any variant that
// has a non-zero time strictly below the current best.
813 double best_time = result.scalar_time;
814 result.best_method =
"scalar";
817 if (result.sse2_time > 0 && result.sse2_time < best_time) {
818 best_time = result.sse2_time;
819 result.best_method =
"SSE2";
823#if SIMD_SUPPORT_SSSE3
824 if (result.ssse3_time > 0 && result.ssse3_time < best_time) {
825 best_time = result.ssse3_time;
826 result.best_method =
"SSSE3";
831 if (result.avx2_time > 0 && result.avx2_time < best_time) {
832 best_time = result.avx2_time;
833 result.best_method =
"AVX2";
838 if (result.neon_time > 0 && result.neon_time < best_time) {
839 best_time = result.neon_time;
840 result.best_method =
"NEON";
// NOTE(review): speedup_best is computed before the SVE comparison that
// follows. sve_time is measured in this function, so when SVE wins,
// best_method becomes "SVE" while speedup_best still reflects the previous
// best. The division likely belongs after the SVE check.
844 result.speedup_best = result.scalar_time / best_time;
847 if (result.sve_time > 0 && result.sve_time < best_time) {
848 best_time = result.sve_time;
849 result.best_method =
"SVE";
// Release the scratch buffers; whether `frame` is destroyed is decided on
// lines not visible here.
854 SAFE_FREE(test_pixels);
855 SAFE_FREE(output_buffer);
862 bool background_mode,
const image_t *source_image,
/*
 * Color renderer benchmark that can take its input from a caller-supplied
 * image, and prints a per-variant timing report at the end.
 *
 * NOTE(review): this span is a fragment. Parts of the signature, closing
 * braces, the timer-start assignments, the scalar render call and the
 * matching #endif lines are not visible here; the comments below describe
 * only the visible statements.
 */
865 simd_benchmark_t result = {0};
// width * height is computed as size_t with an explicit overflow check.
870 if (checked_size_mul((
size_t)width, (size_t)height, &pixel_count) != ASCIICHAT_OK) {
871 log_error(
"Image dimensions %d x %d too large (overflow)", width, height);
// Output size estimate: 30 bytes per pixel plus 10 bytes per column of width
// (rationale for the constants not visible here). NOTE(review): this
// arithmetic is not overflow-checked.
875 size_t output_buffer_size = pixel_count * 30 + (size_t)width * 10;
878 rgb_pixel_t *test_pixels;
880 test_pixels = SAFE_CALLOC_SIMD(pixel_count,
sizeof(rgb_pixel_t), rgb_pixel_t *);
881 output_buffer = SAFE_MALLOC(output_buffer_size,
char *);
// Iteration count is derived from pixel_count and the constant 10.0; all
// timed loops in this function use it.
884 int adaptive_iterations = calculate_adaptive_iterations(pixel_count, 10.0);
886 const char *mode_str = background_mode ?
"background" :
"foreground";
// Source-image branch; the condition selecting it is on a hidden line.
891 printf(
"Using provided source image data for COLOR %s %dx%d benchmarking with %d iterations...\n", mode_str, width,
892 height, adaptive_iterations);
// Same dimensions: copy channel by channel.
895 if (source_image->w == width && source_image->h == height) {
897 for (
size_t i = 0; i < pixel_count; i++) {
898 test_pixels[i].r = source_image->pixels[i].r;
899 test_pixels[i].g = source_image->pixels[i].g;
900 test_pixels[i].b = source_image->pixels[i].b;
// Different dimensions: nearest-neighbour resample using float scale factors.
904 float x_ratio = (float)source_image->w / width;
905 float y_ratio = (float)source_image->h / height;
907 for (
int y = 0; y < height; y++) {
908 for (
int x = 0; x < width; x++) {
909 int src_x = (int)(x * x_ratio);
910 int src_y = (int)(y * y_ratio);
// Clamp to the last source column/row in case float rounding overshoots.
913 if (src_x >= source_image->w)
914 src_x = source_image->w - 1;
915 if (src_y >= source_image->h)
916 src_y = source_image->h - 1;
919 size_t src_idx = (size_t)src_y * (
size_t)source_image->w + (size_t)src_x;
920 size_t dst_idx = (size_t)y * (
size_t)width + (size_t)x;
922 test_pixels[dst_idx].r = source_image->pixels[src_idx].r;
923 test_pixels[dst_idx].g = source_image->pixels[src_idx].g;
924 test_pixels[dst_idx].b = source_image->pixels[src_idx].b;
// Synthetic-data branch.
930 printf(
"Using synthetic gradient data for COLOR %s %dx%d benchmarking with %d iterations...\n", mode_str, width,
931 height, adaptive_iterations);
934 for (
size_t i = 0; i < pixel_count; i++) {
// Gradient base plus rand() noise in [-16, 15]; x and y are derived from i
// on lines not visible here.
937 int base_r = (x * 255) / width;
938 int base_g = (y * 255) / height;
939 int base_b = ((x + y) * 127) / (width + height);
941 int temp_r = base_r + (rand() % 32 - 16);
942 int temp_g = base_g + (rand() % 32 - 16);
943 int temp_b = base_b + (rand() % 32 - 16);
945 test_pixels[i].r = clamp_rgb(temp_r);
946 test_pixels[i].g = clamp_rgb(temp_g);
947 test_pixels[i].b = clamp_rgb(temp_b);
951 printf(
"Benchmarking COLOR %s conversion using %d iterations...\n", mode_str, adaptive_iterations);
// Scalar loop: a fresh image is allocated and filled on every iteration.
// NOTE(review): if the timer starts before this loop (the start assignment
// is not visible), the image_new() + memcpy cost is part of the measurement.
959 for (
int i = 0; i < adaptive_iterations; i++) {
960 image_t *test_image =
image_new(width, height);
961 if (test_image == NULL) {
962 SAFE_FREE(test_pixels);
963 SAFE_FREE(output_buffer);
964 FATAL(ERROR_MEMORY,
"Failed to allocate test_image in benchmark iteration %d", i);
966 memcpy(test_image->pixels, test_pixels, pixel_count *
sizeof(rgb_pixel_t));
970 SAFE_FREE(result_ascii);
// Total elapsed seconds; normalised to per-iteration further down.
973 result.scalar_time = time_ns_to_s(
time_get_ns()) - start;
976 double best_time = result.scalar_time;
977 result.best_method =
"scalar";
// The x86 loops below follow the scalar pattern (new image + memcpy per
// iteration). Any NULL check on image_new() would be on a hidden line.
981 for (
int i = 0; i < adaptive_iterations; i++) {
982 image_t *test_image =
image_new(width, height);
984 memcpy(test_image->pixels, test_pixels, pixel_count *
sizeof(rgb_pixel_t));
986 render_ascii_sse2_unified_optimized(test_image, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
988 SAFE_FREE(result_str);
992 result.sse2_time = time_ns_to_s(
time_get_ns()) - start;
995#if SIMD_SUPPORT_SSSE3
997 for (
int i = 0; i < adaptive_iterations; i++) {
998 image_t *test_image =
image_new(width, height);
1000 memcpy(test_image->pixels, test_pixels, pixel_count *
sizeof(rgb_pixel_t));
1002 render_ascii_ssse3_unified_optimized(test_image, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
1004 SAFE_FREE(result_str);
1008 result.ssse3_time = time_ns_to_s(
time_get_ns()) - start;
1011#if SIMD_SUPPORT_AVX2
1013 for (
int i = 0; i < adaptive_iterations; i++) {
1014 image_t *test_image =
image_new(width, height);
1016 memcpy(test_image->pixels, test_pixels, pixel_count *
sizeof(rgb_pixel_t));
1018 render_ascii_avx2_unified_optimized(test_image, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
1020 SAFE_FREE(result_str);
1024 result.avx2_time = time_ns_to_s(
time_get_ns()) - start;
1027#if SIMD_SUPPORT_NEON
// NOTE(review): the NEON and SVE loops render from a stack image_t view over
// test_pixels, so they skip the per-iteration image_new() + memcpy that the
// scalar and x86 loops perform. That makes their timings not directly
// comparable with scalar_time.
1029 for (
int i = 0; i < adaptive_iterations; i++) {
1031 image_t temp_image = {.pixels = test_pixels, .w = width, .h = height, .alloc_method = IMAGE_ALLOC_SIMD};
1033 render_ascii_neon_unified_optimized(&temp_image, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
1037 result.neon_time = time_ns_to_s(
time_get_ns()) - start;
1042 for (
int i = 0; i < adaptive_iterations; i++) {
1044 image_t temp_image = {.pixels = test_pixels, .w = width, .h = height, .alloc_method = IMAGE_ALLOC_SIMD};
1046 render_ascii_sve_unified_optimized(&temp_image, background_mode, use_256color,
DEFAULT_ASCII_PALETTE);
1050 result.sve_time = time_ns_to_s(
time_get_ns()) - start;
// First selection pass, on total times: this is where best_method is set.
// NOTE(review): no branch assigning "SVE" to best_method is visible.
1053#if SIMD_SUPPORT_SSE2
1054 if (result.sse2_time > 0 && result.sse2_time < best_time) {
1055 best_time = result.sse2_time;
1056 result.best_method =
"SSE2";
1060#if SIMD_SUPPORT_SSSE3
1061 if (result.ssse3_time > 0 && result.ssse3_time < best_time) {
1062 best_time = result.ssse3_time;
1063 result.best_method =
"SSSE3";
1067#if SIMD_SUPPORT_AVX2
1068 if (result.avx2_time > 0 && result.avx2_time < best_time) {
1069 best_time = result.avx2_time;
1070 result.best_method =
"AVX2";
1074#if SIMD_SUPPORT_NEON
1075 if (result.neon_time > 0 && result.neon_time < best_time) {
1076 best_time = result.neon_time;
1077 result.best_method =
"NEON";
// Convert total times to mean seconds per iteration.
// NOTE(review): sve_time is not divided in the visible lines, so the SVE
// comparison and report below would use a total time against per-iteration
// values - verify against the full function.
1082 result.scalar_time /= adaptive_iterations;
1083 if (result.sse2_time > 0)
1084 result.sse2_time /= adaptive_iterations;
1085 if (result.ssse3_time > 0)
1086 result.ssse3_time /= adaptive_iterations;
1087 if (result.avx2_time > 0)
1088 result.avx2_time /= adaptive_iterations;
1089 if (result.neon_time > 0)
1090 result.neon_time /= adaptive_iterations;
// Second selection pass, on the normalised times: recomputes best_time only;
// best_method keeps the value chosen in the first pass.
1092 best_time = result.scalar_time;
1094#if SIMD_SUPPORT_SSE2
1095 if (result.sse2_time > 0 && result.sse2_time < best_time)
1096 best_time = result.sse2_time;
1098#if SIMD_SUPPORT_SSSE3
1099 if (result.ssse3_time > 0 && result.ssse3_time < best_time)
1100 best_time = result.ssse3_time;
1102#if SIMD_SUPPORT_AVX2
1103 if (result.avx2_time > 0 && result.avx2_time < best_time)
1104 best_time = result.avx2_time;
1106#if SIMD_SUPPORT_NEON
1107 if (result.neon_time > 0 && result.neon_time < best_time)
1108 best_time = result.neon_time;
1111 if (result.sve_time > 0 && result.sve_time < best_time)
1112 best_time = result.sve_time;
1115 result.speedup_best = result.scalar_time / best_time;
// Report: scalar is always printed, other variants only when they have a
// non-zero time.
1117 printf(
"------------\n");
1118 printf(
"scalar: %f\n", result.scalar_time);
1119 if (result.sse2_time > 0)
1120 printf(
"SSE2: %f\n", result.sse2_time);
1121 if (result.ssse3_time > 0)
1122 printf(
"SSSE3: %f\n", result.ssse3_time);
1123 if (result.avx2_time > 0)
1124 printf(
"avx2: %f\n", result.avx2_time);
1125 if (result.neon_time > 0)
1126 printf(
"neon: %f\n", result.neon_time);
1127 if (result.sve_time > 0)
1128 printf(
"sve: %f\n", result.sve_time);
1129 printf(
"Best method: %s, time: %f (%.2fx speedup (<1.0 = bad))\n", result.best_method, best_time,
1130 result.speedup_best);
1131 printf(
"------------\n");
// Release the scratch buffers before returning (the return itself is not
// visible here).
1134 SAFE_FREE(test_pixels);
1135 SAFE_FREE(output_buffer);