12#include <ascii-chat/video/simd/avx2.h>
13#include <ascii-chat/video/simd/common.h>
14#include <ascii-chat/common.h>
15#include <ascii-chat/video/output_buffer.h>
16#include <ascii-chat/video/ansi_fast.h>
17#include <ascii-chat/util/overflow.h>
/**
 * Write the decimal text of a 256-color palette index into an output buffer
 * (foreground variant).
 *
 * The visible code emits color_idx as one to three ASCII digits with no
 * leading zeros, advancing pos by one byte per digit.
 *
 * NOTE(review): this listing omits several lines of the body (between the
 * signature and the first branch, the final else, and the tail of the
 * function). Presumably those lines write the escape-sequence prefix and
 * terminator and return the advanced pointer - confirm against the full
 * source. Callers assign the result back to their write cursor.
 *
 * @param pos        Write cursor. The function receives no buffer length, so
 *                   it cannot bounds-check; the caller must have reserved
 *                   enough room.
 * @param color_idx  Palette index, 0-255.
 */
23static inline char *emit_set_256_color_fg_simple(
char *pos, uint8_t color_idx) {
// Three-digit index (100-255): hundreds, tens, ones.
31 if (color_idx >= 100) {
32 *pos++ =
'0' + (color_idx / 100);
33 *pos++ =
'0' + ((color_idx / 10) % 10);
34 *pos++ =
'0' + (color_idx % 10);
35 }
// Two-digit index (10-99): tens, ones.
else if (color_idx >= 10) {
36 *pos++ =
'0' + (color_idx / 10);
37 *pos++ =
'0' + (color_idx % 10);
// Single-digit index (0-9); the else that guards this is not visible here.
39 *pos++ =
'0' + color_idx;
/**
 * Write the decimal text of a 256-color palette index into an output buffer
 * (background variant).
 *
 * The visible digit-emission code is identical to
 * emit_set_256_color_fg_simple: one to three ASCII digits, no leading zeros,
 * pos advanced by one byte per digit.
 *
 * NOTE(review): the lines that would distinguish background from foreground
 * (presumably the escape-sequence prefix), the final else, and the return
 * statement are not visible in this listing - confirm against the full
 * source.
 *
 * @param pos        Write cursor. No buffer length is passed, so the caller
 *                   must have reserved enough room.
 * @param color_idx  Palette index, 0-255.
 */
45static inline char *emit_set_256_color_bg_simple(
char *pos, uint8_t color_idx) {
// Three-digit index (100-255): hundreds, tens, ones.
53 if (color_idx >= 100) {
54 *pos++ =
'0' + (color_idx / 100);
55 *pos++ =
'0' + ((color_idx / 10) % 10);
56 *pos++ =
'0' + (color_idx % 10);
57 }
// Two-digit index (10-99): tens, ones.
else if (color_idx >= 10) {
58 *pos++ =
'0' + (color_idx / 10);
59 *pos++ =
'0' + (color_idx % 10);
// Single-digit index (0-9); the else that guards this is not visible here.
61 *pos++ =
'0' + color_idx;
/**
 * Write the decimal text of an RGB triple into an output buffer (foreground
 * variant).
 *
 * The visible code emits r, then g, then b as ASCII decimal digits, advancing
 * pos by one byte per digit. Each channel has a three-digit form and a
 * two-digit form visible here.
 *
 * NOTE(review): this listing omits many lines of the body: the conditions
 * selecting the three-digit form for r and g, the single-digit branches, and
 * whatever is written before, between and after the channels. Presumably
 * those are the escape-sequence prefix, separators and terminator, followed
 * by a return of the advanced pointer - confirm against the full source.
 *
 * @param pos  Write cursor. No buffer length is passed, so the caller must
 *             have reserved enough room.
 * @param r    Red channel, 0-255.
 * @param g    Green channel, 0-255.
 * @param b    Blue channel, 0-255.
 */
67static inline char *emit_set_truecolor_fg_simple(
char *pos, uint8_t r, uint8_t g, uint8_t b) {
// Red, three-digit form: hundreds, tens, ones.
76 *pos++ =
'0' + (r / 100);
77 *pos++ =
'0' + ((r / 10) % 10);
78 *pos++ =
'0' + (r % 10);
// Red, two-digit form: tens, ones.
80 *pos++ =
'0' + (r / 10);
81 *pos++ =
'0' + (r % 10);
// Green, three-digit form.
87 *pos++ =
'0' + (g / 100);
88 *pos++ =
'0' + ((g / 10) % 10);
89 *pos++ =
'0' + (g % 10);
// Green, two-digit form.
91 *pos++ =
'0' + (g / 10);
92 *pos++ =
'0' + (g % 10);
// Blue, three-digit form.
98 *pos++ =
'0' + (b / 100);
99 *pos++ =
'0' + ((b / 10) % 10);
100 *pos++ =
'0' + (b % 10);
101 }
// Blue, two-digit form (10-99).
else if (b >= 10) {
102 *pos++ =
'0' + (b / 10);
103 *pos++ =
'0' + (b % 10);
/**
 * Write the decimal text of an RGB triple into an output buffer (background
 * variant).
 *
 * The visible code emits r, then g, then b as ASCII decimal digits, advancing
 * pos by one byte per digit. The digit logic matches
 * emit_set_truecolor_fg_simple.
 *
 * NOTE(review): this listing omits the opening condition of each channel, the
 * single-digit branches, and whatever is written before, between and after
 * the channels (presumably prefix, separators, terminator and the return of
 * the advanced pointer) - confirm against the full source.
 *
 * @param pos  Write cursor. No buffer length is passed, so the caller must
 *             have reserved enough room.
 * @param r    Red channel, 0-255.
 * @param g    Green channel, 0-255.
 * @param b    Blue channel, 0-255.
 */
111static inline char *emit_set_truecolor_bg_simple(
char *pos, uint8_t r, uint8_t g, uint8_t b) {
// Red, three-digit form: hundreds, tens, ones.
120 *pos++ =
'0' + (r / 100);
121 *pos++ =
'0' + ((r / 10) % 10);
122 *pos++ =
'0' + (r % 10);
123 }
// Red, two-digit form (10-99).
else if (r >= 10) {
124 *pos++ =
'0' + (r / 10);
125 *pos++ =
'0' + (r % 10);
// Green, three-digit form.
131 *pos++ =
'0' + (g / 100);
132 *pos++ =
'0' + ((g / 10) % 10);
133 *pos++ =
'0' + (g % 10);
134 }
// Green, two-digit form (10-99).
else if (g >= 10) {
135 *pos++ =
'0' + (g / 10);
136 *pos++ =
'0' + (g % 10);
// Blue, three-digit form.
142 *pos++ =
'0' + (b / 100);
143 *pos++ =
'0' + ((b / 10) % 10);
144 *pos++ =
'0' + (b % 10);
145 }
// Blue, two-digit form (10-99).
else if (b >= 10) {
146 *pos++ =
'0' + (b / 10);
147 *pos++ =
'0' + (b % 10);
/**
 * Write a repeat count as ASCII decimal digits into an output buffer.
 *
 * The visible branches handle counts of one to four digits, with no leading
 * zeros, advancing pos by one byte per digit. The render functions in this
 * file call it with (run - 1) after emitting the first character of a run.
 *
 * NOTE(review): the widest visible branch is rep_count >= 1000 and its first
 * digit is computed as rep_count / 1000. For rep_count >= 10000 that quotient
 * is 10 or more, so the first byte written would not be a decimal digit.
 * Confirm that every caller bounds the run length below 10000, or add a
 * five-digit branch.
 *
 * NOTE(review): lines before the first branch, the final else, and the tail
 * of the function are not visible in this listing; presumably they write the
 * surrounding escape-sequence bytes and return the advanced pointer.
 *
 * @param pos        Write cursor. No buffer length is passed, so the caller
 *                   must have reserved enough room.
 * @param rep_count  Number of repetitions to encode.
 */
156static inline char *emit_rle_count(
char *pos, uint32_t rep_count) {
// Four digits: thousands, hundreds, tens, ones.
161 if (rep_count >= 1000) {
162 *pos++ =
'0' + (rep_count / 1000);
163 *pos++ =
'0' + ((rep_count / 100) % 10);
164 *pos++ =
'0' + ((rep_count / 10) % 10);
165 *pos++ =
'0' + (rep_count % 10);
166 }
// Three digits (100-999).
else if (rep_count >= 100) {
167 *pos++ =
'0' + (rep_count / 100);
168 *pos++ =
'0' + ((rep_count / 10) % 10);
169 *pos++ =
'0' + (rep_count % 10);
170 }
// Two digits (10-99).
else if (rep_count >= 10) {
171 *pos++ =
'0' + (rep_count / 10);
172 *pos++ =
'0' + (rep_count % 10);
// Single digit (0-9); the else that guards this is not visible here.
174 *pos++ =
'0' + rep_count;
/*
 * Scratch buffers holding one 32-pixel chunk in planar form: separate red,
 * green and blue arrays plus the luminance computed from them. The render
 * functions in this file fill them through avx2_load_rgb32_optimized and
 * avx2_compute_luminance_32 and then read them back per pixel.
 *
 * They are declared without static, so they are visible outside this file.
 * THREAD_LOCAL and ALIGNED_32 are project macros; presumably they expand to
 * thread-local storage and 32-byte alignment - confirm in the project
 * headers.
 */
184THREAD_LOCAL ALIGNED_32 uint8_t avx2_r_buffer[32];
185THREAD_LOCAL ALIGNED_32 uint8_t avx2_g_buffer[32];
186THREAD_LOCAL ALIGNED_32 uint8_t avx2_b_buffer[32];
187THREAD_LOCAL ALIGNED_32 uint8_t avx2_luminance_buffer[32];
/**
 * Split 32 consecutive interleaved pixels into three planar byte arrays.
 *
 * Despite the name, the visible body is a plain scalar loop that copies the
 * r, g and b fields of pixels[0..31] into r_out, g_out and b_out.
 *
 * @param pixels  Source pixels; exactly 32 elements are read, so the caller
 *                must guarantee at least 32 remain.
 * @param r_out   Receives 32 red values.
 * @param g_out   Receives 32 green values.
 * @param b_out   Receives 32 blue values.
 *
 * All four pointers are restrict-qualified, so the arrays must not overlap.
 */
191static inline void avx2_load_rgb32_optimized(
const rgb_pixel_t *__restrict pixels, uint8_t *__restrict r_out,
192 uint8_t *__restrict g_out, uint8_t *__restrict b_out) {
194 for (
int i = 0; i < 32; i++) {
195 r_out[i] = pixels[i].r;
196 g_out[i] = pixels[i].g;
197 b_out[i] = pixels[i].b;
/**
 * Compute 8-bit luminance for 32 pixels from planar R, G and B arrays.
 *
 * Each output byte is (77*r + 150*g + 29*b + 128) >> 8. The three weights sum
 * to 256, so this is a fixed-point weighted average with rounding, and the
 * largest intermediate value (256*255 + 128 = 65408) still fits in an
 * unsigned 16-bit lane.
 *
 * @param r_vals         32 red values (read with an unaligned load).
 * @param g_vals         32 green values.
 * @param b_vals         32 blue values.
 * @param luminance_out  Receives 32 luminance values (unaligned store).
 */
202static inline void avx2_compute_luminance_32(
const uint8_t *r_vals,
const uint8_t *g_vals,
const uint8_t *b_vals,
203 uint8_t *luminance_out) {
// Load all 32 bytes of each channel.
205 __m256i r_all = _mm256_loadu_si256((
const __m256i_u *)r_vals);
206 __m256i g_all = _mm256_loadu_si256((
const __m256i_u *)g_vals);
207 __m256i b_all = _mm256_loadu_si256((
const __m256i_u *)b_vals);
// Widen the low 8 bytes of each 128-bit lane to 16 bits by interleaving with
// zero. The unpack works per lane, so this covers bytes 0-7 and 16-23.
210 __m256i r_lo = _mm256_unpacklo_epi8(r_all, _mm256_setzero_si256());
211 __m256i g_lo = _mm256_unpacklo_epi8(g_all, _mm256_setzero_si256());
212 __m256i b_lo = _mm256_unpacklo_epi8(b_all, _mm256_setzero_si256());
// Weighted sum, rounding bias, then logical shift right by 8.
214 __m256i luma_16_lo = _mm256_mullo_epi16(r_lo, _mm256_set1_epi16(77));
215 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(g_lo, _mm256_set1_epi16(150)));
216 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(b_lo, _mm256_set1_epi16(29)));
217 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_set1_epi16(128));
218 luma_16_lo = _mm256_srli_epi16(luma_16_lo, 8);
// Same computation for the high 8 bytes of each lane (bytes 8-15 and 24-31).
221 __m256i r_hi = _mm256_unpackhi_epi8(r_all, _mm256_setzero_si256());
222 __m256i g_hi = _mm256_unpackhi_epi8(g_all, _mm256_setzero_si256());
223 __m256i b_hi = _mm256_unpackhi_epi8(b_all, _mm256_setzero_si256());
225 __m256i luma_16_hi = _mm256_mullo_epi16(r_hi, _mm256_set1_epi16(77));
226 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(g_hi, _mm256_set1_epi16(150)));
227 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(b_hi, _mm256_set1_epi16(29)));
228 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_set1_epi16(128));
229 luma_16_hi = _mm256_srli_epi16(luma_16_hi, 8);
// Narrow back to bytes. The pack is also per lane, which undoes the per-lane
// split of the unpacks, so output byte k corresponds to input pixel k.
232 __m256i luma_packed = _mm256_packus_epi16(luma_16_lo, luma_16_hi);
239 _mm256_storeu_si256((__m256i_u *)luminance_out, luma_packed);
/**
 * Render an image as monochrome text, processing 32 pixels at a time.
 *
 * Visible flow: validate arguments, obtain a UTF-8 palette cache, allocate an
 * output buffer, then for each row convert pixels to luminance, map each
 * luminance to a cached UTF-8 character, and write runs of identical
 * characters either via emit_rle_count or as repeated copies. Pixels left
 * over after the 32-wide chunks are handled with the scalar formula using
 * LUMA_RED, LUMA_GREEN and LUMA_BLUE.
 *
 * NOTE(review): many lines are missing from this listing (return statements,
 * loop headers, the run-length decision, row terminators). The summary above
 * only covers what is visible; confirm details against the full source.
 *
 * NOTE(review): output_size is computed with plain multiplication, whereas
 * render_ascii_avx2_unified_optimized in this file uses checked_size_mul and
 * checked_size_add for the same purpose. Consider using the checked helpers
 * here as well.
 *
 * NOTE(review): the row offset y * w is evaluated in int before indexing;
 * for very large images that product can overflow. Consider casting to
 * size_t first.
 *
 * @param image        Source image; image and image->pixels must be non-NULL.
 * @param ascii_chars  Character palette; must be non-NULL.
 * @return char pointer to the buffer obtained from SAFE_MALLOC (return
 *         statements are not visible here; ownership presumably passes to
 *         the caller - confirm).
 */
243char *render_ascii_image_monochrome_avx2(
const image_t *image,
const char *ascii_chars) {
244 if (!image || !image->pixels || !ascii_chars) {
248 const int h = image->h;
249 const int w = image->w;
251 if (h <= 0 || w <= 0) {
258 log_error(
"Failed to get UTF-8 palette cache");
262 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->pixels;
// Budget: 12 bytes per pixel plus 1 extra byte per row.
267 size_t output_size = (size_t)h * ((
size_t)w * 12 + 1);
269 char *output = SAFE_MALLOC(output_size,
char *);
271 log_error(
"Failed to allocate output buffer for AVX2 rendering");
278 for (
int y = 0; y < h; y++) {
279 const rgb_pixel_t *row_pixels = &pixels[y * w];
// 32-pixel chunk: de-interleave into the scratch buffers, then compute
// luminance for the whole chunk at once.
285 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
286 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
// Luminance 0-255 shifted right by 2 gives a 0-63 index into cache64.
291 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2;
292 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
// Extend the run while the next pixels in this chunk map to the same index.
296 while (run_end < 32 && x + run_end < w) {
297 const uint8_t next_luma_idx = avx2_luminance_buffer[run_end] >> 2;
298 if (next_luma_idx != luma_idx)
302 int run = run_end - i;
// Always emit the character once.
305 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
306 pos += char_info->byte_len;
// Then either encode the remaining run - 1 repeats as a count...
309 pos = emit_rle_count(pos, run - 1);
// ...or write the remaining copies literally.
312 for (
int k = 1; k < run; k++) {
313 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
314 pos += char_info->byte_len;
// Scalar path for pixels not covered by a full 32-pixel chunk.
324 const rgb_pixel_t *p = &row_pixels[x];
325 const int luminance = (LUMA_RED * p->r + LUMA_GREEN * p->g + LUMA_BLUE * p->b + 128) >> 8;
326 const uint8_t luma_idx = luminance >> 2;
327 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
332 const rgb_pixel_t *next_p = &row_pixels[j];
333 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
334 const uint8_t next_luma_idx = next_luminance >> 2;
335 if (next_luma_idx != luma_idx)
342 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
343 pos += char_info->byte_len;
346 pos = emit_rle_count(pos, run - 1);
348 for (
int k = 1; k < run; k++) {
349 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
350 pos += char_info->byte_len;
/**
 * Render an image as colored text, processing 32 pixels at a time.
 *
 * Visible flow: validate the image, obtain a UTF-8 palette cache, size the
 * output buffer with overflow-checked arithmetic, then for each row walk the
 * pixels in 32-wide chunks followed by a scalar remainder. Each pixel maps to
 * a character through its luminance; runs of pixels with the same character
 * (and, in truecolor mode, the same RGB value) are written as one character
 * followed by either an emit_rle_count call or literal repeats. A color
 * sequence is written only when the color differs from the last one emitted.
 *
 * NOTE(review): many lines are missing from this listing (return statements,
 * declarations of pos, x, i, run, color_idx, temp and output_size, the
 * mode-selection branches, and break statements). The summary above only
 * covers what is visible; confirm details against the full source.
 *
 * NOTE(review): the visible guard checks image and image->pixels but not
 * ascii_chars, unlike render_ascii_image_monochrome_avx2. Confirm whether a
 * NULL palette is intended to be accepted here.
 *
 * NOTE(review): the row offset y * width is evaluated in int before
 * indexing; consider casting to size_t first.
 *
 * @param image           Source image; image and image->pixels must be
 *                        non-NULL.
 * @param use_background  Selects the bg emit helpers instead of the fg ones.
 * @param use_256color    Selects the 10-byte-per-pixel budget instead of the
 *                        25-byte one; the visible code also has separate
 *                        palette-index and RGB emit paths.
 * @param ascii_chars     Character palette.
 * @return char pointer to the buffer obtained from SAFE_MALLOC (return
 *         statements are not visible here; ownership presumably passes to
 *         the caller - confirm).
 */
372char *render_ascii_avx2_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
373 const char *ascii_chars) {
374 if (!image || !image->pixels) {
378 const int width = image->w;
379 const int height = image->h;
// Degenerate dimensions: a 1-byte buffer is allocated (presumably returned as
// an empty string - the following lines are not visible).
381 if (width <= 0 || height <= 0) {
383 empty = SAFE_MALLOC(1,
char *);
391 log_error(
"Failed to get UTF-8 palette cache for AVX2 color");
// Output budget: height * width * bytes_per_pixel + height * 16 + 1024,
// with every step checked for size_t overflow.
396 size_t bytes_per_pixel = use_256color ? 10u : 25u;
399 size_t height_times_width;
400 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) != ASCIICHAT_OK) {
401 log_error(
"Buffer size overflow: height * width overflow");
405 size_t pixel_data_size;
406 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
407 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
411 size_t height_times_16;
412 if (checked_size_mul((
size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
413 log_error(
"Buffer size overflow: height * 16 overflow");
418 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
419 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
424 if (checked_size_add(temp, 1024u, &output_size) != ASCIICHAT_OK) {
425 log_error(
"Buffer size overflow: total output size overflow");
429 char *output = SAFE_MALLOC(output_size,
char *);
431 log_error(
"Failed to allocate output buffer for AVX2 color rendering");
436 const rgb_pixel_t *pixels_data = (
const rgb_pixel_t *)image->pixels;
// Last color written to the output; -1 means none yet, so the first pixel
// always triggers a color sequence.
439 int curR = -1, curG = -1, curB = -1;
440 int cur_color_idx = -1;
444 for (
int y = 0; y < height; y++) {
445 const rgb_pixel_t *row_pixels = &pixels_data[y * width];
// Full 32-pixel chunks only; the remainder of the row uses the scalar path.
449 while (x + 31 < width) {
452 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
453 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
459 const uint8_t R = avx2_r_buffer[i];
460 const uint8_t G = avx2_g_buffer[i];
461 const uint8_t B = avx2_b_buffer[i];
// Luminance 0-255 shifted right by 2 gives a 0-63 index.
462 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2;
464 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
// char_index_ramp gives the character identity used for run comparison.
466 const uint8_t char_idx = utf8_cache->char_index_ramp[luma_idx];
// Palette-index path, SIMD chunk: extend the run within this chunk.
473 while (i + run < 32 && x + run < width) {
474 const uint8_t next_R = avx2_r_buffer[i + run];
475 const uint8_t next_G = avx2_g_buffer[i + run];
476 const uint8_t next_B = avx2_b_buffer[i + run];
477 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
478 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
479 if (next_char_idx != char_idx)
// Emit a color change only when the palette index differs from the last one.
487 if (color_idx != cur_color_idx) {
488 if (use_background) {
489 pos = emit_set_256_color_bg_simple(pos, color_idx);
491 pos = emit_set_256_color_fg_simple(pos, color_idx);
493 cur_color_idx = color_idx;
// First character of the run, then a count or literal repeats.
497 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
498 pos += char_info->byte_len;
501 pos = emit_rle_count(pos, run - 1);
503 for (
int k = 1; k < run; k++) {
504 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
505 pos += char_info->byte_len;
// RGB path, SIMD chunk: a run requires the same character and the same RGB.
513 while (i + run < 32 && x + run < width) {
514 const uint8_t next_R = avx2_r_buffer[i + run];
515 const uint8_t next_G = avx2_g_buffer[i + run];
516 const uint8_t next_B = avx2_b_buffer[i + run];
517 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
518 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
519 if (next_char_idx != char_idx)
521 if (next_R != R || next_G != G || next_B != B)
// Emit a color change only when any channel differs from the last color.
527 if ((
int)R != curR || (
int)G != curG || (
int)B != curB) {
528 if (use_background) {
529 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
531 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
539 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
540 pos += char_info->byte_len;
543 pos = emit_rle_count(pos, run - 1);
545 for (
int k = 1; k < run; k++) {
546 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
547 pos += char_info->byte_len;
// Scalar remainder of the row: same logic, luminance computed per pixel.
558 const rgb_pixel_t *p = &row_pixels[x];
559 const uint8_t R = p->r, G = p->g, B = p->b;
560 const int luminance = (LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + 128) >> 8;
561 const uint8_t luma_idx = luminance >> 2;
563 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
565 const uint8_t char_idx = utf8_cache->char_index_ramp[luma_idx];
// Palette-index path, scalar: the run may extend to the end of the row.
572 while (x + run < width) {
573 const rgb_pixel_t *next_p = &row_pixels[x + run];
574 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
575 const uint8_t next_luma_idx = next_luminance >> 2;
576 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
577 if (next_char_idx != char_idx)
585 if (color_idx != cur_color_idx) {
586 if (use_background) {
587 pos = emit_set_256_color_bg_simple(pos, color_idx);
589 pos = emit_set_256_color_fg_simple(pos, color_idx);
591 cur_color_idx = color_idx;
595 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
596 pos += char_info->byte_len;
599 pos = emit_rle_count(pos, run - 1);
601 for (
int k = 1; k < run; k++) {
602 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
603 pos += char_info->byte_len;
// RGB path, scalar: a run requires the same character and the same RGB.
611 while (x + run < width) {
612 const rgb_pixel_t *next_p = &row_pixels[x + run];
613 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
614 const uint8_t next_luma_idx = next_luminance >> 2;
615 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
616 if (next_char_idx != char_idx)
618 if (next_p->r != R || next_p->g != G || next_p->b != B)
624 if ((
int)R != curR || (int)G != curG || (
int)B != curB) {
625 if (use_background) {
626 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
628 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
636 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
637 pos += char_info->byte_len;
640 pos = emit_rle_count(pos, run - 1);
642 for (
int k = 1; k < run; k++) {
643 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
644 pos += char_info->byte_len;
// Runs for every row except the last (presumably writes a row separator -
// the guarded statements are not visible here).
656 if (y < height - 1) {
/**
 * Cleanup entry point for the AVX2 renderer caches.
 *
 * The only statement visible in this listing is a developer-level log
 * message. NOTE(review): any actual teardown work and the closing brace are
 * not visible here - confirm against the full source whether this function
 * releases anything.
 */
667void avx2_caches_destroy(
void) {
669 log_dev(
"AVX2_CACHE: AVX2 optimized caches cleaned up");
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
bool rep_is_profitable(uint32_t runlen)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)