14#include "../output_buffer.h"
15#include "../ansi_fast.h"
/*
 * emit_set_256_color_fg_simple - write color_idx as decimal ASCII text at pos.
 *
 * pos:        write cursor; every visible store writes one char and advances it.
 * color_idx:  0..255 (uint8_t); emitted with no leading zeros (1 to 3 digits).
 *
 * NOTE(review): this listing has original line numbers fused onto the text and
 * omits original lines 23-29, 37 and 39-43. Whatever precedes the digits, the
 * line opening the single-digit branch, and the return statement are not
 * visible. Presumably the hidden lines write the foreground escape prefix and
 * terminator and return the advanced cursor - confirm against the real file.
 * No bounds check is visible in the shown lines.
 */
22static inline char *emit_set_256_color_fg_simple(
char *pos,
uint8_t color_idx) {
// Three-digit values: hundreds, tens, ones.
30 if (color_idx >= 100) {
31 *pos++ =
'0' + (color_idx / 100);
32 *pos++ =
'0' + ((color_idx / 10) % 10);
33 *pos++ =
'0' + (color_idx % 10);
34 }
// Two-digit values: tens, ones.
else if (color_idx >= 10) {
35 *pos++ =
'0' + (color_idx / 10);
36 *pos++ =
'0' + (color_idx % 10);
// Single digit (the line that opens this branch is missing from the listing).
38 *pos++ =
'0' + color_idx;
/*
 * emit_set_256_color_bg_simple - write color_idx as decimal ASCII text at pos.
 *
 * pos:        write cursor; every visible store writes one char and advances it.
 * color_idx:  0..255 (uint8_t); emitted with no leading zeros (1 to 3 digits).
 *
 * The visible digit logic is the same as in emit_set_256_color_fg_simple.
 *
 * NOTE(review): original lines 45-51, 59 and 61 onward are absent from this
 * listing, so the statements before the digits, the line opening the
 * single-digit branch, and the return are not visible. Presumably the hidden
 * lines write the background escape prefix and terminator - confirm against
 * the real file. No bounds check is visible in the shown lines.
 */
44static inline char *emit_set_256_color_bg_simple(
char *pos,
uint8_t color_idx) {
// Three-digit values: hundreds, tens, ones.
52 if (color_idx >= 100) {
53 *pos++ =
'0' + (color_idx / 100);
54 *pos++ =
'0' + ((color_idx / 10) % 10);
55 *pos++ =
'0' + (color_idx % 10);
56 }
// Two-digit values: tens, ones.
else if (color_idx >= 10) {
57 *pos++ =
'0' + (color_idx / 10);
58 *pos++ =
'0' + (color_idx % 10);
// Single digit (the line that opens this branch is missing from the listing).
60 *pos++ =
'0' + color_idx;
/*
 * NOTE(review): fragment. The enclosing function header is missing from this
 * listing (the embedded line numbers jump from 60 to 75). This is presumably
 * the body of emit_set_truecolor_fg_simple, which is called by the renderer
 * further down - confirm against the real file.
 *
 * What the visible lines do: write r, then g, then b as decimal ASCII digits
 * through the pos cursor, using a three-digit form and a two-digit form for
 * each component. The conditions guarding the r and g forms, the single-digit
 * stores, and any separators between components are not shown.
 */
// r, three-digit form.
75 *pos++ =
'0' + (r / 100);
76 *pos++ =
'0' + ((r / 10) % 10);
77 *pos++ =
'0' + (r % 10);
// r, two-digit form.
79 *pos++ =
'0' + (r / 10);
80 *pos++ =
'0' + (r % 10);
// g, three-digit form.
86 *pos++ =
'0' + (g / 100);
87 *pos++ =
'0' + ((g / 10) % 10);
88 *pos++ =
'0' + (g % 10);
// g, two-digit form.
90 *pos++ =
'0' + (g / 10);
91 *pos++ =
'0' + (g % 10);
// b, three-digit form.
97 *pos++ =
'0' + (b / 100);
98 *pos++ =
'0' + ((b / 10) % 10);
99 *pos++ =
'0' + (b % 10);
100 }
// b, two-digit form.
else if (b >= 10) {
101 *pos++ =
'0' + (b / 10);
102 *pos++ =
'0' + (b % 10);
/*
 * NOTE(review): fragment. The enclosing function header is missing from this
 * listing (the embedded line numbers jump from 102 to 119). This is presumably
 * the body of emit_set_truecolor_bg_simple, which is called by the renderer
 * further down - confirm against the real file.
 *
 * What the visible lines do: write r, then g, then b as decimal ASCII digits
 * through the pos cursor, with no leading zeros for values of 10 and above.
 * The opening condition for each three-digit form, the single-digit stores,
 * and any separators between components are not shown.
 */
// r, three-digit form.
119 *pos++ =
'0' + (r / 100);
120 *pos++ =
'0' + ((r / 10) % 10);
121 *pos++ =
'0' + (r % 10);
122 }
// r, two-digit form.
else if (r >= 10) {
123 *pos++ =
'0' + (r / 10);
124 *pos++ =
'0' + (r % 10);
// g, three-digit form.
130 *pos++ =
'0' + (g / 100);
131 *pos++ =
'0' + ((g / 10) % 10);
132 *pos++ =
'0' + (g % 10);
133 }
// g, two-digit form.
else if (g >= 10) {
134 *pos++ =
'0' + (g / 10);
135 *pos++ =
'0' + (g % 10);
// b, three-digit form.
141 *pos++ =
'0' + (b / 100);
142 *pos++ =
'0' + ((b / 10) % 10);
143 *pos++ =
'0' + (b % 10);
144 }
// b, two-digit form.
else if (b >= 10) {
145 *pos++ =
'0' + (b / 10);
146 *pos++ =
'0' + (b % 10);
/*
 * emit_rle_count - write rep_count as decimal ASCII text at pos.
 *
 * pos:        write cursor; every visible store writes one char and advances it.
 * rep_count:  repeat count; the renderers in this listing pass (run - 1).
 *
 * The visible branches cover 1 to 4 digits with no leading zeros.
 *
 * NOTE(review): the widest branch writes '0' + (rep_count / 1000) as a single
 * character, so a rep_count of 10000 or more would store a character past '9'
 * instead of a fifth digit. The 32-pixel paths cap runs at 32, but the scalar
 * paths appear to be limited only by the row width - confirm whether rows that
 * wide can reach this function.
 *
 * NOTE(review): original lines 156-159, 172 and 174 onward are absent from
 * this listing, so the statements before the digits, the line opening the
 * single-digit branch, any trailing characters, and the return are not
 * visible.
 */
155static inline char *emit_rle_count(
char *pos,
uint32_t rep_count) {
// Four digits: thousands, hundreds, tens, ones.
160 if (rep_count >= 1000) {
161 *pos++ =
'0' + (rep_count / 1000);
162 *pos++ =
'0' + ((rep_count / 100) % 10);
163 *pos++ =
'0' + ((rep_count / 10) % 10);
164 *pos++ =
'0' + (rep_count % 10);
165 }
// Three digits.
else if (rep_count >= 100) {
166 *pos++ =
'0' + (rep_count / 100);
167 *pos++ =
'0' + ((rep_count / 10) % 10);
168 *pos++ =
'0' + (rep_count % 10);
169 }
// Two digits.
else if (rep_count >= 10) {
170 *pos++ =
'0' + (rep_count / 10);
171 *pos++ =
'0' + (rep_count % 10);
// Single digit (the line that opens this branch is missing from the listing).
173 *pos++ =
'0' + rep_count;
/*
 * avx2_load_rgb32_optimized - split 32 interleaved pixels into separate
 * r, g and b byte arrays.
 *
 * pixels:  source pixels; elements 0..31 are read, so at least 32 must be
 *          readable.
 * r_out:   receives the 32 red bytes. g_out and b_out are written the same
 *          way, but their parameter declarations are on lines missing from
 *          this listing.
 *
 * The visible loop is a plain per-element copy; no intrinsics appear in the
 * shown lines. pixels and r_out are declared __restrict.
 *
 * NOTE(review): the closing braces of the loop and the function are not in
 * this listing.
 */
189static inline void avx2_load_rgb32_optimized(
const rgb_pixel_t *__restrict pixels,
uint8_t *__restrict r_out,
192 for (
int i = 0; i < 32; i++) {
193 r_out[i] = pixels[i].r;
194 g_out[i] = pixels[i].g;
195 b_out[i] = pixels[i].b;
/*
 * avx2_compute_luminance_32 - compute an 8-bit luminance for 32 pixels from
 * separate r, g and b byte arrays.
 *
 * r_vals, g_vals, b_vals:  32 bytes each, read with unaligned 256-bit loads.
 * luminance_out:           destination of the final unaligned 256-bit store;
 *                          its parameter declaration is on a line missing
 *                          from this listing.
 *
 * Per pixel the visible arithmetic is (77*r + 150*g + 29*b + 128) >> 8 in
 * 16-bit lanes. The weights sum to 256, so the largest intermediate value is
 * 255*256 + 128 = 65408, which still fits in an unsigned 16-bit lane, and the
 * shift is a logical one.
 *
 * NOTE(review): possible byte-order bug. unpacklo/unpackhi and packus all work
 * inside each 128-bit lane, so luma_packed appears to be in source order
 * already (bytes 0-15 in the low lane, 16-31 in the high lane). The 0xD8
 * permute then swaps the two middle 64-bit quarters, which would exchange
 * results 8-15 with 16-23. That permute is the usual fix-up after a
 * cross-lane widening, which is not what is used here. Verify with a test
 * against a scalar reference before changing anything; original lines 231-235
 * are not in this listing and may explain the intent.
 */
200static inline void avx2_compute_luminance_32(
const uint8_t *r_vals,
const uint8_t *g_vals,
const uint8_t *b_vals,
// Load 32 bytes of each channel.
203 __m256i r_all = _mm256_loadu_si256((__m256i *)r_vals);
204 __m256i g_all = _mm256_loadu_si256((__m256i *)g_vals);
205 __m256i b_all = _mm256_loadu_si256((__m256i *)b_vals);
// Widen the low 8 bytes of each 128-bit lane to 16 bits by interleaving with zero.
208 __m256i r_lo = _mm256_unpacklo_epi8(r_all, _mm256_setzero_si256());
209 __m256i g_lo = _mm256_unpacklo_epi8(g_all, _mm256_setzero_si256());
210 __m256i b_lo = _mm256_unpacklo_epi8(b_all, _mm256_setzero_si256());
// Weighted sum, add 128 for rounding, then divide by 256.
212 __m256i luma_16_lo = _mm256_mullo_epi16(r_lo, _mm256_set1_epi16(77));
213 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(g_lo, _mm256_set1_epi16(150)));
214 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(b_lo, _mm256_set1_epi16(29)));
215 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_set1_epi16(128));
216 luma_16_lo = _mm256_srli_epi16(luma_16_lo, 8);
// Same computation for the high 8 bytes of each 128-bit lane.
219 __m256i r_hi = _mm256_unpackhi_epi8(r_all, _mm256_setzero_si256());
220 __m256i g_hi = _mm256_unpackhi_epi8(g_all, _mm256_setzero_si256());
221 __m256i b_hi = _mm256_unpackhi_epi8(b_all, _mm256_setzero_si256());
223 __m256i luma_16_hi = _mm256_mullo_epi16(r_hi, _mm256_set1_epi16(77));
224 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(g_hi, _mm256_set1_epi16(150)));
225 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(b_hi, _mm256_set1_epi16(29)));
226 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_set1_epi16(128));
227 luma_16_hi = _mm256_srli_epi16(luma_16_hi, 8);
// Narrow both halves back to bytes with unsigned saturation (per 128-bit lane).
230 __m256i luma_packed = _mm256_packus_epi16(luma_16_lo, luma_16_hi);
// 0xD8 selects 64-bit quarters in the order 0, 2, 1, 3 (see the review note above).
236 __m256i luma_final = _mm256_permute4x64_epi64(luma_packed, 0xD8);
238 _mm256_storeu_si256((__m256i *)luminance_out, luma_final);
/*
 * render_ascii_image_monochrome_avx2 - render an image to a char buffer as
 * monochrome ASCII, collapsing runs of identical luminance indices.
 *
 * image:        source image; image, image->pixels, image->w and image->h are
 *               read.
 * ascii_chars:  character palette; checked for NULL in the first guard.
 *
 * Returns char *. The values returned on the failure paths and on success are
 * on lines missing from this listing; presumably the caller owns the returned
 * buffer - confirm against the real file.
 *
 * Visible flow:
 *   - reject NULL arguments and non-positive dimensions;
 *   - log an error if the UTF-8 palette cache lookup fails;
 *   - size the output as h * (w * 12 + 1) bytes and log if allocation fails;
 *   - per row, feed 32-pixel chunks through avx2_load_rgb32_optimized and
 *     avx2_compute_luminance_32, then handle the remaining pixels one by one;
 *   - reduce each luminance to a 6-bit index (>> 2), measure the run of equal
 *     indices, and emit either emit_rle_count(run - 1) or a k = 1..run-1 loop.
 *
 * NOTE(review): much of the body (returns, allocation call, loop headers, the
 * code choosing between the two emit strategies) is absent from this listing.
 */
242char *render_ascii_image_monochrome_avx2(
const image_t *image,
const char *ascii_chars) {
243 if (!image || !image->
pixels || !ascii_chars) {
247 const int h = image->
h;
248 const int w = image->
w;
250 if (h <= 0 || w <= 0) {
257 log_error(
"Failed to get UTF-8 palette cache");
261 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->
pixels;
// NOTE(review): plain size_t arithmetic; the color renderer in this file uses
// checked_size_mul/checked_size_add for the equivalent computation.
266 size_t output_size = (size_t)h * ((
size_t)w * 12 + 1);
270 log_error(
"Failed to allocate output buffer for AVX2 rendering");
277 for (
int y = 0; y < h; y++) {
// NOTE(review): y * w is evaluated in int before indexing; very large images
// could overflow it - confirm upstream limits on w and h.
278 const rgb_pixel_t *row_pixels = &pixels[y * w];
// 32-pixel chunk: split channels, then compute 32 luminance bytes.
284 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
285 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
// 8-bit luminance reduced to a 6-bit index (0..63).
290 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2;
// Extend the run while still inside the chunk and the row.
295 while (run_end < 32 && x + run_end < w) {
296 const uint8_t next_luma_idx = avx2_luminance_buffer[run_end] >> 2;
297 if (next_luma_idx != luma_idx)
301 int run = run_end - i;
// Repeat count covers the copies after the first character.
308 pos = emit_rle_count(pos, run - 1);
// Alternative path: iterate over the run - 1 remaining copies.
311 for (
int k = 1; k < run; k++) {
// Scalar path for pixels not handled by a full 32-pixel chunk.
323 const rgb_pixel_t *p = &row_pixels[x];
325 const uint8_t luma_idx = luminance >> 2;
331 const rgb_pixel_t *next_p = &row_pixels[j];
333 const uint8_t next_luma_idx = next_luminance >> 2;
334 if (next_luma_idx != luma_idx)
345 pos = emit_rle_count(pos, run - 1);
347 for (
int k = 1; k < run; k++) {
/*
 * render_ascii_avx2_unified_optimized - render an image to a char buffer as
 * colored ASCII, in 256-color or truecolor mode, coloring either the
 * foreground or the background.
 *
 * image:           source image; image, image->pixels, image->w and image->h
 *                  are read.
 * use_background:  true selects the *_bg_simple emitters, false the
 *                  *_fg_simple emitters.
 * use_256color:    true selects the palette-index path (10 bytes budgeted per
 *                  pixel), false the R,G,B path (25 bytes budgeted per pixel).
 * ascii_chars:     character palette.
 *
 * Returns char *. The values returned on the failure paths and on success are
 * on lines missing from this listing; presumably the caller owns the returned
 * buffer - confirm against the real file.
 *
 * Visible flow:
 *   - reject a NULL image or pixel pointer and non-positive dimensions;
 *   - log an error if the UTF-8 palette cache lookup fails;
 *   - compute height * width * bytes_per_pixel + height * 16 + 1024 with
 *     overflow-checked helpers, logging a distinct message at each failing
 *     step, then log if allocation fails;
 *   - per row, process 32-pixel chunks while x + 31 < width, then a scalar
 *     tail; in both, runs are detected, a color sequence is emitted only when
 *     the color differs from the last one emitted, and the run is written via
 *     emit_rle_count(run - 1) or a k = 1..run-1 loop.
 *
 * NOTE(review): the definitions of color_idx, char_idx, next_char_idx,
 * luminance, run, i, x and pos, the allocation call, all return statements and
 * every else line are absent from this listing.
 */
371char *render_ascii_avx2_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
372 const char *ascii_chars) {
// NOTE(review): unlike the monochrome renderer, this visible guard does not
// test ascii_chars - confirm whether a hidden line or the cache lookup does.
373 if (!image || !image->
pixels) {
377 const int width = image->
w;
378 const int height = image->
h;
380 if (width <= 0 || height <= 0) {
390 log_error(
"Failed to get UTF-8 palette cache for AVX2 color");
// Per-pixel output budget depends on the color mode.
395 size_t bytes_per_pixel = use_256color ? 10u : 25u;
// Overflow-checked size computation, one step at a time.
398 size_t height_times_width;
399 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) !=
ASCIICHAT_OK) {
400 log_error(
"Buffer size overflow: height * width overflow");
404 size_t pixel_data_size;
405 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) !=
ASCIICHAT_OK) {
406 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
// Extra 16 bytes per row.
410 size_t height_times_16;
411 if (checked_size_mul((
size_t)height, 16u, &height_times_16) !=
ASCIICHAT_OK) {
412 log_error(
"Buffer size overflow: height * 16 overflow");
417 if (checked_size_add(pixel_data_size, height_times_16, &temp) !=
ASCIICHAT_OK) {
418 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
// Fixed 1024-byte margin on top.
423 if (checked_size_add(temp, 1024u, &output_size) !=
ASCIICHAT_OK) {
424 log_error(
"Buffer size overflow: total output size overflow");
430 log_error(
"Failed to allocate output buffer for AVX2 color rendering");
435 const rgb_pixel_t *pixels_data = (
const rgb_pixel_t *)image->
pixels;
// Last emitted color; -1 cannot equal any uint8_t value, so the first pixel
// always compares as different.
438 int curR = -1, curG = -1, curB = -1;
439 int cur_color_idx = -1;
443 for (
int y = 0; y < height; y++) {
// NOTE(review): y * width is evaluated in int before indexing; very large
// images could overflow it even though the buffer size was checked in size_t.
444 const rgb_pixel_t *row_pixels = &pixels_data[y * width];
// Full 32-pixel chunks only.
448 while (x + 31 < width) {
451 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
452 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
457 const uint8_t R = avx2_r_buffer[i];
458 const uint8_t G = avx2_g_buffer[i];
459 const uint8_t B = avx2_b_buffer[i];
// 8-bit luminance reduced to a 6-bit index (0..63).
460 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2;
// 256-color path: extend the run while the character index matches.
469 while (i + run < 32 && x + run < width) {
470 const uint8_t next_R = avx2_r_buffer[i + run];
471 const uint8_t next_G = avx2_g_buffer[i + run];
472 const uint8_t next_B = avx2_b_buffer[i + run];
473 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
475 if (next_char_idx != char_idx)
// Emit a color change only when the palette index differs from the last one.
483 if (color_idx != cur_color_idx) {
484 if (use_background) {
485 pos = emit_set_256_color_bg_simple(pos, color_idx);
487 pos = emit_set_256_color_fg_simple(pos, color_idx);
489 cur_color_idx = color_idx;
497 pos = emit_rle_count(pos, run - 1);
499 for (
int k = 1; k < run; k++) {
// Truecolor path: a run requires the same character index and the same R, G, B.
509 while (i + run < 32 && x + run < width) {
510 const uint8_t next_R = avx2_r_buffer[i + run];
511 const uint8_t next_G = avx2_g_buffer[i + run];
512 const uint8_t next_B = avx2_b_buffer[i + run];
513 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
515 if (next_char_idx != char_idx)
517 if (next_R != R || next_G != G || next_B != B)
// Emit a color change only when any component differs from the last one.
523 if ((
int)R != curR || (
int)G != curG || (
int)B != curB) {
524 if (use_background) {
525 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
527 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
539 pos = emit_rle_count(pos, run - 1);
541 for (
int k = 1; k < run; k++) {
// Scalar tail: same logic, reading pixels directly from the row.
554 const rgb_pixel_t *p = &row_pixels[x];
555 const uint8_t R = p->r, G = p->g, B = p->b;
557 const uint8_t luma_idx = luminance >> 2;
// Scalar 256-color run detection; bounded only by the row width.
566 while (x + run < width) {
567 const rgb_pixel_t *next_p = &row_pixels[x + run];
569 const uint8_t next_luma_idx = next_luminance >> 2;
571 if (next_char_idx != char_idx)
579 if (color_idx != cur_color_idx) {
580 if (use_background) {
581 pos = emit_set_256_color_bg_simple(pos, color_idx);
583 pos = emit_set_256_color_fg_simple(pos, color_idx);
585 cur_color_idx = color_idx;
593 pos = emit_rle_count(pos, run - 1);
595 for (
int k = 1; k < run; k++) {
// Scalar truecolor run detection; bounded only by the row width.
605 while (x + run < width) {
606 const rgb_pixel_t *next_p = &row_pixels[x + run];
608 const uint8_t next_luma_idx = next_luminance >> 2;
610 if (next_char_idx != char_idx)
612 if (next_p->r != R || next_p->g != G || next_p->b != B)
618 if ((
int)R != curR || (int)G != curG || (
int)B != curB) {
619 if (use_background) {
620 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
622 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
634 pos = emit_rle_count(pos, run - 1);
636 for (
int k = 1; k < run; k++) {
// Runs for every row except the last; presumably writes the row separator - confirm.
650 if (y < height - 1) {
/*
 * avx2_caches_destroy - teardown hook for the AVX2 renderer.
 *
 * Takes no arguments and returns nothing. The only statement visible in this
 * listing is a debug log message.
 *
 * NOTE(review): original line 662 and the closing brace are absent from this
 * listing, so any actual cleanup work cannot be confirmed from here.
 */
661void avx2_caches_destroy(
void) {
663 log_debug(
"AVX2_CACHE: AVX2 optimized caches cleaned up");
AVX2-optimized ASCII rendering functions.
#define SAFE_MALLOC(size, cast)
#define log_error(...)
Log an ERROR message.
#define log_debug(...)
Log a DEBUG message.
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
uint8_t char_index_ramp[64]
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
Convert RGB to 256-color palette index.
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
✅ Safe Integer Arithmetic and Overflow Detection
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
UTF-8 character structure.
UTF-8 palette cache structure.
Common SIMD utilities and structures.