ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
avx2.c
Go to the documentation of this file.
1
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10#include <stdint.h>
11#include <stdbool.h>
12#include <ascii-chat/video/simd/avx2.h>
13#include <ascii-chat/video/simd/common.h>
14#include <ascii-chat/common.h>
15#include <ascii-chat/video/output_buffer.h>
16#include <ascii-chat/video/ansi_fast.h>
17#include <ascii-chat/util/overflow.h>
18
19#if SIMD_SUPPORT_AVX2
20#include <immintrin.h>
21
22// Simple emission functions for direct buffer writing
// Write the 256-colour foreground selector "ESC[38;5;<n>m" at pos.
//
// pos:       destination; the caller must have at least 11 writable bytes available.
// color_idx: palette index, printed in decimal without leading zeros.
// Returns the address one past the last byte written. No NUL terminator is added.
static inline char *emit_set_256_color_fg_simple(char *pos, uint8_t color_idx) {
  const unsigned hundreds = color_idx / 100u;
  const unsigned tens = (color_idx / 10u) % 10u;
  const unsigned ones = color_idx % 10u;

  memcpy(pos, "\x1b[38;5;", 7);
  pos += 7;

  // Leading digits are only printed once a non-zero digit has been seen.
  if (hundreds != 0) {
    *pos++ = (char)('0' + hundreds);
  }
  if (hundreds != 0 || tens != 0) {
    *pos++ = (char)('0' + tens);
  }
  *pos++ = (char)('0' + ones);

  *pos++ = 'm';
  return pos;
}
44
// Write the 256-colour background selector "ESC[48;5;<n>m" at pos.
//
// pos:       destination; the caller must have at least 11 writable bytes available.
// color_idx: palette index, printed in decimal without leading zeros.
// Returns the address one past the last byte written. No NUL terminator is added.
static inline char *emit_set_256_color_bg_simple(char *pos, uint8_t color_idx) {
  const unsigned hundreds = color_idx / 100u;
  const unsigned tens = (color_idx / 10u) % 10u;
  const unsigned ones = color_idx % 10u;

  memcpy(pos, "\x1b[48;5;", 7);
  pos += 7;

  // Leading digits are only printed once a non-zero digit has been seen.
  if (hundreds != 0) {
    *pos++ = (char)('0' + hundreds);
  }
  if (hundreds != 0 || tens != 0) {
    *pos++ = (char)('0' + tens);
  }
  *pos++ = (char)('0' + ones);

  *pos++ = 'm';
  return pos;
}
66
// Write the 24-bit foreground selector "ESC[38;2;<r>;<g>;<b>m" at pos.
//
// pos:     destination; the caller must have at least 19 writable bytes available.
// r, g, b: colour components, each printed in decimal without leading zeros.
// Returns the address one past the last byte written. No NUL terminator is added.
static inline char *emit_set_truecolor_fg_simple(char *pos, uint8_t r, uint8_t g, uint8_t b) {
  const uint8_t components[3] = {r, g, b};

  memcpy(pos, "\x1b[38;2", 6);
  pos += 6;

  // Each component is written as ";<decimal>".
  for (int c = 0; c < 3; c++) {
    const unsigned value = components[c];
    *pos++ = ';';
    if (value >= 100) {
      *pos++ = (char)('0' + value / 100);
    }
    if (value >= 10) {
      *pos++ = (char)('0' + (value / 10) % 10);
    }
    *pos++ = (char)('0' + value % 10);
  }

  *pos++ = 'm';
  return pos;
}
110
// Write the 24-bit background selector "ESC[48;2;<r>;<g>;<b>m" at pos.
//
// pos:     destination; the caller must have at least 19 writable bytes available.
// r, g, b: colour components, each printed in decimal without leading zeros.
// Returns the address one past the last byte written. No NUL terminator is added.
static inline char *emit_set_truecolor_bg_simple(char *pos, uint8_t r, uint8_t g, uint8_t b) {
  const uint8_t components[3] = {r, g, b};

  memcpy(pos, "\x1b[48;2", 6);
  pos += 6;

  // Each component is written as ";<decimal>".
  for (int c = 0; c < 3; c++) {
    const unsigned value = components[c];
    *pos++ = ';';
    if (value >= 100) {
      *pos++ = (char)('0' + value / 100);
    }
    if (value >= 10) {
      *pos++ = (char)('0' + (value / 10) % 10);
    }
    *pos++ = (char)('0' + value % 10);
  }

  *pos++ = 'm';
  return pos;
}
154
// Write the repeat escape "ESC[<count>b" at pos.
//
// pos:       destination; the caller must have room for the escape. The longest
//            possible output is 13 bytes (ESC, '[', ten digits, 'b'); counts up to
//            9999 need at most 7 bytes.
// rep_count: how many additional times the preceding character is repeated.
//            Any 32-bit value is formatted correctly; earlier versions produced a
//            non-digit byte for counts of 10000 and above.
// Returns the address one past the last byte written. No NUL terminator is added.
static inline char *emit_rle_count(char *pos, uint32_t rep_count) {
  char digits[10]; // UINT32_MAX has ten decimal digits
  int digit_count = 0;

  // Collect digits least-significant first; the do/while guarantees "0" for zero.
  do {
    digits[digit_count++] = (char)('0' + rep_count % 10u);
    rep_count /= 10u;
  } while (rep_count != 0);

  *pos++ = '\x1b';
  *pos++ = '[';
  while (digit_count > 0) {
    *pos++ = digits[--digit_count];
  }
  *pos++ = 'b';

  return pos;
}
180
181// Per-thread scratch buffers for the AVX2 renderers in this file: 32 bytes (one chunk of 32 pixels) each for R, G, B and luminance
182// They are overwritten for every chunk and reused across calls; kept small with the intent that they stay cache-resident
183// Non-static for shared library compatibility (still thread-local); ALIGNED_32 presumably requests 32-byte alignment - confirm macro
184THREAD_LOCAL ALIGNED_32 uint8_t avx2_r_buffer[32];
185THREAD_LOCAL ALIGNED_32 uint8_t avx2_g_buffer[32];
186THREAD_LOCAL ALIGNED_32 uint8_t avx2_b_buffer[32];
187THREAD_LOCAL ALIGNED_32 uint8_t avx2_luminance_buffer[32];
188
189// Optimized AVX2 function to load 32 RGB pixels and separate channels
190// Uses simple loop that auto-vectorizes to VMOVDQU + VPSHUFB
191static inline void avx2_load_rgb32_optimized(const rgb_pixel_t *__restrict pixels, uint8_t *__restrict r_out,
192 uint8_t *__restrict g_out, uint8_t *__restrict b_out) {
193 // Simple loop that compiler auto-vectorizes into efficient SIMD
194 for (int i = 0; i < 32; i++) {
195 r_out[i] = pixels[i].r;
196 g_out[i] = pixels[i].g;
197 b_out[i] = pixels[i].b;
198 }
199}
200
201// AVX2 function to compute luminance for 32 pixels
202static inline void avx2_compute_luminance_32(const uint8_t *r_vals, const uint8_t *g_vals, const uint8_t *b_vals,
203 uint8_t *luminance_out) {
204 // Load all 32 RGB values into AVX2 registers
205 __m256i r_all = _mm256_loadu_si256((const __m256i_u *)r_vals);
206 __m256i g_all = _mm256_loadu_si256((const __m256i_u *)g_vals);
207 __m256i b_all = _mm256_loadu_si256((const __m256i_u *)b_vals);
208
209 // Process low 16 pixels with accurate coefficients (16-bit math to prevent overflow)
210 __m256i r_lo = _mm256_unpacklo_epi8(r_all, _mm256_setzero_si256());
211 __m256i g_lo = _mm256_unpacklo_epi8(g_all, _mm256_setzero_si256());
212 __m256i b_lo = _mm256_unpacklo_epi8(b_all, _mm256_setzero_si256());
213
214 __m256i luma_16_lo = _mm256_mullo_epi16(r_lo, _mm256_set1_epi16(77));
215 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(g_lo, _mm256_set1_epi16(150)));
216 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(b_lo, _mm256_set1_epi16(29)));
217 luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_set1_epi16(128));
218 luma_16_lo = _mm256_srli_epi16(luma_16_lo, 8);
219
220 // Process high 16 pixels with accurate coefficients
221 __m256i r_hi = _mm256_unpackhi_epi8(r_all, _mm256_setzero_si256());
222 __m256i g_hi = _mm256_unpackhi_epi8(g_all, _mm256_setzero_si256());
223 __m256i b_hi = _mm256_unpackhi_epi8(b_all, _mm256_setzero_si256());
224
225 __m256i luma_16_hi = _mm256_mullo_epi16(r_hi, _mm256_set1_epi16(77));
226 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(g_hi, _mm256_set1_epi16(150)));
227 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(b_hi, _mm256_set1_epi16(29)));
228 luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_set1_epi16(128));
229 luma_16_hi = _mm256_srli_epi16(luma_16_hi, 8);
230
231 // Pack back to 8-bit
232 __m256i luma_packed = _mm256_packus_epi16(luma_16_lo, luma_16_hi);
233
234 // After unpack and pack operations, bytes are already in correct order [0-31]
235 // luma_16_lo contains pixels 0-7 (lower 128) and 16-23 (upper 128)
236 // luma_16_hi contains pixels 8-15 (lower 128) and 24-31 (upper 128)
237 // packus produces: [0-7, 8-15] in lower 128, [16-23, 24-31] in upper 128
238 // No permute needed - this is already the correct sequential order
239 _mm256_storeu_si256((__m256i_u *)luminance_out, luma_packed);
240}
241
242// Single-pass AVX2 monochrome renderer with immediate emission
243char *render_ascii_image_monochrome_avx2(const image_t *image, const char *ascii_chars) {
244 if (!image || !image->pixels || !ascii_chars) {
245 return NULL;
246 }
247
248 const int h = image->h;
249 const int w = image->w;
250
251 if (h <= 0 || w <= 0) {
252 return NULL;
253 }
254
255 // Get cached UTF-8 character mappings
256 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
257 if (!utf8_cache) {
258 log_error("Failed to get UTF-8 palette cache");
259 return NULL;
260 }
261
262 const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;
263
264 // Use malloc for output buffer (will be freed by caller)
265 // Each pixel can produce: 4 bytes UTF-8 + 8 bytes RLE escape (\x1b[9999b) = 12 bytes max
266 // Plus 1 newline per row
267 size_t output_size = (size_t)h * ((size_t)w * 12 + 1);
268
269 char *output = SAFE_MALLOC(output_size, char *);
270 if (!output) {
271 log_error("Failed to allocate output buffer for AVX2 rendering");
272 return NULL;
273 }
274
275 char *pos = output;
276
277 // Process row by row for better cache locality
278 for (int y = 0; y < h; y++) {
279 const rgb_pixel_t *row_pixels = &pixels[y * w];
280 int x = 0;
281
282 // AVX2 fast path: process 32 pixels at a time
283 while (x + 31 < w) {
284 // Process 32 pixels with AVX2 using thread-local buffers
285 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
286 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
287
288 // Convert to character indices and emit immediately
289 int i = 0;
290 while (i < 32) {
291 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2; // 0-63
292 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
293
294 // Find run length within this chunk
295 int run_end = i + 1;
296 while (run_end < 32 && x + run_end < w) {
297 const uint8_t next_luma_idx = avx2_luminance_buffer[run_end] >> 2;
298 if (next_luma_idx != luma_idx)
299 break;
300 run_end++;
301 }
302 int run = run_end - i;
303
304 // Emit UTF-8 character with RLE
305 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
306 pos += char_info->byte_len;
307
308 if (rep_is_profitable(run)) {
309 pos = emit_rle_count(pos, run - 1);
310 } else {
311 // Emit remaining characters
312 for (int k = 1; k < run; k++) {
313 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
314 pos += char_info->byte_len;
315 }
316 }
317 i = run_end;
318 }
319 x += 32;
320 }
321
322 // Scalar processing for remaining pixels (< 32)
323 while (x < w) {
324 const rgb_pixel_t *p = &row_pixels[x];
325 const int luminance = (LUMA_RED * p->r + LUMA_GREEN * p->g + LUMA_BLUE * p->b + 128) >> 8;
326 const uint8_t luma_idx = luminance >> 2; // 0-255 -> 0-63
327 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
328
329 // Find run length for RLE
330 int j = x + 1;
331 while (j < w) {
332 const rgb_pixel_t *next_p = &row_pixels[j];
333 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
334 const uint8_t next_luma_idx = next_luminance >> 2;
335 if (next_luma_idx != luma_idx)
336 break;
337 j++;
338 }
339 int run = j - x;
340
341 // Emit UTF-8 character with RLE
342 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
343 pos += char_info->byte_len;
344
345 if (rep_is_profitable(run)) {
346 pos = emit_rle_count(pos, run - 1);
347 } else {
348 for (int k = 1; k < run; k++) {
349 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
350 pos += char_info->byte_len;
351 }
352 }
353 x = j;
354 }
355
356 // Add reset sequence and newline after each row (except last)
357 *pos++ = '\x1b';
358 *pos++ = '[';
359 *pos++ = '0';
360 *pos++ = 'm';
361 if (y < h - 1) {
362 *pos++ = '\n';
363 }
364 }
365
366 *pos = '\0'; // Null terminate
367
368 return output;
369}
370
371// Single-pass AVX2 color renderer with immediate emission
372char *render_ascii_avx2_unified_optimized(const image_t *image, bool use_background, bool use_256color,
373 const char *ascii_chars) {
374 if (!image || !image->pixels) {
375 return NULL;
376 }
377
378 const int width = image->w;
379 const int height = image->h;
380
381 if (width <= 0 || height <= 0) {
382 char *empty;
383 empty = SAFE_MALLOC(1, char *);
384 empty[0] = '\0';
385 return empty;
386 }
387
388 // Get cached UTF-8 character mappings
389 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
390 if (!utf8_cache) {
391 log_error("Failed to get UTF-8 palette cache for AVX2 color");
392 return NULL;
393 }
394
395 // Use malloc for output buffer (will be freed by caller)
396 size_t bytes_per_pixel = use_256color ? 10u : 25u; // Conservative estimates
397
398 // Calculate buffer size with overflow checking
399 size_t height_times_width;
400 if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
401 log_error("Buffer size overflow: height * width overflow");
402 return NULL;
403 }
404
405 size_t pixel_data_size;
406 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
407 log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
408 return NULL;
409 }
410
411 size_t height_times_16;
412 if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
413 log_error("Buffer size overflow: height * 16 overflow");
414 return NULL;
415 }
416
417 size_t temp;
418 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
419 log_error("Buffer size overflow: pixel_data + height*16 overflow");
420 return NULL;
421 }
422
423 size_t output_size;
424 if (checked_size_add(temp, 1024u, &output_size) != ASCIICHAT_OK) {
425 log_error("Buffer size overflow: total output size overflow");
426 return NULL;
427 }
428
429 char *output = SAFE_MALLOC(output_size, char *);
430 if (!output) {
431 log_error("Failed to allocate output buffer for AVX2 color rendering");
432 return NULL;
433 }
434
435 char *pos = output;
436 const rgb_pixel_t *pixels_data = (const rgb_pixel_t *)image->pixels;
437
438 // Track current color state
439 int curR = -1, curG = -1, curB = -1;
440 int cur_color_idx = -1;
441
442 // Generate output row by row with single-pass processing
443
444 for (int y = 0; y < height; y++) {
445 const rgb_pixel_t *row_pixels = &pixels_data[y * width];
446 int x = 0;
447
448 // AVX2 fast path: process 32 pixels at a time
449 while (x + 31 < width) {
450
451 // Process 32 pixels with AVX2 using thread-local buffers
452 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
453 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
454
455 // Process each pixel in the chunk
456 int i = 0;
457 (void)x;
458 while (i < 32) {
459 const uint8_t R = avx2_r_buffer[i];
460 const uint8_t G = avx2_g_buffer[i];
461 const uint8_t B = avx2_b_buffer[i];
462 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2;
463 // Use luma_idx directly to index cache64 (0-63), not char_index (0-char_count)
464 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
465 // For RLE comparison, we need char_idx
466 const uint8_t char_idx = utf8_cache->char_index_ramp[luma_idx];
467
468 if (use_256color) {
469 uint8_t color_idx = rgb_to_256color(R, G, B);
470
471 // Find run length
472 int run = 1;
473 while (i + run < 32 && x + run < width) {
474 const uint8_t next_R = avx2_r_buffer[i + run];
475 const uint8_t next_G = avx2_g_buffer[i + run];
476 const uint8_t next_B = avx2_b_buffer[i + run];
477 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
478 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
479 if (next_char_idx != char_idx)
480 break;
481 if (rgb_to_256color(next_R, next_G, next_B) != color_idx)
482 break;
483 run++;
484 }
485
486 // Set color if changed
487 if (color_idx != cur_color_idx) {
488 if (use_background) {
489 pos = emit_set_256_color_bg_simple(pos, color_idx);
490 } else {
491 pos = emit_set_256_color_fg_simple(pos, color_idx);
492 }
493 cur_color_idx = color_idx;
494 }
495
496 // Emit character with RLE
497 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
498 pos += char_info->byte_len;
499
500 if (rep_is_profitable(run)) {
501 pos = emit_rle_count(pos, run - 1);
502 } else {
503 for (int k = 1; k < run; k++) {
504 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
505 pos += char_info->byte_len;
506 }
507 }
508 i += run;
509 } else {
510 // Truecolor mode
511 // Find run length
512 int run = 1;
513 while (i + run < 32 && x + run < width) {
514 const uint8_t next_R = avx2_r_buffer[i + run];
515 const uint8_t next_G = avx2_g_buffer[i + run];
516 const uint8_t next_B = avx2_b_buffer[i + run];
517 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
518 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
519 if (next_char_idx != char_idx)
520 break;
521 if (next_R != R || next_G != G || next_B != B)
522 break;
523 run++;
524 }
525
526 // Set color if changed
527 if ((int)R != curR || (int)G != curG || (int)B != curB) {
528 if (use_background) {
529 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
530 } else {
531 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
532 }
533 curR = R;
534 curG = G;
535 curB = B;
536 }
537
538 // Emit character with RLE
539 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
540 pos += char_info->byte_len;
541
542 if (rep_is_profitable(run)) {
543 pos = emit_rle_count(pos, run - 1);
544 } else {
545 for (int k = 1; k < run; k++) {
546 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
547 pos += char_info->byte_len;
548 }
549 }
550 i += run;
551 }
552 }
553 x += 32;
554 }
555
556 // Scalar processing for remaining pixels (< 32)
557 while (x < width) {
558 const rgb_pixel_t *p = &row_pixels[x];
559 const uint8_t R = p->r, G = p->g, B = p->b;
560 const int luminance = (LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + 128) >> 8;
561 const uint8_t luma_idx = luminance >> 2;
562 // Use luma_idx directly to index cache64 (0-63), not char_index (0-char_count)
563 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
564 // For RLE comparison, we need char_idx
565 const uint8_t char_idx = utf8_cache->char_index_ramp[luma_idx];
566
567 if (use_256color) {
568 uint8_t color_idx = rgb_to_256color(R, G, B);
569
570 // Find run length
571 int run = 1;
572 while (x + run < width) {
573 const rgb_pixel_t *next_p = &row_pixels[x + run];
574 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
575 const uint8_t next_luma_idx = next_luminance >> 2;
576 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
577 if (next_char_idx != char_idx)
578 break;
579 if (rgb_to_256color(next_p->r, next_p->g, next_p->b) != color_idx)
580 break;
581 run++;
582 }
583
584 // Set color if changed
585 if (color_idx != cur_color_idx) {
586 if (use_background) {
587 pos = emit_set_256_color_bg_simple(pos, color_idx);
588 } else {
589 pos = emit_set_256_color_fg_simple(pos, color_idx);
590 }
591 cur_color_idx = color_idx;
592 }
593
594 // Emit character with RLE
595 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
596 pos += char_info->byte_len;
597
598 if (rep_is_profitable(run)) {
599 pos = emit_rle_count(pos, run - 1);
600 } else {
601 for (int k = 1; k < run; k++) {
602 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
603 pos += char_info->byte_len;
604 }
605 }
606 x += run;
607 } else {
608 // Truecolor mode
609 // Find run length
610 int run = 1;
611 while (x + run < width) {
612 const rgb_pixel_t *next_p = &row_pixels[x + run];
613 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
614 const uint8_t next_luma_idx = next_luminance >> 2;
615 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
616 if (next_char_idx != char_idx)
617 break;
618 if (next_p->r != R || next_p->g != G || next_p->b != B)
619 break;
620 run++;
621 }
622
623 // Set color if changed
624 if ((int)R != curR || (int)G != curG || (int)B != curB) {
625 if (use_background) {
626 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
627 } else {
628 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
629 }
630 curR = R;
631 curG = G;
632 curB = B;
633 }
634
635 // Emit character with RLE
636 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
637 pos += char_info->byte_len;
638
639 if (rep_is_profitable(run)) {
640 pos = emit_rle_count(pos, run - 1);
641 } else {
642 for (int k = 1; k < run; k++) {
643 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
644 pos += char_info->byte_len;
645 }
646 }
647 x += run;
648 }
649 }
650
651 // Add reset sequence and newline after each row (except last)
652 *pos++ = '\x1b';
653 *pos++ = '[';
654 *pos++ = '0';
655 *pos++ = 'm';
656 if (y < height - 1) {
657 *pos++ = '\n';
658 }
659 }
660
661 *pos = '\0'; // Null terminate
662
663 return output;
664}
665
// Shutdown hook for the AVX2 renderer's cache resources.
//
// This translation unit holds no heap-allocated caches of its own (the original
// note states the AVX2 path uses the shared caches from common.c), so the only
// effect is a developer-level log line.
void avx2_caches_destroy(void) {
  log_dev("AVX2_CACHE: AVX2 optimized caches cleaned up");
}
671
672#endif /* SIMD_SUPPORT_AVX2 */
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
Definition ansi_fast.c:230
bool rep_is_profitable(uint32_t runlen)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)