ascii-chat 0.6.0
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
avx2.c
Go to the documentation of this file.
1
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10#include <stdint.h>
11#include <stdbool.h>
12#include "avx2.h"
13#include "common.h"
14#include "../output_buffer.h"
15#include "../ansi_fast.h"
16#include "util/overflow.h"
17
18#if SIMD_SUPPORT_AVX2
19#include <immintrin.h>
20
21// Simple emission functions for direct buffer writing
// Write the SGR sequence "ESC[38;5;<color_idx>m" (256-color foreground) at pos.
// The index is written in decimal without leading zeros (1-3 digits).
// No bounds checking is performed; the caller must provide at least 11 bytes.
// Returns the position immediately after the trailing 'm'.
static inline char *emit_set_256_color_fg_simple(char *pos, uint8_t color_idx) {
  static const char prefix[] = "\x1b[38;5;";
  const size_t prefix_len = sizeof(prefix) - 1;
  char *out = pos;

  memcpy(out, prefix, prefix_len);
  out += prefix_len;

  // Hundreds and tens digits are only written when they are significant
  if (color_idx >= 100) {
    *out++ = (char)('0' + color_idx / 100);
  }
  if (color_idx >= 10) {
    *out++ = (char)('0' + (color_idx / 10) % 10);
  }
  *out++ = (char)('0' + color_idx % 10);

  *out++ = 'm';
  return out;
}
43
// Write the SGR sequence "ESC[48;5;<color_idx>m" (256-color background) at pos.
// The index is written in decimal without leading zeros (1-3 digits).
// No bounds checking is performed; the caller must provide at least 11 bytes.
// Returns the position immediately after the trailing 'm'.
static inline char *emit_set_256_color_bg_simple(char *pos, uint8_t color_idx) {
  static const char prefix[] = "\x1b[48;5;";
  char *out = pos;

  memcpy(out, prefix, sizeof(prefix) - 1);
  out += sizeof(prefix) - 1;

  // Collect digits least-significant first, then write them back in reverse
  char digits[3];
  int digit_count = 0;
  unsigned remaining = color_idx;
  do {
    digits[digit_count++] = (char)('0' + remaining % 10);
    remaining /= 10;
  } while (remaining != 0);
  while (digit_count > 0) {
    *out++ = digits[--digit_count];
  }

  *out++ = 'm';
  return out;
}
65
// Write the SGR sequence "ESC[38;2;<r>;<g>;<b>m" (24-bit foreground) at pos.
// Each component is written in decimal without leading zeros (1-3 digits).
// No bounds checking is performed; the caller must provide at least 19 bytes.
// Returns the position immediately after the trailing 'm'.
static inline char *emit_set_truecolor_fg_simple(char *pos, uint8_t r, uint8_t g, uint8_t b) {
  static const char prefix[] = "\x1b[38;2;";
  const uint8_t channels[3] = {r, g, b};
  char *out = pos;

  memcpy(out, prefix, sizeof(prefix) - 1);
  out += sizeof(prefix) - 1;

  for (int c = 0; c < 3; c++) {
    const uint8_t value = channels[c];
    if (value >= 100) {
      *out++ = (char)('0' + value / 100);
    }
    if (value >= 10) {
      *out++ = (char)('0' + (value / 10) % 10);
    }
    *out++ = (char)('0' + value % 10);
    // ';' separates components, 'm' terminates the sequence after blue
    *out++ = (c < 2) ? ';' : 'm';
  }
  return out;
}
109
// Write the SGR sequence "ESC[48;2;<r>;<g>;<b>m" (24-bit background) at pos.
// Each component is written in decimal without leading zeros (1-3 digits).
// No bounds checking is performed; the caller must provide at least 19 bytes.
// Returns the position immediately after the trailing 'm'.
static inline char *emit_set_truecolor_bg_simple(char *pos, uint8_t r, uint8_t g, uint8_t b) {
  static const char prefix[] = "\x1b[48;2;";
  const uint8_t components[3] = {r, g, b};
  const uint8_t *const last = &components[2];
  char *cursor = pos;

  memcpy(cursor, prefix, sizeof(prefix) - 1);
  cursor += sizeof(prefix) - 1;

  for (const uint8_t *comp = components; comp <= last; comp++) {
    const unsigned hundreds = *comp / 100u;
    const unsigned tens = (*comp / 10u) % 10u;
    const unsigned ones = *comp % 10u;

    if (hundreds != 0) {
      *cursor++ = (char)('0' + hundreds);
      *cursor++ = (char)('0' + tens);
    } else if (tens != 0) {
      *cursor++ = (char)('0' + tens);
    }
    *cursor++ = (char)('0' + ones);

    // Components are ';'-separated; the blue component is followed by 'm'
    *cursor++ = (comp == last) ? 'm' : ';';
  }
  return cursor;
}
153
// Emit the REP (repeat previous character) control sequence "ESC[<rep_count>b".
//
// rep_count is written in decimal without leading zeros. Any uint32_t value is
// formatted correctly; the previous hand-unrolled version produced a non-digit
// leading byte for counts above 9999.
//
// No bounds checking is performed. The sequence occupies 3 bytes plus one byte
// per decimal digit (at most 13 bytes for UINT32_MAX, 8 bytes for counts up to
// 9999), so the caller must size its buffer accordingly.
//
// Returns the position immediately after the trailing 'b'.
static inline char *emit_rle_count(char *pos, uint32_t rep_count) {
  char digits[10]; // UINT32_MAX (4294967295) has 10 decimal digits
  int digit_count = 0;

  // Collect digits least-significant first; a count of 0 still yields "0"
  do {
    digits[digit_count++] = (char)('0' + rep_count % 10);
    rep_count /= 10;
  } while (rep_count != 0);

  *pos++ = '\x1b';
  *pos++ = '[';
  while (digit_count > 0) {
    *pos++ = digits[--digit_count];
  }
  *pos++ = 'b';

  return pos;
}
179
// Thread-local, 32-byte-aligned scratch buffers for the AVX2 renderers.
// Each holds one 32-pixel chunk: the de-interleaved R, G and B channel bytes
// and the luminance computed from them. Both renderers in this file fill them
// for every chunk before reading them, so no state is carried between chunks.
// NOTE(review): the original comment says these stay in L1 cache; that is a
// performance expectation, not something this file enforces.
static THREAD_LOCAL ALIGNED_32 uint8_t avx2_r_buffer[32];
static THREAD_LOCAL ALIGNED_32 uint8_t avx2_g_buffer[32];
static THREAD_LOCAL ALIGNED_32 uint8_t avx2_b_buffer[32];
static THREAD_LOCAL ALIGNED_32 uint8_t avx2_luminance_buffer[32];
186
187// Optimized AVX2 function to load 32 RGB pixels and separate channels
188// Uses simple loop that auto-vectorizes to VMOVDQU + VPSHUFB
189static inline void avx2_load_rgb32_optimized(const rgb_pixel_t *__restrict pixels, uint8_t *__restrict r_out,
190 uint8_t *__restrict g_out, uint8_t *__restrict b_out) {
191 // Simple loop that compiler auto-vectorizes into efficient SIMD
192 for (int i = 0; i < 32; i++) {
193 r_out[i] = pixels[i].r;
194 g_out[i] = pixels[i].g;
195 b_out[i] = pixels[i].b;
196 }
197}
198
// Compute 8-bit luminance for 32 pixels from de-interleaved channel arrays.
//
// For every pixel i in 0..31:
//   luminance_out[i] = (77 * r_vals[i] + 150 * g_vals[i] + 29 * b_vals[i] + 128) >> 8
// which is the same fixed-point formula the scalar tail loops in this file use.
// The largest intermediate value is 255 * 256 + 128 = 65408, so unsigned
// 16-bit lanes never overflow.
//
// All four pointers must reference at least 32 bytes; unaligned access is fine.
static inline void avx2_compute_luminance_32(const uint8_t *r_vals, const uint8_t *g_vals, const uint8_t *b_vals,
                                             uint8_t *luminance_out) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i coeff_r = _mm256_set1_epi16(77);
  const __m256i coeff_g = _mm256_set1_epi16(150);
  const __m256i coeff_b = _mm256_set1_epi16(29);
  const __m256i rounding = _mm256_set1_epi16(128);

  // Load all 32 values of each channel
  const __m256i r_all = _mm256_loadu_si256((const __m256i *)r_vals);
  const __m256i g_all = _mm256_loadu_si256((const __m256i *)g_vals);
  const __m256i b_all = _mm256_loadu_si256((const __m256i *)b_vals);

  // Widen to 16 bits. unpacklo/unpackhi work per 128-bit lane, so the "lo"
  // vectors hold pixels [0..7 | 16..23] and the "hi" vectors [8..15 | 24..31].
  const __m256i r_lo = _mm256_unpacklo_epi8(r_all, zero);
  const __m256i g_lo = _mm256_unpacklo_epi8(g_all, zero);
  const __m256i b_lo = _mm256_unpacklo_epi8(b_all, zero);

  __m256i luma_16_lo = _mm256_mullo_epi16(r_lo, coeff_r);
  luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(g_lo, coeff_g));
  luma_16_lo = _mm256_add_epi16(luma_16_lo, _mm256_mullo_epi16(b_lo, coeff_b));
  luma_16_lo = _mm256_add_epi16(luma_16_lo, rounding);
  luma_16_lo = _mm256_srli_epi16(luma_16_lo, 8);

  const __m256i r_hi = _mm256_unpackhi_epi8(r_all, zero);
  const __m256i g_hi = _mm256_unpackhi_epi8(g_all, zero);
  const __m256i b_hi = _mm256_unpackhi_epi8(b_all, zero);

  __m256i luma_16_hi = _mm256_mullo_epi16(r_hi, coeff_r);
  luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(g_hi, coeff_g));
  luma_16_hi = _mm256_add_epi16(luma_16_hi, _mm256_mullo_epi16(b_hi, coeff_b));
  luma_16_hi = _mm256_add_epi16(luma_16_hi, rounding);
  luma_16_hi = _mm256_srli_epi16(luma_16_hi, 8);

  // Narrow back to 8 bits. packus is lane-local too: lane 0 receives
  // lo[0..7] then hi[8..15], lane 1 receives lo[16..23] then hi[24..31].
  // The two lane-local steps cancel out, so the result is already in pixel
  // order 0..31. A cross-lane permute here would swap pixels 8-15 and 16-23.
  const __m256i luma_packed = _mm256_packus_epi16(luma_16_lo, luma_16_hi);

  _mm256_storeu_si256((__m256i *)luminance_out, luma_packed);
}
240
241// Single-pass AVX2 monochrome renderer with immediate emission
242char *render_ascii_image_monochrome_avx2(const image_t *image, const char *ascii_chars) {
243 if (!image || !image->pixels || !ascii_chars) {
244 return NULL;
245 }
246
247 const int h = image->h;
248 const int w = image->w;
249
250 if (h <= 0 || w <= 0) {
251 return NULL;
252 }
253
254 // Get cached UTF-8 character mappings
255 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
256 if (!utf8_cache) {
257 log_error("Failed to get UTF-8 palette cache");
258 return NULL;
259 }
260
261 const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;
262
263 // Use malloc for output buffer (will be freed by caller)
264 // Each pixel can produce: 4 bytes UTF-8 + 8 bytes RLE escape (\x1b[9999b) = 12 bytes max
265 // Plus 1 newline per row
266 size_t output_size = (size_t)h * ((size_t)w * 12 + 1);
267
268 char *output = SAFE_MALLOC(output_size, char *);
269 if (!output) {
270 log_error("Failed to allocate output buffer for AVX2 rendering");
271 return NULL;
272 }
273
274 char *pos = output;
275
276 // Process row by row for better cache locality
277 for (int y = 0; y < h; y++) {
278 const rgb_pixel_t *row_pixels = &pixels[y * w];
279 int x = 0;
280
281 // AVX2 fast path: process 32 pixels at a time
282 while (x + 31 < w) {
283 // Process 32 pixels with AVX2 using thread-local buffers
284 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
285 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
286
287 // Convert to character indices and emit immediately
288 int i = 0;
289 while (i < 32) {
290 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2; // 0-63
291 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
292
293 // Find run length within this chunk
294 int run_end = i + 1;
295 while (run_end < 32 && x + run_end < w) {
296 const uint8_t next_luma_idx = avx2_luminance_buffer[run_end] >> 2;
297 if (next_luma_idx != luma_idx)
298 break;
299 run_end++;
300 }
301 int run = run_end - i;
302
303 // Emit UTF-8 character with RLE
304 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
305 pos += char_info->byte_len;
306
307 if (rep_is_profitable(run)) {
308 pos = emit_rle_count(pos, run - 1);
309 } else {
310 // Emit remaining characters
311 for (int k = 1; k < run; k++) {
312 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
313 pos += char_info->byte_len;
314 }
315 }
316 i = run_end;
317 }
318 x += 32;
319 }
320
321 // Scalar processing for remaining pixels (< 32)
322 while (x < w) {
323 const rgb_pixel_t *p = &row_pixels[x];
324 const int luminance = (LUMA_RED * p->r + LUMA_GREEN * p->g + LUMA_BLUE * p->b + 128) >> 8;
325 const uint8_t luma_idx = luminance >> 2; // 0-255 -> 0-63
326 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
327
328 // Find run length for RLE
329 int j = x + 1;
330 while (j < w) {
331 const rgb_pixel_t *next_p = &row_pixels[j];
332 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
333 const uint8_t next_luma_idx = next_luminance >> 2;
334 if (next_luma_idx != luma_idx)
335 break;
336 j++;
337 }
338 int run = j - x;
339
340 // Emit UTF-8 character with RLE
341 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
342 pos += char_info->byte_len;
343
344 if (rep_is_profitable(run)) {
345 pos = emit_rle_count(pos, run - 1);
346 } else {
347 for (int k = 1; k < run; k++) {
348 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
349 pos += char_info->byte_len;
350 }
351 }
352 x = j;
353 }
354
355 // Add reset sequence and newline after each row (except last)
356 *pos++ = '\x1b';
357 *pos++ = '[';
358 *pos++ = '0';
359 *pos++ = 'm';
360 if (y < h - 1) {
361 *pos++ = '\n';
362 }
363 }
364
365 *pos = '\0'; // Null terminate
366
367 return output;
368}
369
370// Single-pass AVX2 color renderer with immediate emission
371char *render_ascii_avx2_unified_optimized(const image_t *image, bool use_background, bool use_256color,
372 const char *ascii_chars) {
373 if (!image || !image->pixels) {
374 return NULL;
375 }
376
377 const int width = image->w;
378 const int height = image->h;
379
380 if (width <= 0 || height <= 0) {
381 char *empty;
382 empty = SAFE_MALLOC(1, char *);
383 empty[0] = '\0';
384 return empty;
385 }
386
387 // Get cached UTF-8 character mappings
388 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
389 if (!utf8_cache) {
390 log_error("Failed to get UTF-8 palette cache for AVX2 color");
391 return NULL;
392 }
393
394 // Use malloc for output buffer (will be freed by caller)
395 size_t bytes_per_pixel = use_256color ? 10u : 25u; // Conservative estimates
396
397 // Calculate buffer size with overflow checking
398 size_t height_times_width;
399 if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
400 log_error("Buffer size overflow: height * width overflow");
401 return NULL;
402 }
403
404 size_t pixel_data_size;
405 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
406 log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
407 return NULL;
408 }
409
410 size_t height_times_16;
411 if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
412 log_error("Buffer size overflow: height * 16 overflow");
413 return NULL;
414 }
415
416 size_t temp;
417 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
418 log_error("Buffer size overflow: pixel_data + height*16 overflow");
419 return NULL;
420 }
421
422 size_t output_size;
423 if (checked_size_add(temp, 1024u, &output_size) != ASCIICHAT_OK) {
424 log_error("Buffer size overflow: total output size overflow");
425 return NULL;
426 }
427
428 char *output = SAFE_MALLOC(output_size, char *);
429 if (!output) {
430 log_error("Failed to allocate output buffer for AVX2 color rendering");
431 return NULL;
432 }
433
434 char *pos = output;
435 const rgb_pixel_t *pixels_data = (const rgb_pixel_t *)image->pixels;
436
437 // Track current color state
438 int curR = -1, curG = -1, curB = -1;
439 int cur_color_idx = -1;
440
441 // Generate output row by row with single-pass processing
442
443 for (int y = 0; y < height; y++) {
444 const rgb_pixel_t *row_pixels = &pixels_data[y * width];
445 int x = 0;
446
447 // AVX2 fast path: process 32 pixels at a time
448 while (x + 31 < width) {
449
450 // Process 32 pixels with AVX2 using thread-local buffers
451 avx2_load_rgb32_optimized(&row_pixels[x], avx2_r_buffer, avx2_g_buffer, avx2_b_buffer);
452 avx2_compute_luminance_32(avx2_r_buffer, avx2_g_buffer, avx2_b_buffer, avx2_luminance_buffer);
453
454 // Process each pixel in the chunk
455 int i = 0;
456 while (i < 32) {
457 const uint8_t R = avx2_r_buffer[i];
458 const uint8_t G = avx2_g_buffer[i];
459 const uint8_t B = avx2_b_buffer[i];
460 const uint8_t luma_idx = avx2_luminance_buffer[i] >> 2;
461 const uint8_t char_idx = utf8_cache->char_index_ramp[luma_idx];
462 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
463
464 if (use_256color) {
465 uint8_t color_idx = rgb_to_256color(R, G, B);
466
467 // Find run length
468 int run = 1;
469 while (i + run < 32 && x + run < width) {
470 const uint8_t next_R = avx2_r_buffer[i + run];
471 const uint8_t next_G = avx2_g_buffer[i + run];
472 const uint8_t next_B = avx2_b_buffer[i + run];
473 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
474 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
475 if (next_char_idx != char_idx)
476 break;
477 if (rgb_to_256color(next_R, next_G, next_B) != color_idx)
478 break;
479 run++;
480 }
481
482 // Set color if changed
483 if (color_idx != cur_color_idx) {
484 if (use_background) {
485 pos = emit_set_256_color_bg_simple(pos, color_idx);
486 } else {
487 pos = emit_set_256_color_fg_simple(pos, color_idx);
488 }
489 cur_color_idx = color_idx;
490 }
491
492 // Emit character with RLE
493 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
494 pos += char_info->byte_len;
495
496 if (rep_is_profitable(run)) {
497 pos = emit_rle_count(pos, run - 1);
498 } else {
499 for (int k = 1; k < run; k++) {
500 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
501 pos += char_info->byte_len;
502 }
503 }
504 i += run;
505 } else {
506 // Truecolor mode
507 // Find run length
508 int run = 1;
509 while (i + run < 32 && x + run < width) {
510 const uint8_t next_R = avx2_r_buffer[i + run];
511 const uint8_t next_G = avx2_g_buffer[i + run];
512 const uint8_t next_B = avx2_b_buffer[i + run];
513 const uint8_t next_luma_idx = avx2_luminance_buffer[i + run] >> 2;
514 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
515 if (next_char_idx != char_idx)
516 break;
517 if (next_R != R || next_G != G || next_B != B)
518 break;
519 run++;
520 }
521
522 // Set color if changed
523 if ((int)R != curR || (int)G != curG || (int)B != curB) {
524 if (use_background) {
525 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
526 } else {
527 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
528 }
529 curR = R;
530 curG = G;
531 curB = B;
532 }
533
534 // Emit character with RLE
535 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
536 pos += char_info->byte_len;
537
538 if (rep_is_profitable(run)) {
539 pos = emit_rle_count(pos, run - 1);
540 } else {
541 for (int k = 1; k < run; k++) {
542 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
543 pos += char_info->byte_len;
544 }
545 }
546 i += run;
547 }
548 }
549 x += 32;
550 }
551
552 // Scalar processing for remaining pixels (< 32)
553 while (x < width) {
554 const rgb_pixel_t *p = &row_pixels[x];
555 const uint8_t R = p->r, G = p->g, B = p->b;
556 const int luminance = (LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + 128) >> 8;
557 const uint8_t luma_idx = luminance >> 2;
558 const uint8_t char_idx = utf8_cache->char_index_ramp[luma_idx];
559 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
560
561 if (use_256color) {
562 uint8_t color_idx = rgb_to_256color(R, G, B);
563
564 // Find run length
565 int run = 1;
566 while (x + run < width) {
567 const rgb_pixel_t *next_p = &row_pixels[x + run];
568 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
569 const uint8_t next_luma_idx = next_luminance >> 2;
570 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
571 if (next_char_idx != char_idx)
572 break;
573 if (rgb_to_256color(next_p->r, next_p->g, next_p->b) != color_idx)
574 break;
575 run++;
576 }
577
578 // Set color if changed
579 if (color_idx != cur_color_idx) {
580 if (use_background) {
581 pos = emit_set_256_color_bg_simple(pos, color_idx);
582 } else {
583 pos = emit_set_256_color_fg_simple(pos, color_idx);
584 }
585 cur_color_idx = color_idx;
586 }
587
588 // Emit character with RLE
589 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
590 pos += char_info->byte_len;
591
592 if (rep_is_profitable(run)) {
593 pos = emit_rle_count(pos, run - 1);
594 } else {
595 for (int k = 1; k < run; k++) {
596 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
597 pos += char_info->byte_len;
598 }
599 }
600 x += run;
601 } else {
602 // Truecolor mode
603 // Find run length
604 int run = 1;
605 while (x + run < width) {
606 const rgb_pixel_t *next_p = &row_pixels[x + run];
607 const int next_luminance = (LUMA_RED * next_p->r + LUMA_GREEN * next_p->g + LUMA_BLUE * next_p->b + 128) >> 8;
608 const uint8_t next_luma_idx = next_luminance >> 2;
609 const uint8_t next_char_idx = utf8_cache->char_index_ramp[next_luma_idx];
610 if (next_char_idx != char_idx)
611 break;
612 if (next_p->r != R || next_p->g != G || next_p->b != B)
613 break;
614 run++;
615 }
616
617 // Set color if changed
618 if ((int)R != curR || (int)G != curG || (int)B != curB) {
619 if (use_background) {
620 pos = emit_set_truecolor_bg_simple(pos, R, G, B);
621 } else {
622 pos = emit_set_truecolor_fg_simple(pos, R, G, B);
623 }
624 curR = R;
625 curG = G;
626 curB = B;
627 }
628
629 // Emit character with RLE
630 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
631 pos += char_info->byte_len;
632
633 if (rep_is_profitable(run)) {
634 pos = emit_rle_count(pos, run - 1);
635 } else {
636 for (int k = 1; k < run; k++) {
637 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
638 pos += char_info->byte_len;
639 }
640 }
641 x += run;
642 }
643 }
644
645 // Add reset sequence and newline after each row (except last)
646 *pos++ = '\x1b';
647 *pos++ = '[';
648 *pos++ = '0';
649 *pos++ = 'm';
650 if (y < height - 1) {
651 *pos++ = '\n';
652 }
653 }
654
655 *pos = '\0'; // Null terminate
656
657 return output;
658}
659
// Destroy AVX2 cache resources (called at program shutdown).
// This file owns no caches of its own, so the function currently only logs a
// debug message and releases nothing.
void avx2_caches_destroy(void) {
  // AVX2 currently uses shared caches from common.c, so no specific cleanup needed
  log_debug("AVX2_CACHE: AVX2 optimized caches cleaned up");
}
665
666#endif /* SIMD_SUPPORT_AVX2 */
AVX2-optimized ASCII rendering functions.
unsigned int uint32_t
Definition common.h:58
#define SAFE_MALLOC(size, cast)
Definition common.h:208
unsigned char uint8_t
Definition common.h:56
@ ASCIICHAT_OK
Definition error_codes.h:48
#define log_error(...)
Log an ERROR message.
#define log_debug(...)
Log a DEBUG message.
#define ALIGNED_32
32-byte alignment macro (POSIX: __attribute__((aligned(32))))
#define THREAD_LOCAL
Thread-local storage keyword (POSIX: __thread)
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
Definition ascii_simd.h:76
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
Definition ascii_simd.h:74
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
Convert RGB to 256-color palette index.
Definition ansi_fast.c:199
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
Definition ascii_simd.h:72
✅ Safe Integer Arithmetic and Overflow Detection
Image structure.
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
UTF-8 character structure.
UTF-8 palette cache structure.
Common SIMD utilities and structures.