// Populate the NEON table-lookup (TBL) register sets used by the renderers:
//  - tbl:            identity 0..63 table (a vqtbl4q on a 6-bit luma bucket
//                    returns the bucket itself)
//  - char_lut:       single-byte ASCII glyph per palette index
//  - length_lut:     UTF-8 byte length per palette index
//  - char_byteN_lut: Nth UTF-8 byte per palette index (feeds vst2/3/4
//                    interleaved stores)
// NOTE(review): the backing scalar arrays (cache64_indices, ascii_chars_lut,
// char_lengths, char_byte0..char_byte3) and the char_info lookup inside the
// second loop are declared on lines elided from this excerpt — confirm
// against the full source.
33static inline void build_neon_lookup_tables(
utf8_palette_cache_t *utf8_cache, uint8x16x4_t *tbl, uint8x16x4_t *char_lut,
34 uint8x16x4_t *length_lut, uint8x16x4_t *char_byte0_lut,
35 uint8x16x4_t *char_byte1_lut, uint8x16x4_t *char_byte2_lut,
36 uint8x16x4_t *char_byte3_lut) {
// Identity mapping: entry i holds i.
39 for (
int i = 0; i < 64; i++) {
40 cache64_indices[i] = (
uint8_t)i;
43 tbl->val[0] = vld1q_u8(&cache64_indices[0]);
44 tbl->val[1] = vld1q_u8(&cache64_indices[16]);
45 tbl->val[2] = vld1q_u8(&cache64_indices[32]);
46 tbl->val[3] = vld1q_u8(&cache64_indices[48]);
// Stage per-character metadata from the UTF-8 palette cache into scalar
// arrays, then load them into the 4x16-byte TBL sets below.
56 for (
int i = 0; i < 64; i++) {
63 char_lengths[i] = char_info->
byte_len;
71 char_lut->val[0] = vld1q_u8(&ascii_chars_lut[0]);
72 char_lut->val[1] = vld1q_u8(&ascii_chars_lut[16]);
73 char_lut->val[2] = vld1q_u8(&ascii_chars_lut[32]);
74 char_lut->val[3] = vld1q_u8(&ascii_chars_lut[48]);
76 length_lut->val[0] = vld1q_u8(&char_lengths[0]);
77 length_lut->val[1] = vld1q_u8(&char_lengths[16]);
78 length_lut->val[2] = vld1q_u8(&char_lengths[32]);
79 length_lut->val[3] = vld1q_u8(&char_lengths[48]);
81 char_byte0_lut->val[0] = vld1q_u8(&char_byte0[0]);
82 char_byte0_lut->val[1] = vld1q_u8(&char_byte0[16]);
83 char_byte0_lut->val[2] = vld1q_u8(&char_byte0[32]);
84 char_byte0_lut->val[3] = vld1q_u8(&char_byte0[48]);
86 char_byte1_lut->val[0] = vld1q_u8(&char_byte1[0]);
87 char_byte1_lut->val[1] = vld1q_u8(&char_byte1[16]);
88 char_byte1_lut->val[2] = vld1q_u8(&char_byte1[32]);
89 char_byte1_lut->val[3] = vld1q_u8(&char_byte1[48]);
91 char_byte2_lut->val[0] = vld1q_u8(&char_byte2[0]);
92 char_byte2_lut->val[1] = vld1q_u8(&char_byte2[16]);
93 char_byte2_lut->val[2] = vld1q_u8(&char_byte2[32]);
94 char_byte2_lut->val[3] = vld1q_u8(&char_byte2[48]);
96 char_byte3_lut->val[0] = vld1q_u8(&char_byte3[0]);
97 char_byte3_lut->val[1] = vld1q_u8(&char_byte3[16]);
98 char_byte3_lut->val[2] = vld1q_u8(&char_byte3[32]);
99 char_byte3_lut->val[3] = vld1q_u8(&char_byte3[48]);
// Tear down the NEON renderer caches.
// NOTE(review): the entire body is on lines elided from this excerpt —
// cannot document what it frees from here.
103void neon_caches_destroy(
void) {
/**
 * Horizontally sum all 16 unsigned byte lanes of a vector into one scalar.
 * The largest possible sum (16 * 255 = 4080) fits in uint16_t, so the
 * widened reduction is exact.
 */
static inline uint16_t neon_horizontal_sum_u8(uint8x16_t vec) {
  // vaddlvq_u8 does the widen-and-add-across in a single AArch64
  // instruction; it yields the same exact sum as the original pairwise
  // vpaddlq cascade plus lane extraction.
  return (uint16_t)vaddlvq_u8(vec);
}
// Count how many cells after start_pos repeat the same (glyph, color) pair,
// scanning 16 cells per iteration with NEON compares and finishing with a
// scalar tail. Feeds the RLE emitter's run-length decision.
// NOTE(review): the remaining parameters (max_len, target_char,
// target_color), the run_length accumulator, and the pointer/counter
// advances inside the loops are on lines elided from this excerpt —
// verify against the full source.
117static inline int find_rle_run_length_neon(
const uint8_t *char_buf,
const uint8_t *color_buf,
int start_pos,
// Cells left to examine after the starting cell itself.
122 int remaining = max_len - start_pos - 1;
126 const uint8_t *char_ptr = &char_buf[start_pos + 1];
127 const uint8_t *color_ptr = &color_buf[start_pos + 1];
// Vector pass: 16 cells at a time.
130 while (remaining >= 16) {
131 uint8x16_t chars = vld1q_u8(char_ptr);
132 uint8x16_t colors = vld1q_u8(color_ptr);
// A lane is 0xFF only when BOTH the glyph and the color match the target.
134 uint8x16_t char_match = vceqq_u8(chars, vdupq_n_u8(target_char));
135 uint8x16_t color_match = vceqq_u8(colors, vdupq_n_u8(target_color));
136 uint8x16_t both_match = vandq_u8(char_match, color_match);
// Horizontal min == 0xFF means every one of the 16 lanes matched.
140 uint8_t min_match = vminvq_u8(both_match);
142 if (min_match == 0xFF) {
// Partial match: count leading matching lanes byte-by-byte from the
// two 64-bit halves of the compare mask.
150 uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 0);
151 uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 1);
153 int matches_found = 0;
155 for (
int i = 0; i < 8; i++) {
156 if ((mask_lo >> (i * 8)) & 0xFF) {
// Only continue into the high half if the entire low half matched.
164 if (matches_found == 8) {
165 for (
int i = 0; i < 8; i++) {
166 if ((mask_hi >> (i * 8)) & 0xFF) {
174 run_length += matches_found;
// Scalar tail for the last (<16) cells.
180 while (remaining > 0 && *char_ptr == target_char && *color_ptr == target_color) {
// Report whether all 16 byte lanes carry the same UTF-8 length; on success
// the common length is written to *out_length. Lets the caller pick a fast
// interleaved vst1/2/3/4 store path instead of the scalar fallback.
// NOTE(review): the true/false return statements are on lines elided from
// this excerpt.
191static inline bool all_same_length_neon(uint8x16_t lengths,
uint8_t *out_length) {
// Compare every lane against lane 0; vceqq yields 0xFF per equal lane.
192 uint8_t first_len = vgetq_lane_u8(lengths, 0);
193 uint8x16_t first_len_vec = vdupq_n_u8(first_len);
194 uint8x16_t all_same = vceqq_u8(lengths, first_len_vec);
// AND the two 64-bit halves: all-ones only if all 16 lanes compared equal.
196 uint64x2_t all_same_64 = vreinterpretq_u64_u8(all_same);
197 uint64_t combined = vgetq_lane_u64(all_same_64, 0) & vgetq_lane_u64(all_same_64, 1);
199 if (combined == 0xFFFFFFFFFFFFFFFF) {
200 *out_length = first_len;
// 256 entries x 4 bytes: per value i, entry[1..3] hold its decimal digits
// ('0'-padded) and — per the usage in the assembler below — entry[0] holds
// the digit count. Avoids division when emitting "R;G;B" color triplets.
212static uint8_t neon_decimal_table_data[256 * 4];
213static bool neon_decimal_table_initialized =
false;
// Lazily build the decimal table (idempotent; guarded by the flag above).
// NOTE(review): entry[0]'s assignment and the declaration of `dec` are on
// elided lines — presumably sourced from g_dec3_cache; confirm in the full
// file. Also note the flag is not atomic, so first-use from multiple
// threads would race — TODO confirm single-threaded init.
216void init_neon_decimal_table(
void) {
217 if (neon_decimal_table_initialized)
226 for (
int i = 0; i < 256; i++) {
228 uint8_t *entry = &neon_decimal_table_data[i * 4];
230 entry[1] = (dec->
len >= 1) ? dec->
s[0] :
'0';
231 entry[2] = (dec->
len >= 2) ? dec->
s[1] :
'0';
232 entry[3] = (dec->
len >= 3) ? dec->
s[2] :
'0';
235 neon_decimal_table_initialized =
true;
// Assemble 16 truecolor ANSI sequences ("\033[38;2;R;G;Bm" or the 48;2
// background variant, each followed by its glyph) into output_buffer using
// the precomputed decimal table. Returns the number of bytes written.
// NOTE(review): the utf8_cache/output_buffer parameters, the char_info
// lookup, the 'm' terminator, the glyph copy, and the dst advances are on
// lines elided from this excerpt.
243static inline size_t neon_assemble_truecolor_sequences_true_simd(uint8x16_t char_indices, uint8x16_t r_vals,
244 uint8x16_t g_vals, uint8x16_t b_vals,
246 size_t buffer_capacity,
bool use_background) {
251 init_neon_decimal_table();
253 char *dst = output_buffer;
// Spill the vectors so the per-sequence assembly loop can index lanes.
256 uint8_t char_idx_buf[16], r_buf[16], g_buf[16], b_buf[16];
257 vst1q_u8(char_idx_buf, char_indices);
258 vst1q_u8(r_buf, r_vals);
259 vst1q_u8(g_buf, g_vals);
260 vst1q_u8(b_buf, b_vals);
262 size_t total_written = 0;
263 const char *prefix = use_background ?
"\033[48;2;" :
"\033[38;2;";
264 const size_t prefix_len = 7;
268 for (
int i = 0; i < 16; i++) {
// entry[0] = digit count, entry[1..] = the digits for each channel.
270 const uint8_t *r_entry = &neon_decimal_table_data[r_buf[i] * 4];
271 const uint8_t *g_entry = &neon_decimal_table_data[g_buf[i] * 4];
272 const uint8_t *b_entry = &neon_decimal_table_data[b_buf[i] * 4];
274 const uint8_t char_idx = char_idx_buf[i];
// prefix + R digits + ';' + G digits + ';' + B digits + 'm' + glyph.
278 size_t seq_len = prefix_len + r_entry[0] + 1 + g_entry[0] + 1 + b_entry[0] + 1 + char_info->
byte_len;
// NOTE(review): if buffer_capacity < seq_len this size_t subtraction
// wraps and the guard passes — confirm callers always provide a buffer
// comfortably larger than one sequence (the 16*50 temp buffer does).
279 if (total_written >= buffer_capacity - seq_len) {
284 memcpy(dst, prefix, prefix_len);
288 memcpy(dst, &r_entry[1], r_entry[0]);
292 memcpy(dst, &g_entry[1], g_entry[0]);
296 memcpy(dst, &b_entry[1], b_entry[0]);
304 total_written = dst - output_buffer;
307 return total_written;
// Store up to 64 staged UTF-8 bytes to *pos using the fewest whole 16-byte
// vector stores that cover total_bytes. Stores round up to a 16-byte
// multiple, so the destination must tolerate up to 15 bytes of overwrite
// past total_bytes — NOTE(review): confirm the output buffer reserves this
// slack. The total_bytes computation, the *pos parameter, and any pointer
// advance are on lines elided from this excerpt; `lengths` is unused in the
// visible code.
317static inline void __attribute__((unused)) compact_utf8_vectorized(
uint8_t *padded_data, uint8x16_t lengths,
// Load all four 16-byte chunks of the 64-byte staging area up front.
327 uint8x16_t chunk1 = vld1q_u8(&padded_data[0]);
328 uint8x16_t chunk2 = vld1q_u8(&padded_data[16]);
329 uint8x16_t chunk3 = vld1q_u8(&padded_data[32]);
330 uint8x16_t chunk4 = vld1q_u8(&padded_data[48]);
// Emit only as many 16-byte stores as total_bytes requires.
334 if (total_bytes <= 16) {
335 vst1q_u8((
uint8_t *)*pos, chunk1);
336 }
else if (total_bytes <= 32) {
337 vst1q_u8((
uint8_t *)*pos, chunk1);
338 vst1q_u8((
uint8_t *)*pos + 16, chunk2);
339 }
else if (total_bytes <= 48) {
340 vst1q_u8((
uint8_t *)*pos, chunk1);
341 vst1q_u8((
uint8_t *)*pos + 16, chunk2);
342 vst1q_u8((
uint8_t *)*pos + 32, chunk3);
344 vst1q_u8((
uint8_t *)*pos, chunk1);
345 vst1q_u8((
uint8_t *)*pos + 16, chunk2);
346 vst1q_u8((
uint8_t *)*pos + 32, chunk3);
347 vst1q_u8((
uint8_t *)*pos + 48, chunk4);
// Map 8-bit luma values to 0..15 palette buckets by taking the high nibble.
358static inline uint8x16_t
__attribute__((unused)) luma_to_idx_nibble_neon(uint8x16_t y) {
359 return vshrq_n_u8(y, 4);
// Weighted reduce of one 8-lane half: widen to 32 bits, accumulate the
// three weighted channels, round-narrow by 8 (vrshrn supplies the +128
// rounding bias), and narrow back down to bytes.
static inline uint8x8_t luma_reduce_half8(uint16x8_t r16, uint16x8_t g16, uint16x8_t b16) {
  uint32x4_t acc_lo = vmull_n_u16(vget_low_u16(r16), LUMA_RED);
  uint32x4_t acc_hi = vmull_n_u16(vget_high_u16(r16), LUMA_RED);
  acc_lo = vmlal_n_u16(acc_lo, vget_low_u16(g16), LUMA_GREEN);
  acc_hi = vmlal_n_u16(acc_hi, vget_high_u16(g16), LUMA_GREEN);
  acc_lo = vmlal_n_u16(acc_lo, vget_low_u16(b16), LUMA_BLUE);
  acc_hi = vmlal_n_u16(acc_hi, vget_high_u16(b16), LUMA_BLUE);
  uint16x8_t y16 = vcombine_u16(vrshrn_n_u32(acc_lo, 8), vrshrn_n_u32(acc_hi, 8));
  return vqmovn_u16(y16);
}

/**
 * Per-pixel luma for 16 pixels:
 *   Y = (R*LUMA_RED + G*LUMA_GREEN + B*LUMA_BLUE + 128) >> 8
 * i.e. a fixed-point weighted sum with round-to-nearest. The low and high
 * 8 lanes are reduced independently and stitched back together.
 */
static inline uint8x16_t simd_luma_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b) {
  uint8x8_t y_lo = luma_reduce_half8(vmovl_u8(vget_low_u8(r)), vmovl_u8(vget_low_u8(g)),
                                     vmovl_u8(vget_low_u8(b)));
  uint8x8_t y_hi = luma_reduce_half8(vmovl_u8(vget_high_u8(r)), vmovl_u8(vget_high_u8(g)),
                                     vmovl_u8(vget_high_u8(b)));
  return vcombine_u8(y_lo, y_hi);
}
// One 8-lane half of the 0..5 quantizer: t = x*5 + 127, then (t*257) >> 16,
// a division-free fixed-point approximation of x*5/255 with rounding.
static inline uint16x8_t quant6_scale_half(uint16x8_t x16) {
  uint16x8_t t = vaddq_u16(vmulq_n_u16(x16, 5), vdupq_n_u16(127));
  uint32x4_t prod_lo = vmull_n_u16(vget_low_u16(t), 257);
  uint32x4_t prod_hi = vmull_n_u16(vget_high_u16(t), 257);
  return vcombine_u16(vshrn_n_u32(prod_lo, 16), vshrn_n_u32(prod_hi, 16));
}

/**
 * Quantize each byte lane from 0..255 down to the 6-level cube axis 0..5.
 * Lanes are processed in two widened halves and narrowed back to bytes.
 */
static inline uint8x16_t
__attribute__((unused)) quant6_neon(uint8x16_t x) {
  uint16x8_t q_lo = quant6_scale_half(vmovl_u8(vget_low_u8(x)));
  uint16x8_t q_hi = quant6_scale_half(vmovl_u8(vget_high_u8(x)));
  return vcombine_u8(vqmovn_u16(q_lo), vqmovn_u16(q_hi));
}
/**
 * Combine three quantized channels (each 0..5) into a 0..215 index for the
 * 6x6x6 color cube: idx = 36*r6 + 6*g6 + b6. Arithmetic is done in 16-bit
 * lanes; vqmovn saturates any out-of-range intermediate to 255 on the way
 * back to bytes.
 */
static inline uint8x16_t
__attribute__((unused)) cube216_index_neon(uint8x16_t r6, uint8x16_t g6, uint8x16_t b6) {
  // Accumulate 36*r, then +6*g, then +b, separately for each 8-lane half.
  uint16x8_t idx_lo = vmulq_n_u16(vmovl_u8(vget_low_u8(r6)), 36);
  uint16x8_t idx_hi = vmulq_n_u16(vmovl_u8(vget_high_u8(r6)), 36);
  idx_lo = vmlaq_n_u16(idx_lo, vmovl_u8(vget_low_u8(g6)), 6);
  idx_hi = vmlaq_n_u16(idx_hi, vmovl_u8(vget_high_u8(g6)), 6);
  idx_lo = vaddq_u16(idx_lo, vmovl_u8(vget_low_u8(b6)));
  idx_hi = vaddq_u16(idx_hi, vmovl_u8(vget_high_u8(b6)));
  return vcombine_u8(vqmovn_u16(idx_lo), vqmovn_u16(idx_hi));
}
/**
 * Quantize each byte lane to the 0..5 cube axis via q = (x*5 + 128) >> 8,
 * i.e. round(x * 5/256). (Slightly different rounding than quant6_neon,
 * but the same 0..5 output range.)
 */
static inline uint8x16_t q6_from_u8(uint8x16_t x) {
  // Plain multiply replaces the original's multiply-accumulate onto a
  // zero vector — same value, simpler to read.
  uint16x8_t scaled_lo = vmulq_n_u16(vmovl_u8(vget_low_u8(x)), 5);
  uint16x8_t scaled_hi = vmulq_n_u16(vmovl_u8(vget_high_u8(x)), 5);
  scaled_lo = vshrq_n_u16(vaddq_u16(scaled_lo, vdupq_n_u16(128)), 8);
  scaled_hi = vshrq_n_u16(vaddq_u16(scaled_hi, vdupq_n_u16(128)), 8);
  return vcombine_u8(vqmovn_u16(scaled_lo), vqmovn_u16(scaled_hi));
}
436#ifndef CUBE_GRAY_THRESHOLD
437#define CUBE_GRAY_THRESHOLD 10
// Add a 4x4 Bayer ordered-dither offset to each lane of `color`.
// dither_strength scales the 0..15 matrix values (scaled = value*strength/16)
// and the result saturates at 255. A strength of 0 makes this an identity op.
// NOTE(review): the pos_indices array declaration is on an elided line.
441static inline uint8x16_t apply_ordered_dither(uint8x16_t color,
int pixel_offset,
uint8_t dither_strength) {
443 static const uint8_t bayer4x4[16] = {0, 8, 2, 10, 12, 4, 14, 6, 3, 11, 1, 9, 15, 7, 13, 5};
446 const uint8x16_t dither_matrix = vld1q_u8(bayer4x4);
// Each lane selects its matrix cell from (pixel_offset + lane) mod 16.
450 for (
int i = 0; i < 16; i++) {
451 pos_indices[i] = (pixel_offset + i) & 15;
453 const uint8x16_t position_vec = vld1q_u8(pos_indices);
456 uint8x16_t dither_values = vqtbl1q_u8(dither_matrix, position_vec);
// Scale the matrix values by strength/16 in widened 16-bit lanes.
460 uint16x8_t dither_lo = vmulq_n_u16(vmovl_u8(vget_low_u8(dither_values)), dither_strength);
461 uint16x8_t dither_hi = vmulq_n_u16(vmovl_u8(vget_high_u8(dither_values)), dither_strength);
462 dither_lo = vshrq_n_u16(dither_lo, 4);
463 dither_hi = vshrq_n_u16(dither_hi, 4);
464 uint8x16_t scaled_dither = vcombine_u8(vqmovn_u16(dither_lo), vqmovn_u16(dither_hi));
// Saturating add keeps bright pixels from wrapping.
467 return vqaddq_u8(color, scaled_dither);
// Map 16 RGB pixels to xterm-256 palette indices: the 6x6x6 color cube
// (16..231) for chromatic pixels, and the 24-step gray ramp (232..255) for
// near-gray pixels whose max-min channel spread is below the threshold.
470uint8x16_t palette256_index_dithered_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b,
int pixel_offset) {
// Dither strength 0 makes these calls identity ops, i.e. dithering is
// effectively disabled here. NOTE(review): intentional, or leftover?
472 r = apply_ordered_dither(r, pixel_offset, 0);
473 g = apply_ordered_dither(g, pixel_offset + 1, 0);
474 b = apply_ordered_dither(b, pixel_offset + 2, 0);
// Quantize each channel to 0..5.
477 uint8x16_t R6 = q6_from_u8(r);
478 uint8x16_t G6 = q6_from_u8(g);
479 uint8x16_t B6 = q6_from_u8(b);
// Cube index = 16 + 36*R6 + 6*G6 + B6, computed in widened 16-bit lanes.
482 uint16x8_t R6l = vmovl_u8(vget_low_u8(R6));
483 uint16x8_t R6h = vmovl_u8(vget_high_u8(R6));
484 uint16x8_t G6l = vmovl_u8(vget_low_u8(G6));
485 uint16x8_t G6h = vmovl_u8(vget_high_u8(G6));
486 uint16x8_t B6l = vmovl_u8(vget_low_u8(B6));
487 uint16x8_t B6h = vmovl_u8(vget_high_u8(B6));
489 uint16x8_t idxl = vmlaq_n_u16(vmulq_n_u16(R6l, 36), G6l, 6);
490 uint16x8_t idxh = vmlaq_n_u16(vmulq_n_u16(R6h, 36), G6h, 6);
491 idxl = vaddq_u16(idxl, B6l);
492 idxh = vaddq_u16(idxh, B6h);
493 idxl = vaddq_u16(idxl, vdupq_n_u16(16));
494 idxh = vaddq_u16(idxh, vdupq_n_u16(16));
// Grayness test: channel spread (max - min) below a threshold vector.
// NOTE(review): `thr` is declared on an elided line — presumably
// vdupq_n_u8(CUBE_GRAY_THRESHOLD); confirm in the full source.
497 uint8x16_t maxrg = vmaxq_u8(r, g);
498 uint8x16_t minrg = vminq_u8(r, g);
499 uint8x16_t maxrgb = vmaxq_u8(maxrg, b);
500 uint8x16_t minrgb = vminq_u8(minrg, b);
501 uint8x16_t diff = vsubq_u8(maxrgb, minrgb);
503 uint8x16_t is_gray = vcltq_u8(diff, thr);
// Gray index = 232 + ((Y*23 + 128) >> 8), mapping luma 0..255 to 232..255.
506 uint8x16_t Y = simd_luma_neon(r, g, b);
508 uint16x8_t Yl = vmovl_u8(vget_low_u8(Y));
509 uint16x8_t Yh = vmovl_u8(vget_high_u8(Y));
510 Yl = vmlaq_n_u16(vdupq_n_u16(0), Yl, 23);
511 Yh = vmlaq_n_u16(vdupq_n_u16(0), Yh, 23);
512 Yl = vaddq_u16(Yl, vdupq_n_u16(128));
513 Yh = vaddq_u16(Yh, vdupq_n_u16(128));
514 Yl = vshrq_n_u16(Yl, 8);
515 Yh = vshrq_n_u16(Yh, 8);
516 uint16x8_t gidxl = vaddq_u16(Yl, vdupq_n_u16(232));
517 uint16x8_t gidxh = vaddq_u16(Yh, vdupq_n_u16(232));
// Per-lane select between the gray ramp and the color cube index.
520 uint8x16_t idx_cube = vcombine_u8(vqmovn_u16(idxl), vqmovn_u16(idxh));
521 uint8x16_t idx_gray = vcombine_u8(vqmovn_u16(gidxl), vqmovn_u16(gidxh));
522 return vbslq_u8(is_gray, idx_gray, idx_cube);
// Render an RGB image to a monochrome ASCII/UTF-8 string, 16 pixels per NEON
// iteration. Output buffer size is computed with checked arithmetic as
// h * (w * 4 + 1) bytes (4 = worst-case UTF-8 glyph, +1 per-row newline).
// Caller owns (frees) the returned buffer.
// NOTE(review): many interior lines are elided from this excerpt (error
// returns, the x/pos declarations, allocation, scalar tail bookkeeping,
// newline emission, final termination) — comments below describe only the
// visible code.
529char *render_ascii_image_monochrome_neon(
const image_t *image,
const char *ascii_chars) {
530 if (!image || !image->
pixels || !ascii_chars) {
534 const int h = image->
h;
535 const int w = image->
w;
537 if (h <= 0 || w <= 0) {
544 log_error(
"Failed to get UTF-8 palette cache");
// Build the glyph/length/byte-plane lookup tables once per call.
549 uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
550 build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
// Overflow-checked output sizing: h * (w * max_char_bytes + 1).
554 const size_t max_char_bytes = 4;
557 size_t w_times_bytes;
558 if (checked_size_mul((
size_t)w, max_char_bytes, &w_times_bytes) !=
ASCIICHAT_OK) {
559 log_error(
"Buffer size overflow: width too large for UTF-8 encoding");
563 size_t w_times_bytes_plus_one;
564 if (checked_size_add(w_times_bytes, 1, &w_times_bytes_plus_one) !=
ASCIICHAT_OK) {
565 log_error(
"Buffer size overflow: width * bytes + 1 overflow");
570 if (checked_size_mul((
size_t)h, w_times_bytes_plus_one, &len) !=
ASCIICHAT_OK) {
571 log_error(
"Buffer size overflow: height * (width * bytes + 1) overflow");
577 if (output == NULL) {
582 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->
pixels;
585 for (
int y = 0; y < h; y++) {
586 const rgb_pixel_t *row = &pixels[y * w];
// Vector path: 16 pixels per iteration.
590 for (; x + 15 < w; x += 16) {
// De-interleave 16 RGB pixels into separate R/G/B vectors.
592 uint8x16x3_t rgb = vld3q_u8((
const uint8_t *)(row + x));
// Fixed-point luma: (R*LUMA_RED + G*LUMA_GREEN + B*LUMA_BLUE + 128) >> 8.
595 uint16x8_t luma_lo = vmull_u8(vget_low_u8(rgb.val[0]), vdup_n_u8(
LUMA_RED));
596 luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[1]), vdup_n_u8(
LUMA_GREEN));
597 luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[2]), vdup_n_u8(
LUMA_BLUE));
598 luma_lo = vaddq_u16(luma_lo, vdupq_n_u16(128));
599 luma_lo = vshrq_n_u16(luma_lo, 8);
601 uint16x8_t luma_hi = vmull_u8(vget_high_u8(rgb.val[0]), vdup_n_u8(
LUMA_RED));
602 luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[1]), vdup_n_u8(
LUMA_GREEN));
603 luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[2]), vdup_n_u8(
LUMA_BLUE));
604 luma_hi = vaddq_u16(luma_hi, vdupq_n_u16(128));
605 luma_hi = vshrq_n_u16(luma_hi, 8);
608 uint8x16_t luminance = vcombine_u8(vmovn_u16(luma_lo), vmovn_u16(luma_hi));
// Luma >> 2 gives a 0..63 bucket; the identity table maps it to a cache
// index, then the length table gives each glyph's UTF-8 byte count.
612 uint8x16_t luma_buckets = vshrq_n_u8(luminance, 2);
613 uint8x16_t char_indices = vqtbl4q_u8(tbl, luma_buckets);
618 uint8x16_t char_lengths = vqtbl4q_u8(length_lut, char_indices);
// Fast paths: when all 16 glyphs share one UTF-8 length, use a single
// (interleaved) vector store; otherwise fall back to scalar assembly.
622 if (all_same_length_neon(char_lengths, &uniform_length)) {
624 if (uniform_length == 1) {
626 uint8x16_t ascii_output = vqtbl4q_u8(char_lut, char_indices);
627 vst1q_u8((
uint8_t *)pos, ascii_output);
630 }
else if (uniform_length == 4) {
633 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
634 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
635 uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
636 uint8x16_t byte3_stream = vqtbl4q_u8(char_byte3_lut, char_indices);
// vst4q interleaves the four byte planes back into glyph order.
639 uint8x16x4_t interleaved;
640 interleaved.val[0] = byte0_stream;
641 interleaved.val[1] = byte1_stream;
642 interleaved.val[2] = byte2_stream;
643 interleaved.val[3] = byte3_stream;
646 vst4q_u8((
uint8_t *)pos, interleaved);
649 }
else if (uniform_length == 2) {
651 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
652 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
655 uint8x16x2_t interleaved_2byte;
656 interleaved_2byte.val[0] = byte0_stream;
657 interleaved_2byte.val[1] = byte1_stream;
659 vst2q_u8((
uint8_t *)pos, interleaved_2byte);
662 }
else if (uniform_length == 3) {
664 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
665 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
666 uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
669 uint8x16x3_t interleaved_3byte;
670 interleaved_3byte.val[0] = byte0_stream;
671 interleaved_3byte.val[1] = byte1_stream;
672 interleaved_3byte.val[2] = byte2_stream;
674 vst3q_u8((
uint8_t *)pos, interleaved_3byte);
// Mixed-length fallback: spill the byte planes and copy each glyph's
// bytes individually (length checks are on elided lines).
683 uint8x16_t byte0_vec = vqtbl4q_u8(char_byte0_lut, char_indices);
684 uint8x16_t byte1_vec = vqtbl4q_u8(char_byte1_lut, char_indices);
685 uint8x16_t byte2_vec = vqtbl4q_u8(char_byte2_lut, char_indices);
686 uint8x16_t byte3_vec = vqtbl4q_u8(char_byte3_lut, char_indices);
689 uint8_t byte0_buf[16], byte1_buf[16], byte2_buf[16], byte3_buf[16];
690 vst1q_u8(byte0_buf, byte0_vec);
691 vst1q_u8(byte1_buf, byte1_vec);
692 vst1q_u8(byte2_buf, byte2_vec);
693 vst1q_u8(byte3_buf, byte3_vec);
698 vst1q_u8(char_idx_buf, char_indices);
700 for (
int i = 0; i < 16; i++) {
701 const uint8_t char_idx = char_idx_buf[i];
705 *pos++ = byte0_buf[i];
707 *pos++ = byte1_buf[i];
709 *pos++ = byte2_buf[i];
711 *pos++ = byte3_buf[i];
// Scalar tail for the trailing (<16) pixels of the row.
718 const rgb_pixel_t pixel = row[x];
720 const uint8_t luma_idx = luminance >> 2;
// Render an RGB image to a colored ANSI string (256-color or truecolor,
// foreground or background) with per-run color-change elision and RLE via
// find_rle_run_length_neon. Caller owns the returned buffer.
// NOTE(review): many interior lines are elided (early returns, outbuf
// setup, x declaration, emit_* calls, glyph writes, row resets) — the
// comments below describe only the visible code.
749char *render_ascii_neon_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
750 const char *ascii_chars) {
751 if (!image || !image->
pixels) {
755 const int width = image->
w;
756 const int height = image->
h;
758 if (width <= 0 || height <= 0) {
// Worst-case bytes per cell: 6 for 256-color sequences, 8 for truecolor.
767 size_t bytes_per_pixel = use_256color ? 6u : 8u;
// Overflow-checked capacity: h*w*bytes_per_pixel + h*16 slack.
770 size_t height_times_width;
771 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) !=
ASCIICHAT_OK) {
772 log_error(
"Buffer size overflow: height * width overflow");
776 size_t pixel_data_size;
777 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) !=
ASCIICHAT_OK) {
778 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
782 size_t height_times_16;
783 if (checked_size_mul((
size_t)height, 16u, &height_times_16) !=
ASCIICHAT_OK) {
784 log_error(
"Buffer size overflow: height * 16 overflow");
789 if (checked_size_add(pixel_data_size, height_times_16, &temp) !=
ASCIICHAT_OK) {
790 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
795 log_error(
"Buffer size overflow: total capacity overflow");
806 log_error(
"Failed to get UTF-8 palette cache for NEON color");
811 uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
812 build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
// The byte-plane tables are unused on this path; silence warnings.
818 (void)char_byte0_lut;
819 (void)char_byte1_lut;
820 (void)char_byte2_lut;
821 (void)char_byte3_lut;
// Current emitted color state; -1 forces a color sequence on first use.
824 int curR = -1, curG = -1, curB = -1;
825 int cur_color_idx = -1;
827 for (
int y = 0; y < height; y++) {
828 const rgb_pixel_t *row = &((
const rgb_pixel_t *)image->
pixels)[y * width];
// Vector path: 16 pixels per iteration.
832 while (x + 16 <= width) {
835 uint8x16x3_t pix = vld3q_u8(p);
// Fixed-point luma (same weights as the monochrome renderer; the +128
// rounding add, if any, sits on an elided line here).
838 uint16x8_t ylo = vmull_u8(vget_low_u8(pix.val[0]), vdup_n_u8(
LUMA_RED));
839 ylo = vmlal_u8(ylo, vget_low_u8(pix.val[1]), vdup_n_u8(
LUMA_GREEN));
840 ylo = vmlal_u8(ylo, vget_low_u8(pix.val[2]), vdup_n_u8(
LUMA_BLUE));
842 ylo = vshrq_n_u16(ylo, 8);
844 uint16x8_t yhi = vmull_u8(vget_high_u8(pix.val[0]), vdup_n_u8(
LUMA_RED));
845 yhi = vmlal_u8(yhi, vget_high_u8(pix.val[1]), vdup_n_u8(
LUMA_GREEN));
846 yhi = vmlal_u8(yhi, vget_high_u8(pix.val[2]), vdup_n_u8(
LUMA_BLUE));
848 yhi = vshrq_n_u16(yhi, 8);
850 uint8x16_t y8 = vcombine_u8(vmovn_u16(ylo), vmovn_u16(yhi));
851 uint8x16_t idx = vshrq_n_u8(y8, 2);
854 uint8x16_t char_indices = vqtbl4q_u8(tbl, idx);
// Spill glyph and palette indices for the scalar RLE emission loop.
858 uint8_t char_idx_buf[16], color_indices[16];
859 vst1q_u8(char_idx_buf, char_indices);
862 uint8x16_t color_indices_vec = palette256_index_dithered_neon(pix.val[0], pix.val[1], pix.val[2], x);
863 vst1q_u8(color_indices, color_indices_vec);
// 256-color path: emit one color sequence per run of identical
// (glyph, color) cells, repeating the glyph for the run length.
866 for (
int i = 0; i < 16;) {
867 const uint8_t char_idx = char_idx_buf[i];
869 const uint8_t color_idx = color_indices[i];
873 (
uint32_t)find_rle_run_length_neon(char_idx_buf, color_indices, i, 16, char_idx, color_idx);
875 if (color_idx != cur_color_idx) {
876 if (use_background) {
881 cur_color_idx = color_idx;
888 for (
uint32_t k = 1; k < run; k++) {
// Truecolor path: SIMD-assemble all 16 sequences into a scratch buffer
// and append in one ob_write (16 cells * 50 bytes worst case).
896 char temp_buffer[16 * 50];
897 size_t vectorized_length =
898 neon_assemble_truecolor_sequences_true_simd(char_indices, pix.val[0], pix.val[1], pix.val[2], utf8_cache,
899 temp_buffer,
sizeof(temp_buffer), use_background);
902 ob_write(&ob, temp_buffer, vectorized_length);
// Scalar tail for trailing (<16) pixels of the row.
909 const rgb_pixel_t *p = &row[x];
910 uint32_t R = p->r, G = p->g, B = p->b;
// Extend the run while glyph and 256-color index both repeat.
921 const rgb_pixel_t *q = &row[j];
922 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
926 if (luma_idx2 != luma_idx || color_idx2 != color_idx)
932 if (color_idx != cur_color_idx) {
933 if (use_background) {
938 cur_color_idx = color_idx;
946 for (
uint32_t k = 1; k < run; k++) {
// Truecolor scalar tail: run extends while glyph and exact RGB repeat.
955 const rgb_pixel_t *q = &row[j];
956 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
959 if (luma_idx2 != luma_idx || R2 != R || G2 != G || B2 != B)
965 if ((
int)R != curR || (int)G != curG || (
int)B != curB) {
966 if (use_background) {
// Emit the run's glyphs (first glyph on an elided line).
981 for (
uint32_t k = 1; k < run; k++) {
// Row end: reset tracked color state so the next row re-emits its color.
991 if (y < height - 1) {
994 curR = curG = curB = -1;
// Render raw RGB (stride_bytes per row, defaulting to width*3) as truecolor
// half-block cells: each output cell is U+2580 (upper half block, UTF-8
// E2 96 80) with the top pixel as foreground and the bottom pixel as
// background, consuming two image rows per text row. Black top+bottom is
// treated as transparent. Caller owns the returned buffer.
// NOTE(review): many interior lines are elided (outbuf allocation, x/run
// declarations, emit_* calls, run accounting, row epilogue) — comments
// below describe only the visible code.
1005char *rgb_to_truecolor_halfblocks_neon(
const uint8_t *rgb,
int width,
int height,
int stride_bytes) {
1007 if (width <= 0 || height <= 0)
1009 if (stride_bytes <= 0)
1010 stride_bytes = width * 3;
// Capacity estimate: ~14 bytes per cell plus per-row overhead.
1014 size_t est_cells = (size_t)width * ((
size_t)(height + 1) / 2);
1015 ob.
cap = est_cells * 14u + (size_t)((height + 1) / 2) * 8u + 64u;
// Tracked fg/bg state; -1 forces sequences on first use.
1021 int cur_fr = -1, cur_fg = -1, cur_fb = -1;
1022 int cur_br = -1, cur_bg = -1, cur_bb = -1;
1025 for (
int y = 0; y < height; y += 2) {
1026 const uint8_t *rowT = rgb + (size_t)y * (
size_t)stride_bytes;
// Odd heights: the last text row has no bottom pixel row.
1027 const uint8_t *rowB = (y + 1 < height) ? rowT + (
size_t)stride_bytes : NULL;
// Vector path: load 16 top (and bottom) pixels, then emit scalar runs.
1030 while (x + 16 <= width) {
1032 const uint8_t *pT = rowT + (size_t)x * 3u;
1033 uint8x16x3_t top = vld3q_u8(pT);
1037 const uint8_t *pB = rowB + (size_t)x * 3u;
// No bottom row: duplicate the top pixels as the bottom.
1041 bot.val[0] = top.val[0];
1042 bot.val[1] = top.val[1];
1043 bot.val[2] = top.val[2];
1047 uint8_t Rt[16], Gt[16], Bt[16], Rb[16], Gb[16], Bb[16];
1048 vst1q_u8(Rt, top.val[0]);
1049 vst1q_u8(Gt, top.val[1]);
1050 vst1q_u8(Bt, top.val[2]);
1051 vst1q_u8(Rb, bot.val[0]);
1052 vst1q_u8(Gb, bot.val[1]);
1053 vst1q_u8(Bb, bot.val[2]);
1056 for (
int i = 0; i < 16;) {
1057 uint8_t rT = Rt[i], gT = Gt[i], bT = Bt[i];
1058 uint8_t rB = Rb[i], gB = Gb[i], bB = Bb[i];
// U+2580 UPPER HALF BLOCK in UTF-8.
1061 const uint8_t glyph_utf8[3] = {0xE2, 0x96, 0x80};
// Extend the run while both top and bottom pixels repeat exactly.
1065 for (; j < 16; ++j) {
1066 if (!(Rt[j] == rT && Gt[j] == gT && Bt[j] == bT && Rb[j] == rB && Gb[j] == gB && Bb[j] == bB))
// All-black cells render as reset (transparent) space rather than a
// black-on-black block.
1072 bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);
1074 if (is_transparent) {
1076 if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
1078 cur_fr = cur_fg = cur_fb = -1;
1079 cur_br = cur_bg = cur_bb = -1;
1086 for (
uint32_t k = 1; k < run; ++k) {
// Emit fg/bg sequences only when the color actually changes.
1092 if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
1098 if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
1106 ob_write(&ob, (
const char *)glyph_utf8, 3);
1110 for (
uint32_t k = 1; k < run; ++k) {
1111 ob_write(&ob, (
const char *)glyph_utf8, 3);
// Scalar tail: same cell logic, one column at a time, run-extended.
1122 for (; x < width;) {
1123 const uint8_t *pT = rowT + (size_t)x * 3u;
1124 const uint8_t *pB = rowB ? rowB + (size_t)x * 3u : NULL;
1126 uint8_t rT = pT[0], gT = pT[1], bT = pT[2];
1127 uint8_t rB = rT, gB = gT, bB = bT;
1136 for (; j < width; ++j) {
1137 const uint8_t *qT = rowT + (size_t)j * 3u;
1138 const uint8_t *qB = rowB ? rowB + (size_t)j * 3u : NULL;
1139 uint8_t rT2 = qT[0], gT2 = qT[1], bT2 = qT[2];
1140 uint8_t rB2 = qB ? qB[0] : rT2, gB2 = qB ? qB[1] : gT2, bB2 = qB ? qB[2] : bT2;
1141 if (!((rT2 == rT && gT2 == gT && bT2 == bT) && (rB2 == rB && gB2 == gB && bB2 == bB)))
1147 bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);
1149 if (is_transparent) {
1151 if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
1153 cur_fr = cur_fg = cur_fb = -1;
1154 cur_br = cur_bg = cur_bb = -1;
1161 for (
uint32_t k = 1; k < run; ++k) {
1167 if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
1173 if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
// Same U+2580 glyph, declared as char bytes for ob_write.
1181 static const char HB[3] = {(char)0xE2, (
char)0x96, (char)0x80};
1186 for (
uint32_t k = 1; k < run; ++k) {
// Row end: reset tracked colors so the next text row re-emits state.
1198 if (y + 2 < height) {
1201 cur_fr = cur_fg = cur_fb = -1;
1202 cur_br = cur_bg = cur_bb = -1;
Fast ANSI escape sequence generation.
SIMD-optimized ASCII conversion interface.
#define CUBE_GRAY_THRESHOLD
#define SAFE_MALLOC_SIMD(size, cast)
#define SAFE_MALLOC(size, cast)
unsigned long long uint64_t
#define log_error(...)
Log an ERROR message.
global_dec3_cache_t g_dec3_cache
Global decimal cache instance.
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color background ANSI sequence.
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
void emit_set_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit background color sequence (auto-select mode)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
void init_dec3(void)
Initialize decimal lookup table.
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color foreground ANSI sequence.
void ob_term(outbuf_t *ob)
Append null terminator to buffer.
#define LUMA_THRESHOLD
Luminance threshold for rounding.
void ob_putc(outbuf_t *ob, char c)
Append a character to buffer.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor foreground ANSI sequence.
void emit_rep(outbuf_t *ob, uint32_t extra)
Emit run-length encoded sequence.
void ob_write(outbuf_t *ob, const char *s, size_t n)
Append a string to buffer.
void emit_reset(outbuf_t *ob)
Emit ANSI reset sequence.
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor background ANSI sequence.
void emit_set_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit foreground color sequence (auto-select mode)
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
Convert RGB to 256-color palette index.
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
🔢 Mathematical Utility Functions
NEON-optimized ASCII rendering functions.
Dynamic Output Buffer with ANSI Sequence Support.
✅ Safe Integer Arithmetic and Overflow Detection
Decimal conversion cache structure (1-3 digits)
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
Dynamic output buffer (auto-expanding)
size_t cap
Buffer capacity in bytes (maximum length before reallocation)
char * buf
Buffer pointer (allocated, owned by caller, must be freed)
UTF-8 character structure.
UTF-8 palette cache structure.
⏱️ High-precision timing utilities using sokol_time.h and uthash
Common SIMD utilities and structures.