20#include <ascii-chat/common.h>
21#include <ascii-chat/video/simd/neon.h>
22#include <ascii-chat/video/simd/ascii_simd.h>
23#include <ascii-chat/video/image.h>
24#include <ascii-chat/video/simd/common.h>
25#include <ascii-chat/video/output_buffer.h>
26#include <ascii-chat/video/ansi_fast.h>
27#include <ascii-chat/util/overflow.h>
28#include <ascii-chat/platform/init.h>
29#include <ascii-chat/util/time.h>
30#include <ascii-chat/log/logging.h>
/**
 * @brief Load 64 consecutive bytes into a four-register NEON table.
 *
 * @param bytes Pointer to at least 64 readable bytes.
 * @return Table whose val[k] holds bytes[16*k .. 16*k+15].
 */
static inline uint8x16x4_t neon_load_table64(const uint8_t *bytes) {
  uint8x16x4_t table;
  for (int reg = 0; reg < 4; reg++) {
    table.val[reg] = vld1q_u8(bytes + 16 * reg);
  }
  return table;
}

/**
 * @brief Populate the 64-entry NEON lookup tables from a UTF-8 palette cache.
 *
 * @param utf8_cache     Palette cache whose cache64[] entries supply the glyph bytes and lengths.
 * @param tbl            Receives the identity table (entry i holds the value i).
 * @param char_lut       Receives the first UTF-8 byte of each glyph.
 * @param length_lut     Receives the UTF-8 byte length of each glyph.
 * @param char_byte0_lut Receives UTF-8 byte 0 of each glyph.
 * @param char_byte1_lut Receives UTF-8 byte 1 of each glyph, or 0 when the glyph is shorter.
 * @param char_byte2_lut Receives UTF-8 byte 2 of each glyph, or 0 when the glyph is shorter.
 * @param char_byte3_lut Receives UTF-8 byte 3 of each glyph, or 0 when the glyph is shorter.
 */
static inline void build_neon_lookup_tables(utf8_palette_cache_t *utf8_cache, uint8x16x4_t *tbl, uint8x16x4_t *char_lut,
                                            uint8x16x4_t *length_lut, uint8x16x4_t *char_byte0_lut,
                                            uint8x16x4_t *char_byte1_lut, uint8x16x4_t *char_byte2_lut,
                                            uint8x16x4_t *char_byte3_lut) {
  uint8_t identity[64];
  uint8_t lead_byte[64];
  uint8_t glyph_len[64];
  uint8_t second_byte[64];
  uint8_t third_byte[64];
  uint8_t fourth_byte[64];

  // Gather every per-glyph field in a single pass over the 64 cache slots.
  for (int slot = 0; slot < 64; slot++) {
    const utf8_char_t *glyph = &utf8_cache->cache64[slot];

    identity[slot] = (uint8_t)slot;
    lead_byte[slot] = glyph->utf8_bytes[0];
    glyph_len[slot] = glyph->byte_len;

    // Bytes beyond the glyph's length are zero-filled.
    second_byte[slot] = glyph->byte_len > 1 ? glyph->utf8_bytes[1] : 0;
    third_byte[slot] = glyph->byte_len > 2 ? glyph->utf8_bytes[2] : 0;
    fourth_byte[slot] = glyph->byte_len > 3 ? glyph->utf8_bytes[3] : 0;
  }

  *tbl = neon_load_table64(identity);

  // The single-byte character table and the byte-0 table carry the same data.
  *char_lut = neon_load_table64(lead_byte);
  *length_lut = neon_load_table64(glyph_len);
  *char_byte0_lut = neon_load_table64(lead_byte);
  *char_byte1_lut = neon_load_table64(second_byte);
  *char_byte2_lut = neon_load_table64(third_byte);
  *char_byte3_lut = neon_load_table64(fourth_byte);
}
// Public teardown entry point for the NEON renderer; takes no arguments and returns nothing.
// NOTE(review): confirm which caches, if any, this actually releases.
106void neon_caches_destroy(
void) {
/**
 * @brief Measure how far a run of identical (character, color) pairs extends past start_pos.
 *
 * char_buf and color_buf are parallel byte arrays. Scanning starts at start_pos + 1 and covers
 * max_len - start_pos - 1 elements, comparing each pair against target_char / target_color.
 *
 * NOTE(review): the caller in this file uses the result as a run count that includes the element
 * at start_pos; verify the initial value and return of run_length against that expectation.
 * NOTE(review): that caller passes max_len = 16, so remaining is at most 15 there and the
 * 16-wide loop below would not execute for that call.
 */
112static inline int find_rle_run_length_neon(
const uint8_t *char_buf,
const uint8_t *color_buf,
int start_pos,
113 int max_len, uint8_t target_char, uint8_t target_color) {
// Number of elements left to inspect after start_pos.
117 int remaining = max_len - start_pos - 1;
121 const uint8_t *char_ptr = &char_buf[start_pos + 1];
122 const uint8_t *color_ptr = &color_buf[start_pos + 1];
// Vector path: compare 16 candidate pairs per iteration against the target pair.
125 while (remaining >= 16) {
126 uint8x16_t chars = vld1q_u8(char_ptr);
127 uint8x16_t colors = vld1q_u8(color_ptr);
129 uint8x16_t char_match = vceqq_u8(chars, vdupq_n_u8(target_char));
130 uint8x16_t color_match = vceqq_u8(colors, vdupq_n_u8(target_color));
// A lane is 0xFF only when both the character and the color matched.
131 uint8x16_t both_match = vandq_u8(char_match, color_match);
// Every lane is 0x00 or 0xFF, so a horizontal minimum of 0xFF means all 16 lanes matched.
135 uint8_t min_match = vminvq_u8(both_match);
137 if (min_match == 0xFF) {
// Split the mask into two 64-bit halves so it can be inspected one byte (lane) at a time.
145 uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 0);
146 uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 1);
148 int matches_found = 0;
150 for (
int i = 0; i < 8; i++) {
151 if ((mask_lo >> (i * 8)) & 0xFF) {
// The high half is only examined when all eight low lanes matched.
159 if (matches_found == 8) {
160 for (
int i = 0; i < 8; i++) {
161 if ((mask_hi >> (i * 8)) & 0xFF) {
169 run_length += matches_found;
// Scalar tail: keep going one element at a time while both bytes still match.
175 while (remaining > 0 && *char_ptr == target_char && *color_ptr == target_color) {
/**
 * @brief Test whether all 16 lanes of a length vector hold the same value.
 *
 * @param lengths    Vector of 16 per-glyph byte lengths.
 * @param out_length Receives the shared length when the function returns true;
 *                   left untouched when it returns false.
 * @return true when every lane equals lane 0, false otherwise.
 */
static inline bool all_same_length_neon(uint8x16_t lengths, uint8_t *out_length) {
  const uint8_t lane0 = vgetq_lane_u8(lengths, 0);
  const uint8x16_t equal_mask = vceqq_u8(lengths, vdupq_n_u8(lane0));

  // Each mask lane is 0x00 or 0xFF, so the horizontal minimum is 0xFF only
  // when no lane differed from lane 0.
  if (vminvq_u8(equal_mask) != 0xFF) {
    return false;
  }

  *out_length = lane0;
  return true;
}
// Flat lookup table with one 4-byte slot for each of the 256 possible 8-bit values.
207static uint8_t neon_decimal_table_data[256 * 4];
// Set to true once init_neon_decimal_table() has filled the table.
208static bool neon_decimal_table_initialized =
false;
// Held by init_neon_decimal_table() while it checks and updates the flag above.
210static static_mutex_t g_neon_table_init_mutex = STATIC_MUTEX_INIT;
/**
 * @brief Fill neon_decimal_table_data once, under g_neon_table_init_mutex.
 *
 * Each of the 256 slots is 4 bytes. Bytes 1..3 receive digit characters taken from dec->s,
 * padded with '0' where dec->len is shorter.
 *
 * NOTE(review): the definition of dec and the store to entry[0] are expected next to the
 * assignments below; neon_assemble_truecolor_sequences_true_simd reads entry[0] as the number
 * of digit bytes to copy, so confirm entry[0] is set to the digit count.
 */
214void init_neon_decimal_table(
void) {
215 static_mutex_lock(&g_neon_table_init_mutex);
// Table already built by an earlier call: release the lock instead of rebuilding.
218 if (neon_decimal_table_initialized) {
219 static_mutex_unlock(&g_neon_table_init_mutex);
// One slot per 8-bit value.
229 for (
int i = 0; i < 256; i++) {
231 uint8_t *entry = &neon_decimal_table_data[i * 4];
// Digit characters, '0'-padded beyond dec->len.
233 entry[1] = (dec->len >= 1) ? dec->s[0] :
'0';
234 entry[2] = (dec->len >= 2) ? dec->s[1] :
'0';
235 entry[3] = (dec->len >= 3) ? dec->s[2] :
'0';
// Publish completion while still holding the mutex, then release it.
238 neon_decimal_table_initialized =
true;
239 static_mutex_unlock(&g_neon_table_init_mutex);
/**
 * @brief Build the truecolor escape sequence plus glyph for each of 16 pixels into a byte buffer.
 *
 * @param char_indices    Per-pixel index into utf8_cache->cache64.
 * @param r_vals          Per-pixel red components.
 * @param g_vals          Per-pixel green components.
 * @param b_vals          Per-pixel blue components.
 * @param utf8_cache      Palette cache supplying each glyph's UTF-8 bytes and length.
 * @param output_buffer   Destination buffer.
 * @param buffer_capacity Size of output_buffer in bytes.
 * @param use_background  Selects the "\033[48;2;" prefix instead of "\033[38;2;".
 * @return Number of bytes written, computed as dst - output_buffer.
 *
 * Decimal digits come from neon_decimal_table_data, whose slots are read here as
 * [digit count, digit, digit, digit].
 *
 * NOTE(review): the three "+ 1" terms in seq_len presumably cover the two ';' separators and
 * the final 'm' — confirm against the writes between the memcpy calls.
 */
247static inline size_t neon_assemble_truecolor_sequences_true_simd(uint8x16_t char_indices, uint8x16_t r_vals,
248 uint8x16_t g_vals, uint8x16_t b_vals,
249 utf8_palette_cache_t *utf8_cache,
char *output_buffer,
250 size_t buffer_capacity,
bool use_background) {
257 char *dst = output_buffer;
// Spill the vectors to scalar arrays so each lane can be handled individually.
260 uint8_t char_idx_buf[16], r_buf[16], g_buf[16], b_buf[16];
261 vst1q_u8(char_idx_buf, char_indices);
262 vst1q_u8(r_buf, r_vals);
263 vst1q_u8(g_buf, g_vals);
264 vst1q_u8(b_buf, b_vals);
266 size_t total_written = 0;
267 const char *prefix = use_background ?
"\033[48;2;" :
"\033[38;2;";
// Both prefixes are 7 bytes long.
268 const size_t prefix_len = 7;
272 for (
int i = 0; i < 16; i++) {
// Each component value selects its 4-byte slot in the decimal table.
274 const uint8_t *r_entry = &neon_decimal_table_data[r_buf[i] * 4];
275 const uint8_t *g_entry = &neon_decimal_table_data[g_buf[i] * 4];
276 const uint8_t *b_entry = &neon_decimal_table_data[b_buf[i] * 4];
278 const uint8_t char_idx = char_idx_buf[i];
279 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
// Total bytes this pixel needs: prefix, three digit groups with one extra byte each, and the glyph.
282 size_t seq_len = prefix_len + r_entry[0] + 1 + g_entry[0] + 1 + b_entry[0] + 1 + char_info->byte_len;
// Capacity guard.
// NOTE(review): buffer_capacity - seq_len is unsigned and wraps if seq_len ever exceeds
// buffer_capacity; comparing seq_len against (buffer_capacity - total_written) would avoid that.
283 if (total_written >= buffer_capacity - seq_len) {
288 memcpy(dst, prefix, prefix_len);
// Digit bytes start at offset 1 of each slot; offset 0 holds their count.
292 memcpy(dst, &r_entry[1], r_entry[0]);
296 memcpy(dst, &g_entry[1], g_entry[0]);
300 memcpy(dst, &b_entry[1], b_entry[0]);
// Append the glyph's UTF-8 bytes after the escape sequence.
305 memcpy(dst, char_info->utf8_bytes, char_info->byte_len);
306 dst += char_info->byte_len;
308 total_written = dst - output_buffer;
311 return total_written;
/**
 * @brief Weighted luma for four pixels: (LUMA_RED*r + LUMA_GREEN*g + LUMA_BLUE*b), rounded >> 8.
 *
 * @param r Four red components widened to 16 bits.
 * @param g Four green components widened to 16 bits.
 * @param b Four blue components widened to 16 bits.
 * @return Four 16-bit luma values.
 */
static inline uint16x4_t simd_luma_weighted4_neon(uint16x4_t r, uint16x4_t g, uint16x4_t b) {
  uint32x4_t acc = vmull_n_u16(r, LUMA_RED);
  acc = vmlal_n_u16(acc, g, LUMA_GREEN);
  acc = vmlal_n_u16(acc, b, LUMA_BLUE);
  // Rounding shift right by 8 while narrowing 32 -> 16 bits.
  return vrshrn_n_u32(acc, 8);
}

/**
 * @brief Compute 8-bit luma for 16 pixels from separate R, G and B vectors.
 *
 * The weighted sum is accumulated in 32-bit lanes, shifted right by 8 with rounding,
 * and finally narrowed to 8 bits with saturation.
 *
 * @param r 16 red components.
 * @param g 16 green components.
 * @param b 16 blue components.
 * @return 16 luma values.
 */
static inline uint8x16_t simd_luma_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b) {
  // Widen each channel to 16 bits, split into pixels 0-7 and 8-15.
  const uint16x8_t r_lo = vmovl_u8(vget_low_u8(r));
  const uint16x8_t r_hi = vmovl_u8(vget_high_u8(r));
  const uint16x8_t g_lo = vmovl_u8(vget_low_u8(g));
  const uint16x8_t g_hi = vmovl_u8(vget_high_u8(g));
  const uint16x8_t b_lo = vmovl_u8(vget_low_u8(b));
  const uint16x8_t b_hi = vmovl_u8(vget_high_u8(b));

  const uint16x8_t luma_lo =
      vcombine_u16(simd_luma_weighted4_neon(vget_low_u16(r_lo), vget_low_u16(g_lo), vget_low_u16(b_lo)),
                   simd_luma_weighted4_neon(vget_high_u16(r_lo), vget_high_u16(g_lo), vget_high_u16(b_lo)));
  const uint16x8_t luma_hi =
      vcombine_u16(simd_luma_weighted4_neon(vget_low_u16(r_hi), vget_low_u16(g_hi), vget_low_u16(b_hi)),
                   simd_luma_weighted4_neon(vget_high_u16(r_hi), vget_high_u16(g_hi), vget_high_u16(b_hi)));

  return vcombine_u8(vqmovn_u16(luma_lo), vqmovn_u16(luma_hi));
}
/**
 * @brief Quantize 16 8-bit values to six levels (0..5).
 *
 * Each lane becomes (x * 5 + 128) >> 8.
 *
 * @param x 16 input values in 0..255.
 * @return 16 quantized values in 0..5.
 */
static inline uint8x16_t q6_from_u8(uint8x16_t x) {
  const uint16x8_t scaled_lo = vmulq_n_u16(vmovl_u8(vget_low_u8(x)), 5);
  const uint16x8_t scaled_hi = vmulq_n_u16(vmovl_u8(vget_high_u8(x)), 5);

  // Rounding narrowing shift adds 128 before shifting right by 8; the result is at most 5,
  // so narrowing to 8 bits never loses information.
  return vcombine_u8(vrshrn_n_u16(scaled_lo, 8), vrshrn_n_u16(scaled_hi, 8));
}
// Fallback value for CUBE_GRAY_THRESHOLD, used only when no earlier definition supplies one.
369#ifndef CUBE_GRAY_THRESHOLD
370#define CUBE_GRAY_THRESHOLD 10
/**
 * @brief Add a position-dependent ordered-dither offset to 16 color components.
 *
 * Lane i picks entry (pixel_offset + i) & 15 of a 16-entry Bayer table, scales it by
 * dither_strength / 16, and adds it to the color with saturation at 255.
 *
 * @param color           16 color components.
 * @param pixel_offset    Position of lane 0; only its low four bits affect the result.
 * @param dither_strength Scale factor; 0 leaves the color unchanged.
 * @return Dithered components.
 */
static inline uint8x16_t apply_ordered_dither(uint8x16_t color, int pixel_offset, uint8_t dither_strength) {
  static const uint8_t bayer_table[16] = {0, 8, 2, 10, 12, 4, 14, 6, 3, 11, 1, 9, 15, 7, 13, 5};
  static const uint8_t lane_numbers[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  // Table position for each lane: (pixel_offset + lane) modulo 16.
  const uint8x16_t lane_pos =
      vandq_u8(vaddq_u8(vdupq_n_u8((uint8_t)pixel_offset), vld1q_u8(lane_numbers)), vdupq_n_u8(15));
  const uint8x16_t thresholds = vqtbl1q_u8(vld1q_u8(bayer_table), lane_pos);

  // threshold * strength fits in 16 bits (15 * 255); after >> 4 it also fits in 8 bits.
  const uint8x8_t strength = vdup_n_u8(dither_strength);
  const uint8x8_t offset_lo = vshrn_n_u16(vmull_u8(vget_low_u8(thresholds), strength), 4);
  const uint8x8_t offset_hi = vshrn_n_u16(vmull_u8(vget_high_u8(thresholds), strength), 4);

  return vqaddq_u8(color, vcombine_u8(offset_lo, offset_hi));
}
/**
 * @brief Map 16 RGB pixels to 8-bit palette indices, choosing between a 6x6x6 cube index
 *        and a grayscale-ramp index per pixel.
 *
 * @param r            16 red components.
 * @param g            16 green components.
 * @param b            16 blue components.
 * @param pixel_offset Position of the first pixel, forwarded to apply_ordered_dither.
 * @return 16 palette indices: 16 + 36*R6 + 6*G6 + B6 for cube colors, 232 + gray level for
 *         pixels classified as gray.
 *
 * NOTE(review): thr is defined on a line not reproduced here — presumably a broadcast of
 * CUBE_GRAY_THRESHOLD; confirm.
 */
403uint8x16_t palette256_index_dithered_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b,
int pixel_offset) {
// Strength 0 makes the scaled dither offset zero, so these calls leave r, g and b unchanged.
405 r = apply_ordered_dither(r, pixel_offset, 0);
406 g = apply_ordered_dither(g, pixel_offset + 1, 0);
407 b = apply_ordered_dither(b, pixel_offset + 2, 0);
// Quantize each channel to six levels (0..5).
410 uint8x16_t R6 = q6_from_u8(r);
411 uint8x16_t G6 = q6_from_u8(g);
412 uint8x16_t B6 = q6_from_u8(b);
// Widen to 16 bits so the index arithmetic (up to 16 + 215) is done without 8-bit wraparound.
415 uint16x8_t R6l = vmovl_u8(vget_low_u8(R6));
416 uint16x8_t R6h = vmovl_u8(vget_high_u8(R6));
417 uint16x8_t G6l = vmovl_u8(vget_low_u8(G6));
418 uint16x8_t G6h = vmovl_u8(vget_high_u8(G6));
419 uint16x8_t B6l = vmovl_u8(vget_low_u8(B6));
420 uint16x8_t B6h = vmovl_u8(vget_high_u8(B6));
// Cube index: 36*R6 + 6*G6 + B6 + 16.
422 uint16x8_t idxl = vmlaq_n_u16(vmulq_n_u16(R6l, 36), G6l, 6);
423 uint16x8_t idxh = vmlaq_n_u16(vmulq_n_u16(R6h, 36), G6h, 6);
424 idxl = vaddq_u16(idxl, B6l);
425 idxh = vaddq_u16(idxh, B6h);
426 idxl = vaddq_u16(idxl, vdupq_n_u16(16));
427 idxh = vaddq_u16(idxh, vdupq_n_u16(16));
// Gray test: a pixel counts as gray when max(r,g,b) - min(r,g,b) is below thr.
430 uint8x16_t maxrg = vmaxq_u8(r, g);
431 uint8x16_t minrg = vminq_u8(r, g);
432 uint8x16_t maxrgb = vmaxq_u8(maxrg, b);
433 uint8x16_t minrgb = vminq_u8(minrg, b);
434 uint8x16_t diff = vsubq_u8(maxrgb, minrgb);
436 uint8x16_t is_gray = vcltq_u8(diff, thr);
// Gray index: scale luma to 0..23 via (Y * 23 + 128) >> 8, then offset by 232.
439 uint8x16_t Y = simd_luma_neon(r, g, b);
441 uint16x8_t Yl = vmovl_u8(vget_low_u8(Y));
442 uint16x8_t Yh = vmovl_u8(vget_high_u8(Y));
443 Yl = vmlaq_n_u16(vdupq_n_u16(0), Yl, 23);
444 Yh = vmlaq_n_u16(vdupq_n_u16(0), Yh, 23);
445 Yl = vaddq_u16(Yl, vdupq_n_u16(128));
446 Yh = vaddq_u16(Yh, vdupq_n_u16(128));
447 Yl = vshrq_n_u16(Yl, 8);
448 Yh = vshrq_n_u16(Yh, 8);
449 uint16x8_t gidxl = vaddq_u16(Yl, vdupq_n_u16(232));
450 uint16x8_t gidxh = vaddq_u16(Yh, vdupq_n_u16(232));
// Narrow both candidates to 8 bits and pick per lane: gray index where is_gray, cube index elsewhere.
453 uint8x16_t idx_cube = vcombine_u8(vqmovn_u16(idxl), vqmovn_u16(idxh));
454 uint8x16_t idx_gray = vcombine_u8(vqmovn_u16(gidxl), vqmovn_u16(gidxh));
455 return vbslq_u8(is_gray, idx_gray, idx_cube);
/**
 * @brief Render an image as uncolored text, one glyph per pixel, using NEON for 16-pixel chunks.
 *
 * Each pixel's luminance is reduced to a 6-bit bucket (0..63) that selects a glyph from the
 * UTF-8 palette cache. Glyphs may be 1 to 4 bytes long.
 *
 * @param image       Source image; must be non-NULL with non-NULL pixels.
 * @param ascii_chars Palette string; must be non-NULL.
 * @return Buffer allocated with SAFE_MALLOC_SIMD holding the rendered text.
 *
 * NOTE(review): the values returned on the early-exit/error paths and the ownership of the
 * returned buffer should be confirmed against the caller contract.
 */
462char *render_ascii_image_monochrome_neon(
const image_t *image,
const char *ascii_chars) {
463 if (!image || !image->pixels || !ascii_chars) {
467 const int h = image->h;
468 const int w = image->w;
470 if (h <= 0 || w <= 0) {
477 log_error(
"Failed to get UTF-8 palette cache");
// Build the 64-entry NEON lookup tables for this palette.
482 uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
483 build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
// Output size: h * (w * 4 + 1), sized for the longest (4-byte) glyph per pixel plus one extra
// byte per row. Every step is overflow-checked.
487 const size_t max_char_bytes = 4;
490 size_t w_times_bytes;
491 if (checked_size_mul((
size_t)w, max_char_bytes, &w_times_bytes) != ASCIICHAT_OK) {
492 log_error(
"Buffer size overflow: width too large for UTF-8 encoding");
496 size_t w_times_bytes_plus_one;
497 if (checked_size_add(w_times_bytes, 1, &w_times_bytes_plus_one) != ASCIICHAT_OK) {
498 log_error(
"Buffer size overflow: width * bytes + 1 overflow");
503 if (checked_size_mul((
size_t)h, w_times_bytes_plus_one, &len) != ASCIICHAT_OK) {
504 log_error(
"Buffer size overflow: height * (width * bytes + 1) overflow");
509 char *output = SAFE_MALLOC_SIMD(len,
char *);
510 if (output == NULL) {
515 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->pixels;
518 for (
int y = 0; y < h; y++) {
519 const rgb_pixel_t *row = &pixels[y * w];
// Vector path: 16 pixels per iteration while a full chunk remains in the row.
523 for (; x + 15 < w; x += 16) {
// De-interleaving load: val[0] = 16 reds, val[1] = 16 greens, val[2] = 16 blues.
525 uint8x16x3_t rgb = vld3q_u8((
const uint8_t *)(row + x));
// Luminance = (LUMA_RED*R + LUMA_GREEN*G + LUMA_BLUE*B + 128) >> 8, for pixels 0-7 ...
528 uint16x8_t luma_lo = vmull_u8(vget_low_u8(rgb.val[0]), vdup_n_u8(LUMA_RED));
529 luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[1]), vdup_n_u8(LUMA_GREEN));
530 luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[2]), vdup_n_u8(LUMA_BLUE));
531 luma_lo = vaddq_u16(luma_lo, vdupq_n_u16(128));
532 luma_lo = vshrq_n_u16(luma_lo, 8);
// ... and for pixels 8-15.
534 uint16x8_t luma_hi = vmull_u8(vget_high_u8(rgb.val[0]), vdup_n_u8(LUMA_RED));
535 luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[1]), vdup_n_u8(LUMA_GREEN));
536 luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[2]), vdup_n_u8(LUMA_BLUE));
537 luma_hi = vaddq_u16(luma_hi, vdupq_n_u16(128));
538 luma_hi = vshrq_n_u16(luma_hi, 8);
541 uint8x16_t luminance = vcombine_u8(vmovn_u16(luma_lo), vmovn_u16(luma_hi));
// Drop the two low bits to get a 0..63 bucket, then map it through tbl to a glyph index.
545 uint8x16_t luma_buckets = vshrq_n_u8(luminance, 2);
546 uint8x16_t char_indices = vqtbl4q_u8(tbl, luma_buckets);
// Byte length of each of the 16 selected glyphs.
551 uint8x16_t char_lengths = vqtbl4q_u8(length_lut, char_indices);
// Fast paths: when all 16 glyphs share one byte length, emit them with a single vector store.
554 uint8_t uniform_length;
555 if (all_same_length_neon(char_lengths, &uniform_length)) {
557 if (uniform_length == 1) {
// Single-byte glyphs: one lookup and one 16-byte store.
559 uint8x16_t ascii_output = vqtbl4q_u8(char_lut, char_indices);
560 vst1q_u8((uint8_t *)pos, ascii_output);
563 }
else if (uniform_length == 4) {
// 4-byte glyphs: look up each byte position, then an interleaving store writes 64 bytes in glyph order.
566 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
567 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
568 uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
569 uint8x16_t byte3_stream = vqtbl4q_u8(char_byte3_lut, char_indices);
572 uint8x16x4_t interleaved;
573 interleaved.val[0] = byte0_stream;
574 interleaved.val[1] = byte1_stream;
575 interleaved.val[2] = byte2_stream;
576 interleaved.val[3] = byte3_stream;
579 vst4q_u8((uint8_t *)pos, interleaved);
582 }
else if (uniform_length == 2) {
// 2-byte glyphs: interleaving store writes 32 bytes.
584 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
585 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
588 uint8x16x2_t interleaved_2byte;
589 interleaved_2byte.val[0] = byte0_stream;
590 interleaved_2byte.val[1] = byte1_stream;
592 vst2q_u8((uint8_t *)pos, interleaved_2byte);
595 }
else if (uniform_length == 3) {
// 3-byte glyphs: interleaving store writes 48 bytes.
597 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
598 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
599 uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
602 uint8x16x3_t interleaved_3byte;
603 interleaved_3byte.val[0] = byte0_stream;
604 interleaved_3byte.val[1] = byte1_stream;
605 interleaved_3byte.val[2] = byte2_stream;
607 vst3q_u8((uint8_t *)pos, interleaved_3byte);
// Mixed lengths: look up all four byte positions with NEON, then emit each glyph lane by lane.
616 uint8x16_t byte0_vec = vqtbl4q_u8(char_byte0_lut, char_indices);
617 uint8x16_t byte1_vec = vqtbl4q_u8(char_byte1_lut, char_indices);
618 uint8x16_t byte2_vec = vqtbl4q_u8(char_byte2_lut, char_indices);
619 uint8x16_t byte3_vec = vqtbl4q_u8(char_byte3_lut, char_indices);
622 uint8_t byte0_buf[16], byte1_buf[16], byte2_buf[16], byte3_buf[16];
623 vst1q_u8(byte0_buf, byte0_vec);
624 vst1q_u8(byte1_buf, byte1_vec);
625 vst1q_u8(byte2_buf, byte2_vec);
626 vst1q_u8(byte3_buf, byte3_vec);
630 uint8_t char_idx_buf[16];
631 vst1q_u8(char_idx_buf, char_indices);
633 for (
int i = 0; i < 16; i++) {
634 const uint8_t char_idx = char_idx_buf[i];
635 const uint8_t byte_len = utf8_cache->cache64[char_idx].byte_len;
// NOTE(review): the stores below are presumably guarded by byte_len checks — confirm.
638 *pos++ = byte0_buf[i];
640 *pos++ = byte1_buf[i];
642 *pos++ = byte2_buf[i];
644 *pos++ = byte3_buf[i];
// Scalar handling of a single pixel, using the same luminance formula and bucket as the vector path.
651 const rgb_pixel_t pixel = row[x];
652 const uint8_t luminance = (LUMA_RED * pixel.r + LUMA_GREEN * pixel.g + LUMA_BLUE * pixel.b + 128) >> 8;
653 const uint8_t luma_idx = luminance >> 2;
654 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
// Single-byte glyphs are stored directly; longer ones are copied with memcpy.
656 if (char_info->byte_len == 1) {
657 *pos++ = char_info->utf8_bytes[0];
660 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
661 pos += char_info->byte_len;
/**
 * @brief Render an image as colored text, in either 256-color or truecolor mode, with the color
 *        applied to the foreground or the background.
 *
 * Rows are processed in 16-pixel NEON chunks with a scalar loop for the remainder. Runs of
 * identical (glyph, color) pixels are detected so the color sequence is only emitted when the
 * color changes.
 *
 * @param image          Source image; must be non-NULL with non-NULL pixels.
 * @param use_background Apply color to the background instead of the foreground.
 * @param use_256color   Emit 256-color palette indices instead of 24-bit RGB.
 * @param ascii_chars    Palette string used to obtain the UTF-8 palette cache.
 * @return Heap buffer holding the rendered output.
 *
 * NOTE(review): the values returned on the error paths and the ownership of the returned
 * buffer should be confirmed against the caller contract.
 */
682char *render_ascii_neon_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
683 const char *ascii_chars) {
684 if (!image || !image->pixels) {
688 const int width = image->w;
689 const int height = image->h;
// Degenerate image: a 1-byte buffer is allocated instead of running the renderer.
691 if (width <= 0 || height <= 0) {
693 empty = SAFE_MALLOC(1,
char *);
// Initial capacity estimate: height * width * bytes_per_pixel + height * 16 + 64,
// with every multiplication and addition overflow-checked.
700 size_t bytes_per_pixel = use_256color ? 6u : 8u;
703 size_t height_times_width;
704 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) != ASCIICHAT_OK) {
705 log_error(
"Buffer size overflow: height * width overflow");
709 size_t pixel_data_size;
710 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
711 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
715 size_t height_times_16;
716 if (checked_size_mul((
size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
717 log_error(
"Buffer size overflow: height * 16 overflow");
722 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
723 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
727 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
728 log_error(
"Buffer size overflow: total capacity overflow");
732 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
// Timed phase: obtain the UTF-8 palette cache.
736 START_TIMER(
"neon_utf8_cache");
740 log_error(
"Failed to get UTF-8 palette cache for NEON color");
743 STOP_TIMER_AND_LOG_EVERY(dev, 3 * NS_PER_SEC_INT, 3 * NS_PER_MS_INT,
"neon_utf8_cache",
744 "NEON_UTF8_CACHE: Complete (%.2f ms)");
// Timed phase: build the NEON lookup tables.
746 START_TIMER(
"neon_lookup_tables");
748 uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
749 build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
751 STOP_TIMER_AND_LOG_EVERY(dev, 3 * NS_PER_SEC_INT, 3 * NS_PER_MS_INT,
"neon_lookup_tables",
752 "NEON_LOOKUP_TABLES: Complete (%.2f ms)");
// The per-byte tables are not used by this renderer; the casts silence unused-variable warnings.
757 (void)char_byte0_lut;
758 (void)char_byte1_lut;
759 (void)char_byte2_lut;
760 (void)char_byte3_lut;
762 START_TIMER(
"neon_main_loop");
766 int chunks_256color = 0, chunks_truecolor = 0;
// The truecolor assembler reads the decimal lookup table, so make sure it is built.
769 init_neon_decimal_table();
772 for (
int y = 0; y < height; y++) {
// Color trackers start each row at -1, which no real color or index equals.
774 int curR = -1, curG = -1, curB = -1;
775 int cur_color_idx = -1;
777 const rgb_pixel_t *row = &((
const rgb_pixel_t *)image->pixels)[y * width];
// Vector path: full 16-pixel chunks.
781 while (x + 16 <= width) {
783 const uint8_t *p = (
const uint8_t *)(row + x);
// De-interleaving load: val[0] = reds, val[1] = greens, val[2] = blues.
784 uint8x16x3_t pix = vld3q_u8(p);
// Luma = (LUMA_RED*R + LUMA_GREEN*G + LUMA_BLUE*B + LUMA_THRESHOLD) >> 8, low then high 8 pixels.
787 uint16x8_t ylo = vmull_u8(vget_low_u8(pix.val[0]), vdup_n_u8(LUMA_RED));
788 ylo = vmlal_u8(ylo, vget_low_u8(pix.val[1]), vdup_n_u8(LUMA_GREEN));
789 ylo = vmlal_u8(ylo, vget_low_u8(pix.val[2]), vdup_n_u8(LUMA_BLUE));
790 ylo = vaddq_u16(ylo, vdupq_n_u16(LUMA_THRESHOLD));
791 ylo = vshrq_n_u16(ylo, 8);
793 uint16x8_t yhi = vmull_u8(vget_high_u8(pix.val[0]), vdup_n_u8(LUMA_RED));
794 yhi = vmlal_u8(yhi, vget_high_u8(pix.val[1]), vdup_n_u8(LUMA_GREEN));
795 yhi = vmlal_u8(yhi, vget_high_u8(pix.val[2]), vdup_n_u8(LUMA_BLUE));
796 yhi = vaddq_u16(yhi, vdupq_n_u16(LUMA_THRESHOLD));
797 yhi = vshrq_n_u16(yhi, 8);
// Narrow to 8 bits and drop two bits to get the 0..63 glyph bucket.
799 uint8x16_t y8 = vcombine_u8(vmovn_u16(ylo), vmovn_u16(yhi));
800 uint8x16_t idx = vshrq_n_u8(y8, 2);
803 uint8x16_t char_indices = vqtbl4q_u8(tbl, idx);
// 256-color handling: compute palette indices with NEON, then walk the 16 lanes as runs.
808 uint8_t char_idx_buf[16], color_indices[16];
809 vst1q_u8(char_idx_buf, char_indices);
812 uint8x16_t color_indices_vec = palette256_index_dithered_neon(pix.val[0], pix.val[1], pix.val[2], x);
813 vst1q_u8(color_indices, color_indices_vec);
// The loop header has no increment; advancing i is left to the loop body.
816 for (
int i = 0; i < 16;) {
817 const uint8_t char_idx = char_idx_buf[i];
818 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
819 const uint8_t color_idx = color_indices[i];
823 (uint32_t)find_rle_run_length_neon(char_idx_buf, color_indices, i, 16, char_idx, color_idx);
// Only touch the color state when the palette index differs from the last one emitted.
825 if (color_idx != cur_color_idx) {
826 if (use_background) {
831 cur_color_idx = color_idx;
834 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Write the glyph once more for each remaining pixel of the run.
838 for (uint32_t k = 1; k < run; k++) {
839 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Truecolor handling: assemble all 16 sequences into a stack buffer (50 bytes of room per
// pixel), then append the assembled bytes to the output buffer in one write.
847 char temp_buffer[16 * 50];
848 size_t vectorized_length =
849 neon_assemble_truecolor_sequences_true_simd(char_indices, pix.val[0], pix.val[1], pix.val[2], utf8_cache,
850 temp_buffer,
sizeof(temp_buffer), use_background);
853 ob_write(&ob, temp_buffer, vectorized_length);
// Scalar handling for pixels left over after the 16-wide chunks.
860 const rgb_pixel_t *p = &row[x];
861 uint32_t R = p->r, G = p->g, B = p->b;
862 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
863 uint8_t luma_idx = Y >> 2;
864 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
// 256-color scalar handling: a run continues while both the glyph bucket and the palette index repeat.
868 uint8_t color_idx =
rgb_to_256color((uint8_t)R, (uint8_t)G, (uint8_t)B);
872 const rgb_pixel_t *q = &row[j];
873 uint32_t
R2 = q->r, G2 = q->g, B2 = q->b;
874 uint8_t Y2 = (uint8_t)((LUMA_RED *
R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
875 uint8_t luma_idx2 = Y2 >> 2;
877 if (luma_idx2 != luma_idx || color_idx2 != color_idx)
881 uint32_t run = (uint32_t)(j - x);
883 if (color_idx != cur_color_idx) {
884 if (use_background) {
889 cur_color_idx = color_idx;
893 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
897 for (uint32_t k = 1; k < run; k++) {
898 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Truecolor scalar handling: a run continues while the glyph bucket and the exact RGB triple repeat.
906 const rgb_pixel_t *q = &row[j];
907 uint32_t
R2 = q->r, G2 = q->g, B2 = q->b;
908 uint8_t Y2 = (uint8_t)((LUMA_RED *
R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
909 uint8_t luma_idx2 = Y2 >> 2;
910 if (luma_idx2 != luma_idx ||
R2 != R || G2 != G || B2 != B)
914 uint32_t run = (uint32_t)(j - x);
916 if ((
int)R != curR || (int)G != curG || (
int)B != curB) {
917 if (use_background) {
928 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
932 for (uint32_t k = 1; k < run; k++) {
933 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Row-separator handling applies to every row except the last.
942 if (y < height - 1) {
// NOTE(review): loop_time_ms is uint64_t but is printed with %llu below; PRIu64 or a cast to
// unsigned long long would make the format portable. The height * width log argument is an
// int multiplication and can overflow for very large images.
948 uint64_t loop_time_ms = (loop_end_ns - loop_start_ns) / NS_PER_MS_INT;
949 log_dev(
"NEON_MAIN_LOOP_ACTUAL: %llu ms for %d rows, %d width", loop_time_ms, height, width);
953 "NEON_MAIN_LOOP processed %d rows x %d width = %d pixels in %llu ms (256color: %d chunks, truecolor: %d chunks)",
954 height, width, height * width, loop_time_ms, chunks_256color, chunks_truecolor);
956 STOP_TIMER_AND_LOG_EVERY(dev, 3 * NS_PER_SEC_INT, 5 * NS_PER_MS_INT,
"neon_main_loop",
957 "NEON_MAIN_LOOP: Complete (%.2f ms)");
/**
 * @brief Render packed RGB data as truecolor half-block text, two image rows per text row.
 *
 * Each output cell covers one pixel from an upper row and one from the row below it and is
 * drawn with the UTF-8 bytes E2 96 80 (U+2580, upper half block). When the image has an odd
 * number of rows, the last upper row is reused as its own lower row.
 *
 * @param rgb          Packed 3-bytes-per-pixel image data.
 * @param width        Image width in pixels.
 * @param height       Image height in pixels.
 * @param stride_bytes Bytes between the starts of consecutive rows; values <= 0 select width * 3.
 * @return Heap buffer holding the rendered output.
 *
 * NOTE(review): the capacity arithmetic below is unchecked, unlike the checked_size_mul /
 * checked_size_add sequence used by the other renderers in this file; width * 3 is also an
 * int multiplication.
 */
966char *rgb_to_truecolor_halfblocks_neon(
const uint8_t *rgb,
int width,
int height,
int stride_bytes) {
968 if (width <= 0 || height <= 0)
// Non-positive stride means tightly packed rows.
970 if (stride_bytes <= 0)
971 stride_bytes = width * 3;
// Capacity estimate: 14 bytes per cell plus 8 bytes per text row plus 64 bytes of slack.
975 size_t est_cells = (size_t)width * ((
size_t)(height + 1) / 2);
976 ob.cap = est_cells * 14u + (size_t)((height + 1) / 2) * 8u + 64u;
977 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
// Last emitted foreground / background colors; -1 means "none set".
982 int cur_fr = -1, cur_fg = -1, cur_fb = -1;
983 int cur_br = -1, cur_bg = -1, cur_bb = -1;
// Consume two image rows per iteration.
986 for (
int y = 0; y < height; y += 2) {
987 const uint8_t *rowT = rgb + (size_t)y * (
size_t)stride_bytes;
// rowB is NULL when the upper row is the final row of an odd-height image.
988 const uint8_t *rowB = (y + 1 < height) ? rowT + (
size_t)stride_bytes : NULL;
// Vector path: 16 columns per iteration.
991 while (x + 16 <= width) {
993 const uint8_t *pT = rowT + (size_t)x * 3u;
994 uint8x16x3_t top = vld3q_u8(pT);
998 const uint8_t *pB = rowB + (size_t)x * 3u;
// Lower-row channels copied from the upper row.
1002 bot.val[0] = top.val[0];
1003 bot.val[1] = top.val[1];
1004 bot.val[2] = top.val[2];
// Spill the channels to scalar arrays for run detection.
1008 uint8_t Rt[16], Gt[16], Bt[16], Rb[16], Gb[16], Bb[16];
1009 vst1q_u8(Rt, top.val[0]);
1010 vst1q_u8(Gt, top.val[1]);
1011 vst1q_u8(Bt, top.val[2]);
1012 vst1q_u8(Rb, bot.val[0]);
1013 vst1q_u8(Gb, bot.val[1]);
1014 vst1q_u8(Bb, bot.val[2]);
// The loop header has no increment; advancing i is left to the loop body.
1017 for (
int i = 0; i < 16;) {
1018 uint8_t rT = Rt[i], gT = Gt[i], bT = Bt[i];
1019 uint8_t rB = Rb[i], gB = Gb[i], bB = Bb[i];
// UTF-8 encoding of U+2580 (upper half block).
1022 const uint8_t glyph_utf8[3] = {0xE2, 0x96, 0x80};
// Extend the run while both the upper and lower colors repeat.
1026 for (; j < 16; ++j) {
1027 if (!(Rt[j] == rT && Gt[j] == gT && Bt[j] == bT && Rb[j] == rB && Gb[j] == gB && Bb[j] == bB))
1030 uint32_t run = (uint32_t)(j - i);
// A cell whose upper and lower pixels are both pure black is treated as transparent.
1033 bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);
1035 if (is_transparent) {
// If any color is currently set, the trackers are returned to the "none set" state.
1037 if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
1039 cur_fr = cur_fg = cur_fb = -1;
1040 cur_br = cur_bg = cur_bb = -1;
1047 for (uint32_t k = 1; k < run; ++k) {
// Foreground tracks the upper pixel, background the lower pixel; each is only acted on when it changes.
1053 if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
1059 if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
1067 ob_write(&ob, (
const char *)glyph_utf8, 3);
1071 for (uint32_t k = 1; k < run; ++k) {
1072 ob_write(&ob, (
const char *)glyph_utf8, 3);
// Scalar path for the columns left over after the 16-wide chunks.
1083 for (; x < width;) {
1084 const uint8_t *pT = rowT + (size_t)x * 3u;
1085 const uint8_t *pB = rowB ? rowB + (size_t)x * 3u : NULL;
1087 uint8_t rT = pT[0], gT = pT[1], bT = pT[2];
// Lower color defaults to the upper color.
1088 uint8_t rB = rT, gB = gT, bB = bT;
1097 for (; j < width; ++j) {
1098 const uint8_t *qT = rowT + (size_t)j * 3u;
1099 const uint8_t *qB = rowB ? rowB + (size_t)j * 3u : NULL;
1100 uint8_t rT2 = qT[0], gT2 = qT[1], bT2 = qT[2];
1101 uint8_t rB2 = qB ? qB[0] : rT2, gB2 = qB ? qB[1] : gT2, bB2 = qB ? qB[2] : bT2;
1102 if (!((rT2 == rT && gT2 == gT && bT2 == bT) && (rB2 == rB && gB2 == gB && bB2 == bB)))
1105 uint32_t run = (uint32_t)(j - x);
// Same transparency rule as the vector path.
1108 bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);
1110 if (is_transparent) {
1112 if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
1114 cur_fr = cur_fg = cur_fb = -1;
1115 cur_br = cur_bg = cur_bb = -1;
1122 for (uint32_t k = 1; k < run; ++k) {
1128 if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
1134 if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
// Same three bytes as glyph_utf8 in the vector path.
1142 static const char HB[3] = {(char)0xE2, (
char)0x96, (char)0x80};
1147 for (uint32_t k = 1; k < run; ++k) {
// More rows follow: the color trackers are cleared before the next text row.
1159 if (y + 2 < height) {
1162 cur_fr = cur_fg = cur_fb = -1;
1163 cur_br = cur_bg = cur_bb = -1;
/**
 * @brief Mirror an image left-to-right in place.
 *
 * Each row is reversed by swapping pixels from both ends toward the middle. While at least
 * two disjoint 16-pixel blocks remain, they are exchanged with NEON: each block is loaded
 * de-interleaved (R, G, B planes), every plane is byte-reversed, and the reversed block is
 * stored at the opposite end. The remaining middle pixels are swapped one at a time.
 *
 * Pixels are treated as 3 packed bytes, matching the vld3q_u8 loads used on rgb_pixel_t rows
 * elsewhere in this file.
 *
 * Fixes over the previous version:
 *  - The vector path exchanged 4-pixel blocks between the two ends without reversing the
 *    pixel order inside each block, so any row wide enough to reach it was not a true mirror.
 *  - 3-byte pixels were accessed through uint32_t pointers (alignment / aliasing hazard);
 *    all vector accesses are now byte-based.
 *  - The row offset is computed in size_t instead of int.
 *
 * @param image Image to flip. NULL images, images without pixels, and images narrower than
 *              two pixels are left untouched.
 */
void image_flip_horizontal_neon(image_t *image) {
  if (!image || !image->pixels || image->w < 2) {
    return;
  }

  const int width = image->w;

  for (int y = 0; y < image->h; y++) {
    rgb_pixel_t *row = &image->pixels[(size_t)y * (size_t)width];

    int left_pix = 0;
    int right_pix = width - 1;

    // Vector path: blocks [left, left+15] and [right-15, right] are disjoint
    // exactly when right - left >= 31.
    while (right_pix - left_pix >= 31) {
      uint8_t *left_bytes = (uint8_t *)&row[left_pix];
      uint8_t *right_bytes = (uint8_t *)&row[right_pix - 15];

      uint8x16x3_t left_block = vld3q_u8(left_bytes);
      uint8x16x3_t right_block = vld3q_u8(right_bytes);

      for (int ch = 0; ch < 3; ch++) {
        // Reverse 16 bytes: reverse within each 64-bit half, then swap the halves.
        const uint8x16_t left_half_rev = vrev64q_u8(left_block.val[ch]);
        const uint8x16_t right_half_rev = vrev64q_u8(right_block.val[ch]);
        left_block.val[ch] = vextq_u8(right_half_rev, right_half_rev, 8);
        right_block.val[ch] = vextq_u8(left_half_rev, left_half_rev, 8);
      }

      // left_block now holds the reversed right pixels and vice versa.
      vst3q_u8(left_bytes, left_block);
      vst3q_u8(right_bytes, right_block);

      left_pix += 16;
      right_pix -= 16;
    }

    // Scalar path for the pixels remaining in the middle of the row.
    while (left_pix < right_pix) {
      const rgb_pixel_t saved = row[left_pix];
      row[left_pix] = row[right_pix];
      row[right_pix] = saved;
      left_pix++;
      right_pix--;
    }
  }
}
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
global_dec3_cache_t g_dec3_cache
#define CUBE_GRAY_THRESHOLD
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
void emit_set_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
void ob_term(outbuf_t *ob)
void ob_putc(outbuf_t *ob, char c)
bool rep_is_profitable(uint32_t runlen)
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_rep(outbuf_t *ob, uint32_t extra)
void ob_write(outbuf_t *ob, const char *s, size_t n)
void emit_reset(outbuf_t *ob)
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_set_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
#define R2(v, w, x, y, z, i)
uint64_t time_get_ns(void)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)