ascii-chat 0.6.0
Real-time terminal-based video chat with ASCII art conversion
neon.c
#if SIMD_SUPPORT_NEON
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdarg.h>
#include <time.h>
#include <assert.h>
#include <stdatomic.h>
#include <math.h>

#include <arm_neon.h>

#include "common.h"
#include "neon.h"
#include "ascii_simd.h"
#include "../image.h"
#include "video/simd/common.h"
#include "video/output_buffer.h"
#include "video/ansi_fast.h"
#include "util/overflow.h"

// NEON table cache removed - performance analysis showed rebuilding (30ns) is faster than lookup (50ns)
// Tables are now built inline when needed for optimal performance

// Build NEON lookup tables inline (faster than caching - 30ns rebuild vs 50ns lookup)
static inline void build_neon_lookup_tables(utf8_palette_cache_t *utf8_cache, uint8x16x4_t *tbl, uint8x16x4_t *char_lut,
                                            uint8x16x4_t *length_lut, uint8x16x4_t *char_byte0_lut,
                                            uint8x16x4_t *char_byte1_lut, uint8x16x4_t *char_byte2_lut,
                                            uint8x16x4_t *char_byte3_lut) {
  // Build NEON-specific lookup table with cache64 indices (direct mapping)
  uint8_t cache64_indices[64];
  for (int i = 0; i < 64; i++) {
    cache64_indices[i] = (uint8_t)i; // Direct mapping: luminance bucket -> cache64 index
  }

  tbl->val[0] = vld1q_u8(&cache64_indices[0]);
  tbl->val[1] = vld1q_u8(&cache64_indices[16]);
  tbl->val[2] = vld1q_u8(&cache64_indices[32]);
  tbl->val[3] = vld1q_u8(&cache64_indices[48]);

  // Build vectorized UTF-8 lookup tables for length-aware compaction
  uint8_t ascii_chars_lut[64]; // For ASCII fast path
  uint8_t char_lengths[64];    // Character byte lengths
  uint8_t char_byte0[64];      // First byte of each character
  uint8_t char_byte1[64];      // Second byte of each character
  uint8_t char_byte2[64];      // Third byte of each character
  uint8_t char_byte3[64];      // Fourth byte of each character

  for (int i = 0; i < 64; i++) {
    const utf8_char_t *char_info = &utf8_cache->cache64[i];

    // ASCII fast path table
    ascii_chars_lut[i] = char_info->utf8_bytes[0];

    // Length-aware compaction tables
    char_lengths[i] = char_info->byte_len;
    char_byte0[i] = char_info->utf8_bytes[0];
    char_byte1[i] = char_info->byte_len > 1 ? char_info->utf8_bytes[1] : 0;
    char_byte2[i] = char_info->byte_len > 2 ? char_info->utf8_bytes[2] : 0;
    char_byte3[i] = char_info->byte_len > 3 ? char_info->utf8_bytes[3] : 0;
  }

  // Load all lookup tables into NEON registers
  char_lut->val[0] = vld1q_u8(&ascii_chars_lut[0]);
  char_lut->val[1] = vld1q_u8(&ascii_chars_lut[16]);
  char_lut->val[2] = vld1q_u8(&ascii_chars_lut[32]);
  char_lut->val[3] = vld1q_u8(&ascii_chars_lut[48]);

  length_lut->val[0] = vld1q_u8(&char_lengths[0]);
  length_lut->val[1] = vld1q_u8(&char_lengths[16]);
  length_lut->val[2] = vld1q_u8(&char_lengths[32]);
  length_lut->val[3] = vld1q_u8(&char_lengths[48]);

  char_byte0_lut->val[0] = vld1q_u8(&char_byte0[0]);
  char_byte0_lut->val[1] = vld1q_u8(&char_byte0[16]);
  char_byte0_lut->val[2] = vld1q_u8(&char_byte0[32]);
  char_byte0_lut->val[3] = vld1q_u8(&char_byte0[48]);

  char_byte1_lut->val[0] = vld1q_u8(&char_byte1[0]);
  char_byte1_lut->val[1] = vld1q_u8(&char_byte1[16]);
  char_byte1_lut->val[2] = vld1q_u8(&char_byte1[32]);
  char_byte1_lut->val[3] = vld1q_u8(&char_byte1[48]);

  char_byte2_lut->val[0] = vld1q_u8(&char_byte2[0]);
  char_byte2_lut->val[1] = vld1q_u8(&char_byte2[16]);
  char_byte2_lut->val[2] = vld1q_u8(&char_byte2[32]);
  char_byte2_lut->val[3] = vld1q_u8(&char_byte2[48]);

  char_byte3_lut->val[0] = vld1q_u8(&char_byte3[0]);
  char_byte3_lut->val[1] = vld1q_u8(&char_byte3[16]);
  char_byte3_lut->val[2] = vld1q_u8(&char_byte3[32]);
  char_byte3_lut->val[3] = vld1q_u8(&char_byte3[48]);
}

// NEON cache destruction no longer needed - tables are built inline
void neon_caches_destroy(void) {
  // No-op: NEON table cache removed for performance
  // Tables are now built inline (30ns) which is faster than cache lookup (50ns)
}

// NEON helper: Horizontal sum of 16 uint8_t values
static inline uint16_t neon_horizontal_sum_u8(uint8x16_t vec) {
  uint16x8_t sum16_lo = vpaddlq_u8(vec);
  uint32x4_t sum32 = vpaddlq_u16(sum16_lo);
  uint64x2_t sum64 = vpaddlq_u32(sum32);
  return (uint16_t)(vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1));
}
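
// Illustrative note (not part of the pipeline): the reduction above walks
//   16 x u8 --vpaddlq_u8--> 8 x u16 --vpaddlq_u16--> 4 x u32 --vpaddlq_u32--> 2 x u64,
// then adds the two u64 lanes. For a vector of all 3s the result is 16 * 3 = 48;
// the worst case, 16 * 255 = 4080, still fits comfortably in uint16_t.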

// NEON-optimized RLE detection: find run length for char+color pairs
static inline int find_rle_run_length_neon(const uint8_t *char_buf, const uint8_t *color_buf, int start_pos,
                                           int max_len, uint8_t target_char, uint8_t target_color) {
  int run_length = 1; // At least the starting position

  // Use NEON to check multiple elements at once when possible
  int remaining = max_len - start_pos - 1;
  if (remaining <= 0)
    return 1;

  const uint8_t *char_ptr = &char_buf[start_pos + 1];
  const uint8_t *color_ptr = &color_buf[start_pos + 1];

  // Process in chunks of 16 for full NEON utilization
  while (remaining >= 16) {
    uint8x16_t chars = vld1q_u8(char_ptr);
    uint8x16_t colors = vld1q_u8(color_ptr);

    uint8x16_t char_match = vceqq_u8(chars, vdupq_n_u8(target_char));
    uint8x16_t color_match = vceqq_u8(colors, vdupq_n_u8(target_color));
    uint8x16_t both_match = vandq_u8(char_match, color_match);

    // Use NEON min to detect a full-chunk match efficiently:
    // if all lanes match the minimum is 0xFF, otherwise it is 0x00
    uint8_t min_match = vminvq_u8(both_match);

    if (min_match == 0xFF) {
      // All 16 elements match
      run_length += 16;
      char_ptr += 16;
      color_ptr += 16;
      remaining -= 16;
    } else {
      // Find first mismatch position by scanning the match mask bytewise
      uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 0);
      uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 1);

      int matches_found = 0;
      // Check low 8 bytes first
      for (int i = 0; i < 8; i++) {
        if ((mask_lo >> (i * 8)) & 0xFF) {
          matches_found++;
        } else {
          break;
        }
      }

      // If all low 8 matched, check high 8 bytes
      if (matches_found == 8) {
        for (int i = 0; i < 8; i++) {
          if ((mask_hi >> (i * 8)) & 0xFF) {
            matches_found++;
          } else {
            break;
          }
        }
      }

      run_length += matches_found;
      // Return here: the run ends inside this chunk. Falling through to the
      // scalar tail would recount the leading matches, since char_ptr and
      // color_ptr were not advanced past them.
      return run_length;
    }
  }

  // Handle remaining elements with scalar loop
  while (remaining > 0 && *char_ptr == target_char && *color_ptr == target_color) {
    run_length++;
    char_ptr++;
    color_ptr++;
    remaining--;
  }

  return run_length;
}
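
// Worked example (illustrative): within a 16-lane chunk, if the three lanes
// after start_pos match (target_char, target_color) and the fourth differs,
// matches_found = 3 and the function returns 1 + 3 = 4 - the starting cell
// plus three extensions.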

// NEON helper: Check if all characters have same length
static inline bool all_same_length_neon(uint8x16_t lengths, uint8_t *out_length) {
  uint8_t first_len = vgetq_lane_u8(lengths, 0);
  uint8x16_t first_len_vec = vdupq_n_u8(first_len);
  uint8x16_t all_same = vceqq_u8(lengths, first_len_vec);

  uint64x2_t all_same_64 = vreinterpretq_u64_u8(all_same);
  uint64_t combined = vgetq_lane_u64(all_same_64, 0) & vgetq_lane_u64(all_same_64, 1);

  if (combined == UINT64_MAX) {
    *out_length = first_len;
    return true;
  }
  return false;
}

// ============================================================================
// Vectorized Decimal Lookup Functions for NEON Color Performance
// ============================================================================

// NEON TBL lookup tables for decimal conversion (256 entries each)
// Format: each entry has length byte + up to 3 decimal chars (4 bytes per entry)
static uint8_t neon_decimal_table_data[256 * 4]; // 1024 bytes: [len][d1][d2][d3] per entry
static bool neon_decimal_table_initialized = false;

// Initialize NEON TBL decimal lookup table (called once at startup)
void init_neon_decimal_table(void) {
  if (neon_decimal_table_initialized)
    return;

  // Initialize g_dec3_cache first so the dec3 entries read below are valid
  init_dec3();

  // Convert dec3_t cache to NEON TBL format: [len][d1][d2][d3] per 4-byte entry
  for (int i = 0; i < 256; i++) {
    const dec3_t *dec = &g_dec3_cache.dec3_table[i];
    uint8_t *entry = &neon_decimal_table_data[i * 4];
    entry[0] = dec->len;                          // Length (1-3)
    entry[1] = (dec->len >= 1) ? dec->s[0] : '0'; // First digit
    entry[2] = (dec->len >= 2) ? dec->s[1] : '0'; // Second digit
    entry[3] = (dec->len >= 3) ? dec->s[2] : '0'; // Third digit
  }

  neon_decimal_table_initialized = true;
}
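
// Illustrative entries (assuming the usual decimal rendering of 0-255):
//   value   7 -> { 1, '7', '0', '0' }   (len 1, '0'-padded)
//   value  42 -> { 2, '4', '2', '0' }   (len 2)
//   value 255 -> { 3, '2', '5', '5' }   (len 3)
// Consumers copy entry[1..entry[0]] bytes, so the padding is never emitted.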

// TODO: Implement true NEON vectorized ANSI sequence generation using TBL + compaction
// Following the monochrome pattern: pad sequences to uniform width, then compact null bytes
// For now, keep the existing scalar approach to avoid breaking the build

// ANSI truecolor sequence assembly: scalar emission accelerated by the precomputed decimal table
static inline size_t neon_assemble_truecolor_sequences_true_simd(uint8x16_t char_indices, uint8x16_t r_vals,
                                                                 uint8x16_t g_vals, uint8x16_t b_vals,
                                                                 utf8_palette_cache_t *utf8_cache, char *output_buffer,
                                                                 size_t buffer_capacity, bool use_background) {
  // STREAMLINED IMPLEMENTATION: Focus on the real bottleneck - RGB->decimal conversion
  // Key insight: ANSI sequences are too variable for effective SIMD, but table lookups provide a major speedup

  // Ensure NEON decimal table is initialized for fast RGB->decimal conversion
  init_neon_decimal_table();

  char *dst = output_buffer;

  // Extract values for optimized scalar processing with SIMD-derived indices
  uint8_t char_idx_buf[16], r_buf[16], g_buf[16], b_buf[16];
  vst1q_u8(char_idx_buf, char_indices);
  vst1q_u8(r_buf, r_vals);
  vst1q_u8(g_buf, g_vals);
  vst1q_u8(b_buf, b_vals);

  size_t total_written = 0;
  const char *prefix = use_background ? "\033[48;2;" : "\033[38;2;";
  const size_t prefix_len = 7;

  // Optimized scalar loop with table-based RGB->decimal conversion
  // This eliminates the expensive snprintf() calls which were the real bottleneck
  for (int i = 0; i < 16; i++) {
    // Precomputed decimal-table entries for the RGB components (major speedup!)
    const uint8_t *r_entry = &neon_decimal_table_data[r_buf[i] * 4];
    const uint8_t *g_entry = &neon_decimal_table_data[g_buf[i] * 4];
    const uint8_t *b_entry = &neon_decimal_table_data[b_buf[i] * 4];

    const uint8_t char_idx = char_idx_buf[i];
    const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];

    // Calculate total sequence length for buffer safety
    size_t seq_len = prefix_len + r_entry[0] + 1 + g_entry[0] + 1 + b_entry[0] + 1 + char_info->byte_len;
    if (seq_len > buffer_capacity - total_written) {
      break; // Buffer safety (phrased to avoid unsigned underflow when capacity is small)
    }

    // Optimized assembly using table results (no divisions, no snprintf!)
    memcpy(dst, prefix, prefix_len);
    dst += prefix_len;

    // RGB components using pre-computed decimal strings
    memcpy(dst, &r_entry[1], r_entry[0]);
    dst += r_entry[0];
    *dst++ = ';';

    memcpy(dst, &g_entry[1], g_entry[0]);
    dst += g_entry[0];
    *dst++ = ';';

    memcpy(dst, &b_entry[1], b_entry[0]);
    dst += b_entry[0];
    *dst++ = 'm';

    // UTF-8 character from cache
    memcpy(dst, char_info->utf8_bytes, char_info->byte_len);
    dst += char_info->byte_len;

    total_written = dst - output_buffer;
  }

  return total_written;
}
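
// Illustrative byte layout: for (r,g,b) = (255, 128, 0) and the 1-byte glyph '#',
// the emitted foreground sequence is "\033[38;2;255;128;0m#":
//   7 (prefix) + 3 + 1 + 3 + 1 + 1 + 1 ('m') + 1 (glyph) = 18 bytes.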

// Min-heap management removed - no longer needed without NEON table cache

// Eviction logic removed - no longer needed without NEON table cache

// Continue to actual NEON functions (helper functions already defined above)

// NEON helper: bulk store of padded UTF-8 data. NOTE: this does not remove
// interleaved NUL padding; it is only correct when the valid bytes are already
// contiguous at the front of padded_data (e.g. uniform 4-byte characters).
// It stores whole 16-byte chunks, relies on output over-allocation for the
// tail, and advances *pos by the exact number of valid bytes. Currently unused.
static inline void __attribute__((unused)) compact_utf8_vectorized(uint8_t *padded_data, uint8x16_t lengths,
                                                                   char **pos) {
  // Calculate total valid bytes using NEON horizontal sum
  uint8_t total_bytes = (uint8_t)neon_horizontal_sum_u8(lengths);

  // Load the padded 64-byte block in four 16-byte chunks
  uint8x16_t chunk1 = vld1q_u8(&padded_data[0]);
  uint8x16_t chunk2 = vld1q_u8(&padded_data[16]);
  uint8x16_t chunk3 = vld1q_u8(&padded_data[32]);
  uint8x16_t chunk4 = vld1q_u8(&padded_data[48]);

  // Store only as many chunks as needed to cover total_bytes; anything written
  // past total_bytes is scratch that later writes will overwrite
  if (total_bytes <= 16) {
    vst1q_u8((uint8_t *)*pos, chunk1);
  } else if (total_bytes <= 32) {
    vst1q_u8((uint8_t *)*pos, chunk1);
    vst1q_u8((uint8_t *)*pos + 16, chunk2);
  } else if (total_bytes <= 48) {
    vst1q_u8((uint8_t *)*pos, chunk1);
    vst1q_u8((uint8_t *)*pos + 16, chunk2);
    vst1q_u8((uint8_t *)*pos + 32, chunk3);
  } else {
    vst1q_u8((uint8_t *)*pos, chunk1);
    vst1q_u8((uint8_t *)*pos + 16, chunk2);
    vst1q_u8((uint8_t *)*pos + 32, chunk3);
    vst1q_u8((uint8_t *)*pos + 48, chunk4);
  }

  *pos += total_bytes;
}

// Definitions are in ascii_simd.h - just use them
// REMOVED: #define luminance_palette g_ascii_cache.luminance_palette (causes macro expansion issues)

// ------------------------------------------------------------
// Map luminance [0..255] → 4-bit index [0..15] using top nibble
static inline uint8x16_t __attribute__((unused)) luma_to_idx_nibble_neon(uint8x16_t y) {
  return vshrq_n_u8(y, 4);
}

// SIMD luma and helpers:

// SIMD luminance: Y = (77R + 150G + 29B) >> 8
static inline uint8x16_t simd_luma_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b) {
  uint16x8_t rl = vmovl_u8(vget_low_u8(r));
  uint16x8_t rh = vmovl_u8(vget_high_u8(r));
  uint16x8_t gl = vmovl_u8(vget_low_u8(g));
  uint16x8_t gh = vmovl_u8(vget_high_u8(g));
  uint16x8_t bl = vmovl_u8(vget_low_u8(b));
  uint16x8_t bh = vmovl_u8(vget_high_u8(b));

  uint32x4_t l0 = vmull_n_u16(vget_low_u16(rl), LUMA_RED);
  uint32x4_t l1 = vmull_n_u16(vget_high_u16(rl), LUMA_RED);
  l0 = vmlal_n_u16(l0, vget_low_u16(gl), LUMA_GREEN);
  l1 = vmlal_n_u16(l1, vget_high_u16(gl), LUMA_GREEN);
  l0 = vmlal_n_u16(l0, vget_low_u16(bl), LUMA_BLUE);
  l1 = vmlal_n_u16(l1, vget_high_u16(bl), LUMA_BLUE);

  uint32x4_t h0 = vmull_n_u16(vget_low_u16(rh), LUMA_RED);
  uint32x4_t h1 = vmull_n_u16(vget_high_u16(rh), LUMA_RED);
  h0 = vmlal_n_u16(h0, vget_low_u16(gh), LUMA_GREEN);
  h1 = vmlal_n_u16(h1, vget_high_u16(gh), LUMA_GREEN);
  h0 = vmlal_n_u16(h0, vget_low_u16(bh), LUMA_BLUE);
  h1 = vmlal_n_u16(h1, vget_high_u16(bh), LUMA_BLUE);

  uint16x8_t l = vcombine_u16(vrshrn_n_u32(l0, 8), vrshrn_n_u32(l1, 8));
  uint16x8_t h = vcombine_u16(vrshrn_n_u32(h0, 8), vrshrn_n_u32(h1, 8));
  return vcombine_u8(vqmovn_u16(l), vqmovn_u16(h));
}
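
// Minimal scalar sketch of the same formula, kept for reference only (assumes
// the LUMA_* coefficients 77/150/29 from ascii_simd.h). Note the vector version
// above rounds via vrshrn_n_u32; this plain shift truncates.
static inline uint8_t __attribute__((unused)) luma_scalar_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((LUMA_RED * (uint32_t)r + LUMA_GREEN * (uint32_t)g + LUMA_BLUE * (uint32_t)b) >> 8);
}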

// ===== SIMD helpers for 256-color quantization =====

// NEON: cr = (r*5 + 127) / 255 (nearest of 0..5)
static inline uint8x16_t __attribute__((unused)) quant6_neon(uint8x16_t x) {
  uint16x8_t xl = vmovl_u8(vget_low_u8(x));
  uint16x8_t xh = vmovl_u8(vget_high_u8(x));
  uint16x8_t tl = vaddq_u16(vmulq_n_u16(xl, 5), vdupq_n_u16(127));
  uint16x8_t th = vaddq_u16(vmulq_n_u16(xh, 5), vdupq_n_u16(127));
  uint32x4_t tl0 = vmull_n_u16(vget_low_u16(tl), 257);
  uint32x4_t tl1 = vmull_n_u16(vget_high_u16(tl), 257);
  uint32x4_t th0 = vmull_n_u16(vget_low_u16(th), 257);
  uint32x4_t th1 = vmull_n_u16(vget_high_u16(th), 257);
  uint16x8_t ql = vcombine_u16(vshrn_n_u32(tl0, 16), vshrn_n_u32(tl1, 16));
  uint16x8_t qh = vcombine_u16(vshrn_n_u32(th0, 16), vshrn_n_u32(th1, 16));
  return vcombine_u8(vqmovn_u16(ql), vqmovn_u16(qh)); // 0..5
}
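
// Worked example (illustrative): the *257 >> 16 step approximates /255 because
// 257/65536 = 1/255.004. For x = 128: t = 128*5 + 127 = 767; 767*257 = 197119;
// 197119 >> 16 = 3, which matches round(128 * 5 / 255) = round(2.51) = 3.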

// Build 6x6x6 index: cr*36 + cg*6 + cb (0..215)
static inline uint8x16_t __attribute__((unused)) cube216_index_neon(uint8x16_t r6, uint8x16_t g6, uint8x16_t b6) {
  uint16x8_t rl = vmovl_u8(vget_low_u8(r6));
  uint16x8_t rh = vmovl_u8(vget_high_u8(r6));
  uint16x8_t gl = vmovl_u8(vget_low_u8(g6));
  uint16x8_t gh = vmovl_u8(vget_high_u8(g6));
  uint16x8_t bl = vmovl_u8(vget_low_u8(b6));
  uint16x8_t bh = vmovl_u8(vget_high_u8(b6));
  uint16x8_t il = vmlaq_n_u16(vmlaq_n_u16(vmulq_n_u16(rl, 36), gl, 6), bl, 1);
  uint16x8_t ih = vmlaq_n_u16(vmlaq_n_u16(vmulq_n_u16(rh, 36), gh, 6), bh, 1);
  return vcombine_u8(vqmovn_u16(il), vqmovn_u16(ih)); // 0..215
}

// Approximate quantize 0..255 -> 0..5 : q ≈ round(x*5/255) ≈ (x*5 + 128) >> 8
static inline uint8x16_t q6_from_u8(uint8x16_t x) {
  uint16x8_t xl = vmovl_u8(vget_low_u8(x));
  uint16x8_t xh = vmovl_u8(vget_high_u8(x));
  xl = vmulq_n_u16(xl, 5);
  xh = vmulq_n_u16(xh, 5);
  xl = vaddq_u16(xl, vdupq_n_u16(128));
  xh = vaddq_u16(xh, vdupq_n_u16(128));
  xl = vshrq_n_u16(xl, 8);
  xh = vshrq_n_u16(xh, 8);
  return vcombine_u8(vqmovn_u16(xl), vqmovn_u16(xh)); // 0..5
}
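
// Endpoint check (illustrative): x = 255 gives (1275 + 128) >> 8 = 1403 >> 8 = 5,
// and x = 0 gives 0, so the cheaper /256 approximation still covers 0..5 exactly
// at the range endpoints.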

// Make 256-color index (cube vs gray). threshold: max-min < thr ⇒ gray
#ifndef CUBE_GRAY_THRESHOLD
#define CUBE_GRAY_THRESHOLD 10
#endif

// Apply ordered dithering to reduce color variations (creates longer runs)
static inline uint8x16_t apply_ordered_dither(uint8x16_t color, int pixel_offset, uint8_t dither_strength) {
  // Bayer 4x4 dithering matrix (classic ordered dithering pattern)
  static const uint8_t bayer4x4[16] = {0, 8, 2, 10, 12, 4, 14, 6, 3, 11, 1, 9, 15, 7, 13, 5};

  // Load dithering matrix into NEON register
  const uint8x16_t dither_matrix = vld1q_u8(bayer4x4);

  // Create pixel position indices for 16 consecutive pixels
  uint8_t pos_indices[16];
  for (int i = 0; i < 16; i++) {
    pos_indices[i] = (pixel_offset + i) & 15; // Wrap to 4x4 matrix (0-15)
  }
  const uint8x16_t position_vec = vld1q_u8(pos_indices);

  // Lookup dither values for each pixel position using table lookup
  uint8x16_t dither_values = vqtbl1q_u8(dither_matrix, position_vec);

  // Scale dither values by strength (0-255 range)
  // dither_strength controls how much dithering to apply
  uint16x8_t dither_lo = vmulq_n_u16(vmovl_u8(vget_low_u8(dither_values)), dither_strength);
  uint16x8_t dither_hi = vmulq_n_u16(vmovl_u8(vget_high_u8(dither_values)), dither_strength);
  dither_lo = vshrq_n_u16(dither_lo, 4); // Scale down (/16)
  dither_hi = vshrq_n_u16(dither_hi, 4);
  uint8x16_t scaled_dither = vcombine_u8(vqmovn_u16(dither_lo), vqmovn_u16(dither_hi));

  // Apply dithering with saturation to prevent overflow
  return vqaddq_u8(color, scaled_dither);
}
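
// Worked example (illustrative): with dither_strength = 16, a lane whose Bayer
// value is 15 gets (15 * 16) >> 4 = 15 added (saturating), i.e. the raw 0..15
// Bayer pattern; strength 0 makes the whole function a no-op, which is how the
// speed mode below disables dithering without a separate code path.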

uint8x16_t palette256_index_dithered_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b, int pixel_offset) {
  // Dithering disabled in speed mode (strength 0 makes these calls no-ops)
  r = apply_ordered_dither(r, pixel_offset, 0);
  g = apply_ordered_dither(g, pixel_offset + 1, 0);
  b = apply_ordered_dither(b, pixel_offset + 2, 0);

  // cube index
  uint8x16_t R6 = q6_from_u8(r);
  uint8x16_t G6 = q6_from_u8(g);
  uint8x16_t B6 = q6_from_u8(b);

  // idx_cube = 16 + R6*36 + G6*6 + B6 (do in 16-bit to avoid overflow)
  uint16x8_t R6l = vmovl_u8(vget_low_u8(R6));
  uint16x8_t R6h = vmovl_u8(vget_high_u8(R6));
  uint16x8_t G6l = vmovl_u8(vget_low_u8(G6));
  uint16x8_t G6h = vmovl_u8(vget_high_u8(G6));
  uint16x8_t B6l = vmovl_u8(vget_low_u8(B6));
  uint16x8_t B6h = vmovl_u8(vget_high_u8(B6));

  uint16x8_t idxl = vmlaq_n_u16(vmulq_n_u16(R6l, 36), G6l, 6);
  uint16x8_t idxh = vmlaq_n_u16(vmulq_n_u16(R6h, 36), G6h, 6);
  idxl = vaddq_u16(idxl, B6l);
  idxh = vaddq_u16(idxh, B6h);
  idxl = vaddq_u16(idxl, vdupq_n_u16(16));
  idxh = vaddq_u16(idxh, vdupq_n_u16(16));

  // gray decision: max-min < thr ?
  uint8x16_t maxrg = vmaxq_u8(r, g);
  uint8x16_t minrg = vminq_u8(r, g);
  uint8x16_t maxrgb = vmaxq_u8(maxrg, b);
  uint8x16_t minrgb = vminq_u8(minrg, b);
  uint8x16_t diff = vsubq_u8(maxrgb, minrgb);
  uint8x16_t thr = vdupq_n_u8((uint8_t)CUBE_GRAY_THRESHOLD);
  uint8x16_t is_gray = vcltq_u8(diff, thr);

  // gray idx = 232 + round(Y*23/255)
  uint8x16_t Y = simd_luma_neon(r, g, b);
  // q23 ≈ round(Y*23/255) ≈ (Y*23 + 128) >> 8
  uint16x8_t Yl = vmovl_u8(vget_low_u8(Y));
  uint16x8_t Yh = vmovl_u8(vget_high_u8(Y));
  Yl = vmulq_n_u16(Yl, 23);
  Yh = vmulq_n_u16(Yh, 23);
  Yl = vaddq_u16(Yl, vdupq_n_u16(128));
  Yh = vaddq_u16(Yh, vdupq_n_u16(128));
  Yl = vshrq_n_u16(Yl, 8);
  Yh = vshrq_n_u16(Yh, 8);
  uint16x8_t gidxl = vaddq_u16(Yl, vdupq_n_u16(232));
  uint16x8_t gidxh = vaddq_u16(Yh, vdupq_n_u16(232));

  // select gray or cube per lane
  uint8x16_t idx_cube = vcombine_u8(vqmovn_u16(idxl), vqmovn_u16(idxh));
  uint8x16_t idx_gray = vcombine_u8(vqmovn_u16(gidxl), vqmovn_u16(gidxh));
  return vbslq_u8(is_gray, idx_gray, idx_cube);
}
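
// Worked example (illustrative): a mid-gray pixel (128,128,128) has max-min = 0,
// which is below CUBE_GRAY_THRESHOLD, so the gray ramp is chosen: Y = 128 and
// (128*23 + 128) >> 8 = 3072 >> 8 = 12, giving palette index 232 + 12 = 244.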

//=============================================================================
// Simple Monochrome ASCII Function (matches scalar image_print performance)
//=============================================================================

char *render_ascii_image_monochrome_neon(const image_t *image, const char *ascii_chars) {
  if (!image || !image->pixels || !ascii_chars) {
    return NULL;
  }

  const int h = image->h;
  const int w = image->w;

  if (h <= 0 || w <= 0) {
    return NULL;
  }

  // Get cached UTF-8 character mappings
  utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
  if (!utf8_cache) {
    log_error("Failed to get UTF-8 palette cache");
    return NULL;
  }

  // Build NEON lookup tables inline (faster than caching - 30ns rebuild vs 50ns lookup)
  uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
  build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
                           &char_byte3_lut);

  // Estimate output buffer size for UTF-8 characters
  const size_t max_char_bytes = 4; // Max UTF-8 character size

  // Calculate buffer size with overflow checking
  size_t w_times_bytes;
  if (checked_size_mul((size_t)w, max_char_bytes, &w_times_bytes) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: width too large for UTF-8 encoding");
    return NULL;
  }

  size_t w_times_bytes_plus_one;
  if (checked_size_add(w_times_bytes, 1, &w_times_bytes_plus_one) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: width * bytes + 1 overflow");
    return NULL;
  }

  size_t len;
  if (checked_size_mul((size_t)h, w_times_bytes_plus_one, &len) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: height * (width * bytes + 1) overflow");
    return NULL;
  }
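
  // Sizing example (illustrative): an 80x24 frame of worst-case 4-byte UTF-8
  // characters needs len = 24 * (80*4 + 1) = 24 * 321 = 7704 bytes, where the
  // "+1" per row covers the newline (the final row's slot holds the NUL instead).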

  // Use SIMD-aligned allocation for optimal vectorized write performance
  char *output = SAFE_MALLOC_SIMD(len, char *);
  if (output == NULL) {
    return NULL; // SAFE_MALLOC_SIMD already called FATAL, but satisfy analyzer
  }

  char *pos = output;
  const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;

  // NEON main loop; leftover pixels fall through to the scalar tail below
  for (int y = 0; y < h; y++) {
    const rgb_pixel_t *row = &pixels[y * w];
    int x = 0;

    // Process 16 pixels at a time with NEON
    for (; x + 15 < w; x += 16) {
      // Load 16 RGB pixels (48 bytes)
      uint8x16x3_t rgb = vld3q_u8((const uint8_t *)(row + x));

      // Calculate luminance for all 16 pixels: (77*R + 150*G + 29*B + 128) >> 8
      uint16x8_t luma_lo = vmull_u8(vget_low_u8(rgb.val[0]), vdup_n_u8(LUMA_RED));  // R * 77
      luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[1]), vdup_n_u8(LUMA_GREEN));  // + G * 150
      luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[2]), vdup_n_u8(LUMA_BLUE));   // + B * 29
      luma_lo = vaddq_u16(luma_lo, vdupq_n_u16(128));                               // + 128 (rounding)
      luma_lo = vshrq_n_u16(luma_lo, 8);                                            // >> 8

      uint16x8_t luma_hi = vmull_u8(vget_high_u8(rgb.val[0]), vdup_n_u8(LUMA_RED));
      luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[1]), vdup_n_u8(LUMA_GREEN));
      luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[2]), vdup_n_u8(LUMA_BLUE));
      luma_hi = vaddq_u16(luma_hi, vdupq_n_u16(128));
      luma_hi = vshrq_n_u16(luma_hi, 8);

      // Convert 16-bit luminance back to 8-bit
      uint8x16_t luminance = vcombine_u8(vmovn_u16(luma_lo), vmovn_u16(luma_hi));

      // NEON optimization: Use vqtbl4q_u8 for fast character index lookup
      // Convert luminance (0-255) to 6-bit bucket (0-63) to match scalar behavior
      uint8x16_t luma_buckets = vshrq_n_u8(luminance, 2);      // >> 2 to get 0-63 range
      uint8x16_t char_indices = vqtbl4q_u8(tbl, luma_buckets); // 16 lookups in 1 instruction!

      // VECTORIZED UTF-8 CHARACTER GENERATION: Length-aware compaction

      // Step 1: Get character lengths vectorially
      uint8x16_t char_lengths = vqtbl4q_u8(length_lut, char_indices);

      // Step 2: Check if all characters have same length (vectorized check)
      uint8_t uniform_length;
      if (all_same_length_neon(char_lengths, &uniform_length)) {
        if (uniform_length == 1) {
          // PURE ASCII PATH: 16 characters = 16 bytes (maximum vectorization)
          uint8x16_t ascii_output = vqtbl4q_u8(char_lut, char_indices);
          vst1q_u8((uint8_t *)pos, ascii_output);
          pos += 16;
        } else if (uniform_length == 4) {
          // PURE 4-BYTE UTF-8 PATH: 16 characters = 64 bytes
          // Gather all 4 byte streams in parallel
          uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
          uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
          uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
          uint8x16_t byte3_stream = vqtbl4q_u8(char_byte3_lut, char_indices);

          // Interleave bytes: [char0_byte0, char0_byte1, char0_byte2, char0_byte3, char1_byte0, ...]
          uint8x16x4_t interleaved;
          interleaved.val[0] = byte0_stream;
          interleaved.val[1] = byte1_stream;
          interleaved.val[2] = byte2_stream;
          interleaved.val[3] = byte3_stream;

          // Store interleaved UTF-8 data: 64 bytes total
          vst4q_u8((uint8_t *)pos, interleaved);
          pos += 64;
        } else if (uniform_length == 2) {
          // PURE 2-BYTE UTF-8 PATH: 16 characters = 32 bytes (vectorized)
          uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
          uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);

          // Interleave: [char0_b0, char0_b1, char1_b0, char1_b1, ...]
          uint8x16x2_t interleaved_2byte;
          interleaved_2byte.val[0] = byte0_stream;
          interleaved_2byte.val[1] = byte1_stream;

          vst2q_u8((uint8_t *)pos, interleaved_2byte);
          pos += 32;
        } else if (uniform_length == 3) {
          // PURE 3-BYTE UTF-8 PATH: 16 characters = 48 bytes (vectorized)
          uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
          uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
          uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);

          // Interleave: [char0_b0, char0_b1, char0_b2, char1_b0, char1_b1, char1_b2, ...]
          uint8x16x3_t interleaved_3byte;
          interleaved_3byte.val[0] = byte0_stream;
          interleaved_3byte.val[1] = byte1_stream;
          interleaved_3byte.val[2] = byte2_stream;

          vst3q_u8((uint8_t *)pos, interleaved_3byte);
          pos += 48;
        }
      } else {
        // MIXED LENGTH PATH: SIMD gather, scalar compaction
        // Use vqtbl4q_u8 to gather UTF-8 bytes in 4 passes, then compact with fast scalar

        // Gather all UTF-8 bytes using existing lookup tables with shuffle masks
        uint8x16_t byte0_vec = vqtbl4q_u8(char_byte0_lut, char_indices);
        uint8x16_t byte1_vec = vqtbl4q_u8(char_byte1_lut, char_indices);
        uint8x16_t byte2_vec = vqtbl4q_u8(char_byte2_lut, char_indices);
        uint8x16_t byte3_vec = vqtbl4q_u8(char_byte3_lut, char_indices);

        // Store gathered bytes to temporary buffers
        uint8_t byte0_buf[16], byte1_buf[16], byte2_buf[16], byte3_buf[16];
        vst1q_u8(byte0_buf, byte0_vec);
        vst1q_u8(byte1_buf, byte1_vec);
        vst1q_u8(byte2_buf, byte2_vec);
        vst1q_u8(byte3_buf, byte3_vec);

        // Fast scalar compaction: emit only valid bytes based on character lengths
        // Store char_indices to buffer for lookup
        uint8_t char_idx_buf[16];
        vst1q_u8(char_idx_buf, char_indices);

        for (int i = 0; i < 16; i++) {
          const uint8_t char_idx = char_idx_buf[i];
          const uint8_t byte_len = utf8_cache->cache64[char_idx].byte_len;

          // Emit bytes based on character length (1-4 bytes)
          *pos++ = byte0_buf[i];
          if (byte_len > 1)
            *pos++ = byte1_buf[i];
          if (byte_len > 2)
            *pos++ = byte2_buf[i];
          if (byte_len > 3)
            *pos++ = byte3_buf[i];
        }
      }
    }

    // Handle remaining pixels with optimized scalar code using 64-entry cache
    for (; x < w; x++) {
      const rgb_pixel_t pixel = row[x];
      const uint8_t luminance = (LUMA_RED * pixel.r + LUMA_GREEN * pixel.g + LUMA_BLUE * pixel.b + 128) >> 8;
      const uint8_t luma_idx = luminance >> 2;                       // Map 0..255 to 0..63 (same as NEON)
      const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx]; // Direct cache64 access
      // Optimized: Use direct assignment for single-byte ASCII characters
      if (char_info->byte_len == 1) {
        *pos++ = char_info->utf8_bytes[0];
      } else {
        // Fallback to full memcpy for multi-byte UTF-8
        memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
        pos += char_info->byte_len;
      }
    }

    // Add newline (except for last row)
    if (y < h - 1) {
      *pos++ = '\n';
    }
  }

  // Null terminate
  *pos = '\0';

  return output;
}

//=============================================================================
// Optimized NEON Color Converter (based on ChatGPT reference)
//=============================================================================

// Unified optimized NEON converter (foreground/background + 256-color/truecolor)
char *render_ascii_neon_unified_optimized(const image_t *image, bool use_background, bool use_256color,
                                          const char *ascii_chars) {
  if (!image || !image->pixels) {
    return NULL;
  }

  const int width = image->w;
  const int height = image->h;

  if (width <= 0 || height <= 0) {
    char *empty = SAFE_MALLOC(1, char *);
    empty[0] = '\0';
    return empty;
  }

  outbuf_t ob = {0};
  // Estimate buffer size based on mode
  size_t bytes_per_pixel = use_256color ? 6u : 8u; // 256-color sequences are shorter than truecolor

  // Calculate buffer size with overflow checking
  size_t height_times_width;
  if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: height * width overflow");
    return NULL;
  }

  size_t pixel_data_size;
  if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
    return NULL;
  }

  size_t height_times_16;
  if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: height * 16 overflow");
    return NULL;
  }

  size_t temp;
  if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: pixel_data + height*16 overflow");
    return NULL;
  }

  if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
    log_error("Buffer size overflow: total capacity overflow");
    return NULL;
  }
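
  // Capacity example (illustrative): an 80x24 truecolor frame starts with
  // ob.cap = 1920*8 + 24*16 + 64 = 15808 bytes; outbuf_t auto-expands on
  // ob_write, so this is an estimate rather than a hard limit.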

  ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
  if (!ob.buf)
    return NULL;

  // Get cached UTF-8 character mappings (like monochrome function does)
  utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
  if (!utf8_cache) {
    log_error("Failed to get UTF-8 palette cache for NEON color");
    free(ob.buf); // Don't leak the output buffer on this error path
    return NULL;
  }

  // Build NEON lookup table inline (faster than caching - 30ns rebuild vs 50ns lookup)
  uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
  build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
                           &char_byte3_lut);

  // Suppress unused variable warnings for color mode
  (void)char_lut;
  (void)length_lut;
  (void)char_byte0_lut;
  (void)char_byte1_lut;
  (void)char_byte2_lut;
  (void)char_byte3_lut;

  // Track current color state
  int curR = -1, curG = -1, curB = -1;
  int cur_color_idx = -1;

  for (int y = 0; y < height; y++) {
    const rgb_pixel_t *row = &((const rgb_pixel_t *)image->pixels)[y * width];
    int x = 0;

    // Process 16-pixel chunks with NEON
    while (x + 16 <= width) {
      // Load 16 pixels: R,G,B interleaved
      const uint8_t *p = (const uint8_t *)(row + x);
      uint8x16x3_t pix = vld3q_u8(p); // 48 bytes

      // Vector luminance: Y ≈ (77*R + 150*G + 29*B + 128) >> 8
      uint16x8_t ylo = vmull_u8(vget_low_u8(pix.val[0]), vdup_n_u8(LUMA_RED));
      ylo = vmlal_u8(ylo, vget_low_u8(pix.val[1]), vdup_n_u8(LUMA_GREEN));
      ylo = vmlal_u8(ylo, vget_low_u8(pix.val[2]), vdup_n_u8(LUMA_BLUE));
      ylo = vaddq_u16(ylo, vdupq_n_u16(LUMA_THRESHOLD));
      ylo = vshrq_n_u16(ylo, 8);

      uint16x8_t yhi = vmull_u8(vget_high_u8(pix.val[0]), vdup_n_u8(LUMA_RED));
      yhi = vmlal_u8(yhi, vget_high_u8(pix.val[1]), vdup_n_u8(LUMA_GREEN));
      yhi = vmlal_u8(yhi, vget_high_u8(pix.val[2]), vdup_n_u8(LUMA_BLUE));
      yhi = vaddq_u16(yhi, vdupq_n_u16(LUMA_THRESHOLD));
      yhi = vshrq_n_u16(yhi, 8);

      uint8x16_t y8 = vcombine_u8(vmovn_u16(ylo), vmovn_u16(yhi));
      uint8x16_t idx = vshrq_n_u8(y8, 2); // 0..63

      // FAST: Use vqtbl4q_u8 to get character indices from the ramp
      uint8x16_t char_indices = vqtbl4q_u8(tbl, idx);

      if (use_256color) {
        // 256-color mode: VECTORIZED color quantization
        uint8_t char_idx_buf[16], color_indices[16];
        vst1q_u8(char_idx_buf, char_indices); // Character indices from SIMD lookup

        // VECTORIZED: Use existing optimized 256-color quantization
        uint8x16_t color_indices_vec = palette256_index_dithered_neon(pix.val[0], pix.val[1], pix.val[2], x);
        vst1q_u8(color_indices, color_indices_vec);

        // Emit with RLE on (UTF-8 character, color) runs using SIMD-derived indices
        for (int i = 0; i < 16;) {
          const uint8_t char_idx = char_idx_buf[i]; // From vqtbl4q_u8 lookup
          const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
          const uint8_t color_idx = color_indices[i];

          // NEON-optimized RLE detection
          const uint32_t run =
              (uint32_t)find_rle_run_length_neon(char_idx_buf, color_indices, i, 16, char_idx, color_idx);

          if (color_idx != cur_color_idx) {
            if (use_background) {
              emit_set_256_color_bg(&ob, color_idx);
            } else {
              emit_set_256_color_fg(&ob, color_idx);
            }
            cur_color_idx = color_idx;
          }

          ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
          if (rep_is_profitable(run)) {
            emit_rep(&ob, run - 1);
          } else {
            for (uint32_t k = 1; k < run; k++) {
              ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
            }
          }
          i += run;
        }
      } else {
        // Truecolor mode: scalar sequence assembly accelerated by the decimal lookup table
        char temp_buffer[16 * 50]; // Temporary buffer for 16 ANSI sequences (up to 50 bytes each)
        size_t assembled_length =
            neon_assemble_truecolor_sequences_true_simd(char_indices, pix.val[0], pix.val[1], pix.val[2], utf8_cache,
                                                        temp_buffer, sizeof(temp_buffer), use_background);

        // Write assembled output to main buffer
        ob_write(&ob, temp_buffer, assembled_length);
      }
      x += 16;
    }

    // Scalar tail for remaining pixels
    for (; x < width;) {
      const rgb_pixel_t *p = &row[x];
      uint32_t R = p->r, G = p->g, B = p->b;
      uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
      uint8_t luma_idx = Y >> 2; // 0-63 index (matches SIMD: cache64 is indexed by luminance bucket)
      const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];

      if (use_256color) {
        // 256-color scalar tail
        uint8_t color_idx = rgb_to_256color((uint8_t)R, (uint8_t)G, (uint8_t)B);

        int j = x + 1;
        while (j < width) {
          const rgb_pixel_t *q = &row[j];
          uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
          uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
          uint8_t luma_idx2 = Y2 >> 2; // Compare luminance buckets (matches SIMD)
          uint8_t color_idx2 = rgb_to_256color((uint8_t)R2, (uint8_t)G2, (uint8_t)B2);
          if (luma_idx2 != luma_idx || color_idx2 != color_idx)
            break;
          j++;
        }
        uint32_t run = (uint32_t)(j - x);

        if (color_idx != cur_color_idx) {
          if (use_background) {
            emit_set_256_color_bg(&ob, color_idx);
          } else {
            emit_set_256_color_fg(&ob, color_idx);
          }
          cur_color_idx = color_idx;
        }

        // Emit UTF-8 character from cache
        ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
        if (rep_is_profitable(run)) {
          emit_rep(&ob, run - 1);
        } else {
          for (uint32_t k = 1; k < run; k++) {
            ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
          }
        }
        x = j;
      } else {
        // Truecolor scalar tail with UTF-8 characters using cached lookups
        int j = x + 1;
        while (j < width) {
          const rgb_pixel_t *q = &row[j];
          uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
          uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
          uint8_t luma_idx2 = Y2 >> 2; // Compare luminance buckets (matches SIMD)
          if (luma_idx2 != luma_idx || R2 != R || G2 != G || B2 != B)
            break;
          j++;
        }
        uint32_t run = (uint32_t)(j - x);

        if ((int)R != curR || (int)G != curG || (int)B != curB) {
          if (use_background) {
            emit_set_truecolor_bg(&ob, (uint8_t)R, (uint8_t)G, (uint8_t)B);
          } else {
            emit_set_truecolor_fg(&ob, (uint8_t)R, (uint8_t)G, (uint8_t)B);
          }
          curR = (int)R;
          curG = (int)G;
          curB = (int)B;
        }

        // Emit UTF-8 character from cache
        ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
        if (rep_is_profitable(run)) {
          emit_rep(&ob, run - 1);
        } else {
          for (uint32_t k = 1; k < run; k++) {
            ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
          }
        }
        x = j;
      }
    }

    // End row: reset SGR, add newline (except for last row)
    emit_reset(&ob);
    if (y < height - 1) { // Only add newline if not the last row
      ob_putc(&ob, '\n');
    }
    curR = curG = curB = -1;
    cur_color_idx = -1;
  }

  ob_term(&ob);
  return ob.buf;
}

//=============================================================================
// Optimized NEON Half-block renderer (based on ChatGPT reference)
//=============================================================================
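
// Illustrative output (assuming truecolor SGR is selected by the auto-select
// emit_set_fg/emit_set_bg helpers): a cell with a pure-red top pixel and a
// pure-blue bottom pixel renders as "\033[38;2;255;0;0m\033[48;2;0;0;255m▀",
// i.e. foreground = TOP row, background = BOTTOM row under the upper half
// block U+2580, so one terminal row encodes two pixel rows.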
char *rgb_to_truecolor_halfblocks_neon(const uint8_t *rgb, int width, int height, int stride_bytes) {
  /* Main: half-block renderer. Returns NUL-terminated malloc'd string; caller free(). */
  if (width <= 0 || height <= 0)
    return platform_strdup("");
  if (stride_bytes <= 0)
    stride_bytes = width * 3;

  outbuf_t ob = {0};
  // generous guess: per cell ~ 10-14 bytes avg; half the rows + newlines
  size_t est_cells = (size_t)width * ((size_t)(height + 1) / 2);
  ob.cap = est_cells * 14u + (size_t)((height + 1) / 2) * 8u + 64u;
  ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
  if (!ob.buf)
    return NULL;

  // current SGR state; -1 means unknown
  int cur_fr = -1, cur_fg = -1, cur_fb = -1;
  int cur_br = -1, cur_bg = -1, cur_bb = -1;

  // process two source rows per emitted line
  for (int y = 0; y < height; y += 2) {
    const uint8_t *rowT = rgb + (size_t)y * (size_t)stride_bytes;
    const uint8_t *rowB = (y + 1 < height) ? rowT + (size_t)stride_bytes : NULL;

    int x = 0;
    while (x + 16 <= width) {
      // Load 16 top and bottom pixels (RGB interleaved)
      const uint8_t *pT = rowT + (size_t)x * 3u;
      uint8x16x3_t top = vld3q_u8(pT);

      uint8x16x3_t bot;
      if (rowB) {
        const uint8_t *pB = rowB + (size_t)x * 3u;
        bot = vld3q_u8(pB);
      } else {
        // synthesize bottom = top for odd-height last row
        bot.val[0] = top.val[0];
        bot.val[1] = top.val[1];
        bot.val[2] = top.val[2];
      }

      // Spill to small arrays (cheap; enables simple scalar RLE over 16)
      uint8_t Rt[16], Gt[16], Bt[16], Rb[16], Gb[16], Bb[16];
      vst1q_u8(Rt, top.val[0]);
      vst1q_u8(Gt, top.val[1]);
      vst1q_u8(Bt, top.val[2]);
      vst1q_u8(Rb, bot.val[0]);
      vst1q_u8(Gb, bot.val[1]);
      vst1q_u8(Bb, bot.val[2]);

      // RLE over the 16 cells
      for (int i = 0; i < 16;) {
        uint8_t rT = Rt[i], gT = Gt[i], bT = Bt[i];
        uint8_t rB = Rb[i], gB = Gb[i], bB = Bb[i];

        // Always half-block: U+2580 "▀" (upper half)
        const uint8_t glyph_utf8[3] = {0xE2, 0x96, 0x80};

        // Extend run while next cell has same top+bottom colors
        int j = i + 1;
        for (; j < 16; ++j) {
          if (!(Rt[j] == rT && Gt[j] == gT && Bt[j] == bT && Rb[j] == rB && Gb[j] == gB && Bb[j] == bB))
            break;
        }
        uint32_t run = (uint32_t)(j - i);

        // Check if this is a transparent area (black pixels = padding/background)
        bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);

        if (is_transparent) {
          // Reset colors before transparent areas to prevent color bleeding
          if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
            emit_reset(&ob);
            cur_fr = cur_fg = cur_fb = -1;
            cur_br = cur_bg = cur_bb = -1;
          }
          // For transparent areas, emit space character with no color codes (terminal default)
          ob_write(&ob, " ", 1);
          if (rep_is_profitable(run)) {
            emit_rep(&ob, run - 1);
          } else {
            for (uint32_t k = 1; k < run; ++k) {
              ob_write(&ob, " ", 1);
            }
          }
        } else {
          // Normal colored half-blocks - set fg to TOP, bg to BOTTOM if changed
          if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
            emit_set_fg(&ob, rT, gT, bT);
            cur_fr = rT;
            cur_fg = gT;
            cur_fb = bT;
          }
          if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
            emit_set_bg(&ob, rB, gB, bB);
            cur_br = rB;
            cur_bg = gB;
            cur_bb = bB;
          }

          // Emit glyph once, then REP or literals
          ob_write(&ob, (const char *)glyph_utf8, 3);
          if (rep_is_profitable(run)) {
            emit_rep(&ob, run - 1);
          } else {
            for (uint32_t k = 1; k < run; ++k) {
              ob_write(&ob, (const char *)glyph_utf8, 3);
            }
          }
        }

        i = j;
      }
      x += 16;
    }

    // Scalar tail (covers leftover cells, or the whole row when width < 16)
    for (; x < width;) {
      const uint8_t *pT = rowT + (size_t)x * 3u;
      const uint8_t *pB = rowB ? rowB + (size_t)x * 3u : NULL;

      uint8_t rT = pT[0], gT = pT[1], bT = pT[2];
      uint8_t rB = rT, gB = gT, bB = bT;
      if (pB) {
        rB = pB[0];
        gB = pB[1];
        bB = pB[2];
      }

      // Extend run while top and bottom colors match exactly
      int j = x + 1;
      for (; j < width; ++j) {
        const uint8_t *qT = rowT + (size_t)j * 3u;
        const uint8_t *qB = rowB ? rowB + (size_t)j * 3u : NULL;
        uint8_t rT2 = qT[0], gT2 = qT[1], bT2 = qT[2];
        uint8_t rB2 = qB ? qB[0] : rT2, gB2 = qB ? qB[1] : gT2, bB2 = qB ? qB[2] : bT2;
        if (!((rT2 == rT && gT2 == gT && bT2 == bT) && (rB2 == rB && gB2 == gB && bB2 == bB)))
          break;
      }
      uint32_t run = (uint32_t)(j - x);

      // Check if this is a transparent area (black pixels = padding/background)
      bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);

      if (is_transparent) {
        // Reset colors before transparent areas to prevent color bleeding
        if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
          emit_reset(&ob);
          cur_fr = cur_fg = cur_fb = -1;
          cur_br = cur_bg = cur_bb = -1;
        }
        // For transparent areas, emit space character with no color codes
        ob_write(&ob, " ", 1);
        if (rep_is_profitable(run)) {
          emit_rep(&ob, run - 1);
        } else {
          for (uint32_t k = 1; k < run; ++k) {
            ob_write(&ob, " ", 1);
          }
        }
      } else {
        // SGR: fg = TOP, bg = BOTTOM for colored areas
        if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
          emit_set_fg(&ob, rT, gT, bT);
          cur_fr = rT;
          cur_fg = gT;
          cur_fb = bT;
        }
        if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
          emit_set_bg(&ob, rB, gB, bB);
          cur_br = rB;
          cur_bg = gB;
          cur_bb = bB;
        }

        // Always the upper half block "▀" (U+2580)
        static const char HB[3] = {(char)0xE2, (char)0x96, (char)0x80};
        ob_write(&ob, HB, 3);
        if (rep_is_profitable(run)) {
          emit_rep(&ob, run - 1);
        } else {
          for (uint32_t k = 1; k < run; ++k) {
            ob_write(&ob, HB, 3);
          }
        }
      }

      x = j;
    }

    // End emitted line: reset and newline (only for non-final lines)
    emit_reset(&ob);
    // Check if this is the last output line (since we process 2 pixel rows per output line)
    if (y + 2 < height) { // Only add newline if not the last output line
      ob_putc(&ob, '\n');
    }
    cur_fr = cur_fg = cur_fb = -1;
    cur_br = cur_bg = cur_bb = -1;
  }

  ob_term(&ob);
  return ob.buf;
}
#endif // SIMD_SUPPORT_NEON