ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
neon.c
Go to the documentation of this file.
1
7#if SIMD_SUPPORT_NEON
8#include <stdio.h>
9#include <stdlib.h>
10#include <stdint.h>
11#include <string.h>
12#include <stdarg.h>
13#include <time.h>
14#include <assert.h>
15#include <stdatomic.h>
16#include <math.h>
17
18#include <arm_neon.h>
19
20#include <ascii-chat/common.h>
21#include <ascii-chat/video/simd/neon.h>
22#include <ascii-chat/video/simd/ascii_simd.h>
23#include <ascii-chat/video/image.h>
24#include <ascii-chat/video/simd/common.h>
25#include <ascii-chat/video/output_buffer.h>
26#include <ascii-chat/video/ansi_fast.h>
27#include <ascii-chat/util/overflow.h>
28#include <ascii-chat/platform/init.h>
29#include <ascii-chat/util/time.h>
30#include <ascii-chat/log/logging.h>
31
32// NEON table cache removed - performance analysis showed rebuilding (30ns) is faster than lookup (50ns)
33// Tables are now built inline when needed for optimal performance
34
// Build the NEON lookup tables inline (faster than caching - 30ns rebuild vs 50ns lookup).
// Fills seven uint8x16x4_t tables from the 64-entry UTF-8 palette cache:
// an identity luminance->slot table, an ASCII fast-path table, a byte-length
// table, and one table per UTF-8 byte position (0..3).
static inline void build_neon_lookup_tables(utf8_palette_cache_t *utf8_cache, uint8x16x4_t *tbl, uint8x16x4_t *char_lut,
                                            uint8x16x4_t *length_lut, uint8x16x4_t *char_byte0_lut,
                                            uint8x16x4_t *char_byte1_lut, uint8x16x4_t *char_byte2_lut,
                                            uint8x16x4_t *char_byte3_lut) {
  uint8_t identity[64];    // luminance bucket -> cache64 slot (direct mapping)
  uint8_t ascii_bytes[64]; // first byte of each glyph (ASCII fast path)
  uint8_t glyph_lens[64];  // UTF-8 byte length of each glyph
  uint8_t b0[64];          // glyph byte 0
  uint8_t b1[64];          // glyph byte 1 (0 when the glyph is shorter)
  uint8_t b2[64];          // glyph byte 2 (0 when the glyph is shorter)
  uint8_t b3[64];          // glyph byte 3 (0 when the glyph is shorter)

  for (int slot = 0; slot < 64; slot++) {
    const utf8_char_t *glyph = &utf8_cache->cache64[slot];

    identity[slot] = (uint8_t)slot;
    ascii_bytes[slot] = glyph->utf8_bytes[0];
    glyph_lens[slot] = glyph->byte_len;
    b0[slot] = glyph->utf8_bytes[0];
    b1[slot] = (glyph->byte_len > 1) ? glyph->utf8_bytes[1] : 0;
    b2[slot] = (glyph->byte_len > 2) ? glyph->utf8_bytes[2] : 0;
    b3[slot] = (glyph->byte_len > 3) ? glyph->utf8_bytes[3] : 0;
  }

  // Load each 64-entry table into four 16-lane NEON registers (quarters of 16).
  for (int q = 0; q < 4; q++) {
    const int base = q * 16;
    tbl->val[q] = vld1q_u8(&identity[base]);
    char_lut->val[q] = vld1q_u8(&ascii_bytes[base]);
    length_lut->val[q] = vld1q_u8(&glyph_lens[base]);
    char_byte0_lut->val[q] = vld1q_u8(&b0[base]);
    char_byte1_lut->val[q] = vld1q_u8(&b1[base]);
    char_byte2_lut->val[q] = vld1q_u8(&b2[base]);
    char_byte3_lut->val[q] = vld1q_u8(&b3[base]);
  }
}
104
// NEON cache destruction no longer needed - tables are built inline.
// Kept as a public no-op so existing shutdown/cleanup call sites keep linking.
void neon_caches_destroy(void) {
  // No-op: NEON table cache removed for performance
  // Tables are now built inline (30ns) which is faster than cache lookup (50ns)
}
110
// NEON-optimized RLE detection: length of the run of (target_char, target_color)
// pairs starting at start_pos in the parallel char/color buffers, capped at max_len.
// Returns at least 1 (the starting element itself).
static inline int find_rle_run_length_neon(const uint8_t *char_buf, const uint8_t *color_buf, int start_pos,
                                           int max_len, uint8_t target_char, uint8_t target_color) {
  int run_length = 1; // At least the starting position

  int remaining = max_len - start_pos - 1;
  if (remaining <= 0)
    return 1;

  const uint8_t *char_ptr = &char_buf[start_pos + 1];
  const uint8_t *color_ptr = &color_buf[start_pos + 1];

  // Process in chunks of 16 for full NEON utilization
  while (remaining >= 16) {
    uint8x16_t chars = vld1q_u8(char_ptr);
    uint8x16_t colors = vld1q_u8(color_ptr);

    uint8x16_t char_match = vceqq_u8(chars, vdupq_n_u8(target_char));
    uint8x16_t color_match = vceqq_u8(colors, vdupq_n_u8(target_color));
    uint8x16_t both_match = vandq_u8(char_match, color_match);

    // Horizontal min of the mask is 0xFF iff every lane matched
    uint8_t min_match = vminvq_u8(both_match);

    if (min_match == 0xFF) {
      // All 16 elements match - extend the run and keep scanning
      run_length += 16;
      char_ptr += 16;
      color_ptr += 16;
      remaining -= 16;
    } else {
      // Run ends inside this chunk: count the leading matching lanes.
      // (Byte order of the 64-bit lanes matches memory order on little-endian AArch64.)
      uint64_t mask_lo = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 0);
      uint64_t mask_hi = vgetq_lane_u64(vreinterpretq_u64_u8(both_match), 1);

      int matches_found = 0;
      for (int i = 0; i < 8; i++) {
        if ((mask_lo >> (i * 8)) & 0xFF) {
          matches_found++;
        } else {
          break;
        }
      }
      if (matches_found == 8) {
        for (int i = 0; i < 8; i++) {
          if ((mask_hi >> (i * 8)) & 0xFF) {
            matches_found++;
          } else {
            break;
          }
        }
      }

      // BUG FIX: return directly instead of breaking out to the scalar tail.
      // The previous code did `break` without advancing char_ptr/color_ptr or
      // decrementing `remaining`, so the scalar loop below re-scanned this
      // chunk's matching prefix and double-counted it, inflating the run.
      return run_length + matches_found;
    }
  }

  // Scalar tail for the final <16 elements (only reached when no mismatch was
  // found inside a full NEON chunk)
  while (remaining > 0 && *char_ptr == target_char && *color_ptr == target_color) {
    run_length++;
    char_ptr++;
    color_ptr++;
    remaining--;
  }

  return run_length;
}
184
// NEON helper: returns true (and stores the length) when all 16 lanes carry
// the same character byte length; false otherwise (*out_length untouched).
static inline bool all_same_length_neon(uint8x16_t lengths, uint8_t *out_length) {
  const uint8_t lane0 = vgetq_lane_u8(lengths, 0);
  const uint8x16_t eq_mask = vceqq_u8(lengths, vdupq_n_u8(lane0));

  // Every lane of eq_mask is 0xFF exactly when the horizontal minimum is 0xFF
  // (same reduction trick used by the RLE scanner above).
  if (vminvq_u8(eq_mask) == 0xFF) {
    *out_length = lane0;
    return true;
  }
  return false;
}
200
201// ============================================================================
202// Vectorized Decimal Lookup Functions for NEON Color Performance
203// ============================================================================
204
// NEON TBL lookup tables for decimal conversion (256 entries each)
// Format: each entry has length byte + up to 3 decimal chars (4 bytes per entry),
// i.e. entry i encodes the decimal string of the byte value i.
static uint8_t neon_decimal_table_data[256 * 4]; // 1024 bytes: [len][d1][d2][d3] per entry
static bool neon_decimal_table_initialized = false; // written only under g_neon_table_init_mutex
// Mutex to protect NEON decimal table initialization (TOCTOU race prevention)
static static_mutex_t g_neon_table_init_mutex = STATIC_MUTEX_INIT;
211
212// Initialize NEON TBL decimal lookup table (called once at startup)
213// Thread-safe with proper mutex protection
214void init_neon_decimal_table(void) {
215 static_mutex_lock(&g_neon_table_init_mutex);
216
217 // Double-check under lock: another thread may have initialized while we waited
218 if (neon_decimal_table_initialized) {
219 static_mutex_unlock(&g_neon_table_init_mutex);
220 return;
221 }
222
223 // Initialize g_dec3_cache first (also mutex-protected if needed)
224 if (!g_dec3_cache.dec3_initialized) {
225 init_dec3();
226 }
227
228 // Convert dec3_t cache to NEON TBL format: [len][d1][d2][d3] per 4-byte entry
229 for (int i = 0; i < 256; i++) {
230 const dec3_t *dec = &g_dec3_cache.dec3_table[i];
231 uint8_t *entry = &neon_decimal_table_data[i * 4];
232 entry[0] = dec->len; // Length (1-3)
233 entry[1] = (dec->len >= 1) ? dec->s[0] : '0'; // First digit
234 entry[2] = (dec->len >= 2) ? dec->s[1] : '0'; // Second digit
235 entry[3] = (dec->len >= 3) ? dec->s[2] : '0'; // Third digit
236 }
237
238 neon_decimal_table_initialized = true;
239 static_mutex_unlock(&g_neon_table_init_mutex);
240}
241
242// TODO: Implement true NEON vectorized ANSI sequence generation using TBL + compaction
243// Following the monochrome pattern: pad sequences to uniform width, then compact null bytes
244// For now, keep the existing scalar approach to avoid breaking the build
245
// Assemble 16 ANSI truecolor sequences (SGR 38;2;r;g;b or 48;2;r;g;b + glyph)
// into output_buffer. Returns the number of bytes written (stops early, without
// error, if the buffer cannot hold the next full sequence).
static inline size_t neon_assemble_truecolor_sequences_true_simd(uint8x16_t char_indices, uint8x16_t r_vals,
                                                                 uint8x16_t g_vals, uint8x16_t b_vals,
                                                                 utf8_palette_cache_t *utf8_cache, char *output_buffer,
                                                                 size_t buffer_capacity, bool use_background) {
  // STREAMLINED IMPLEMENTATION: Focus on the real bottleneck - RGB->decimal conversion
  // Key insight: ANSI sequences are too variable for effective SIMD, but TBL lookups provide major speedup

  // NOTE: Ensure NEON decimal table is initialized BEFORE calling this function (done in
  // render_ascii_neon_unified_optimized)

  char *dst = output_buffer;

  // Spill vector lanes to scalar buffers; the per-sequence assembly below is scalar
  uint8_t char_idx_buf[16], r_buf[16], g_buf[16], b_buf[16];
  vst1q_u8(char_idx_buf, char_indices);
  vst1q_u8(r_buf, r_vals);
  vst1q_u8(g_buf, g_vals);
  vst1q_u8(b_buf, b_vals);

  size_t total_written = 0;
  const char *prefix = use_background ? "\033[48;2;" : "\033[38;2;";
  const size_t prefix_len = 7;

  // Optimized scalar loop with precomputed decimal strings for each RGB byte
  // This eliminates the expensive safe_snprintf() calls which were the real bottleneck
  for (int i = 0; i < 16; i++) {
    // [len][d1][d2][d3] entries built by init_neon_decimal_table()
    const uint8_t *r_entry = &neon_decimal_table_data[r_buf[i] * 4];
    const uint8_t *g_entry = &neon_decimal_table_data[g_buf[i] * 4];
    const uint8_t *b_entry = &neon_decimal_table_data[b_buf[i] * 4];

    const uint8_t char_idx = char_idx_buf[i];
    const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];

    // Exact byte count of this sequence: prefix + r;g;b digits + 'm' + glyph
    size_t seq_len = prefix_len + r_entry[0] + 1 + g_entry[0] + 1 + b_entry[0] + 1 + char_info->byte_len;

    // BUG FIX: the old check was `total_written >= buffer_capacity - seq_len`,
    // which underflows (size_t wrap) when seq_len > buffer_capacity and then
    // never breaks, allowing a write past the end of output_buffer. Compare
    // against the remaining space instead; total_written <= buffer_capacity
    // is an invariant of this loop, so the subtraction cannot wrap.
    if (seq_len > buffer_capacity - total_written) {
      break; // Not enough room for the next full sequence
    }

    // Optimized assembly using table results (no divisions, no snprintf!)
    memcpy(dst, prefix, prefix_len);
    dst += prefix_len;

    // RGB components using pre-computed decimal strings
    memcpy(dst, &r_entry[1], r_entry[0]);
    dst += r_entry[0];
    *dst++ = ';';

    memcpy(dst, &g_entry[1], g_entry[0]);
    dst += g_entry[0];
    *dst++ = ';';

    memcpy(dst, &b_entry[1], b_entry[0]);
    dst += b_entry[0];
    *dst++ = 'm';

    // UTF-8 character from cache
    memcpy(dst, char_info->utf8_bytes, char_info->byte_len);
    dst += char_info->byte_len;

    total_written = (size_t)(dst - output_buffer);
  }

  return total_written;
}
313
314// Min-heap management removed - no longer needed without NEON table cache
315
316// Eviction logic removed - no longer needed without NEON table cache
317
318// Continue to actual NEON functions (helper functions already defined above)
319
320// Definitions are in ascii_simd.h - just use them
321// REMOVED: #define luminance_palette g_ascii_cache.luminance_palette (causes macro expansion issues)
322
323// SIMD luma and helpers:
324
// SIMD luminance: Y = (77R + 150G + 29B) >> 8 for 16 pixels at once.
// Widens each channel to 16-bit halves, accumulates the weighted sum in 32-bit
// lanes (so 77*255 + 150*255 + 29*255 = 65280 cannot overflow), then narrows
// back to 8-bit with rounding shifts (vrshrn_n_u32 rounds, vqmovn saturates).
static inline uint8x16_t simd_luma_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b) {
  // Widen each 8-bit channel into low/high 16-bit halves
  uint16x8_t rl = vmovl_u8(vget_low_u8(r));
  uint16x8_t rh = vmovl_u8(vget_high_u8(r));
  uint16x8_t gl = vmovl_u8(vget_low_u8(g));
  uint16x8_t gh = vmovl_u8(vget_high_u8(g));
  uint16x8_t bl = vmovl_u8(vget_low_u8(b));
  uint16x8_t bh = vmovl_u8(vget_high_u8(b));

  // Low half: 32-bit multiply-accumulate of the three weighted channels
  uint32x4_t l0 = vmull_n_u16(vget_low_u16(rl), LUMA_RED);
  uint32x4_t l1 = vmull_n_u16(vget_high_u16(rl), LUMA_RED);
  l0 = vmlal_n_u16(l0, vget_low_u16(gl), LUMA_GREEN);
  l1 = vmlal_n_u16(l1, vget_high_u16(gl), LUMA_GREEN);
  l0 = vmlal_n_u16(l0, vget_low_u16(bl), LUMA_BLUE);
  l1 = vmlal_n_u16(l1, vget_high_u16(bl), LUMA_BLUE);

  // High half: same accumulation for pixels 8..15
  uint32x4_t h0 = vmull_n_u16(vget_low_u16(rh), LUMA_RED);
  uint32x4_t h1 = vmull_n_u16(vget_high_u16(rh), LUMA_RED);
  h0 = vmlal_n_u16(h0, vget_low_u16(gh), LUMA_GREEN);
  h1 = vmlal_n_u16(h1, vget_high_u16(gh), LUMA_GREEN);
  h0 = vmlal_n_u16(h0, vget_low_u16(bh), LUMA_BLUE);
  h1 = vmlal_n_u16(h1, vget_high_u16(bh), LUMA_BLUE);

  // Rounding narrow by 8 (the /256 of the fixed-point weights), then pack to 8-bit
  uint16x8_t l = vcombine_u16(vrshrn_n_u32(l0, 8), vrshrn_n_u32(l1, 8));
  uint16x8_t h = vcombine_u16(vrshrn_n_u32(h0, 8), vrshrn_n_u32(h1, 8));
  return vcombine_u8(vqmovn_u16(l), vqmovn_u16(h));
}
352
353// ===== SIMD helpers for 256-color quantization =====
354
// Approximate quantize 0..255 -> 0..5 : q ≈ round(x*5/255) = (x*5 + 128) >> 8.
// Done in 16-bit lanes: max intermediate is 255*5 + 128 = 1403, well within u16.
static inline uint8x16_t q6_from_u8(uint8x16_t x) {
  const uint16x8_t half = vdupq_n_u16(128); // rounding bias

  uint16x8_t lo = vmovl_u8(vget_low_u8(x));
  uint16x8_t hi = vmovl_u8(vget_high_u8(x));

  lo = vshrq_n_u16(vaddq_u16(vmulq_n_u16(lo, 5), half), 8);
  hi = vshrq_n_u16(vaddq_u16(vmulq_n_u16(hi, 5), half), 8);

  return vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)); // 0..5 per lane
}
367
368// Make 256-color index (cube vs gray). threshold: max-min < thr ⇒ gray
369#ifndef CUBE_GRAY_THRESHOLD
370#define CUBE_GRAY_THRESHOLD 10
371#endif
372
// Apply ordered (Bayer) dithering to one channel of 16 consecutive pixels.
// Adds a per-pixel threshold scaled by dither_strength; with strength 0 the
// added offset is 0 in every lane, making this an identity transform.
static inline uint8x16_t apply_ordered_dither(uint8x16_t color, int pixel_offset, uint8_t dither_strength) {
  // Classic 4x4 Bayer matrix, flattened row-major
  static const uint8_t bayer4x4[16] = {0, 8, 2, 10, 12, 4, 14, 6, 3, 11, 1, 9, 15, 7, 13, 5};

  const uint8x16_t dither_matrix = vld1q_u8(bayer4x4);

  // Matrix position for each of the 16 lanes, wrapped into 0..15
  uint8_t lane_pos[16];
  for (int lane = 0; lane < 16; lane++) {
    lane_pos[lane] = (uint8_t)((pixel_offset + lane) & 15);
  }
  const uint8x16_t pos_vec = vld1q_u8(lane_pos);

  // TBL gather: one Bayer threshold per lane in a single instruction
  const uint8x16_t thresholds = vqtbl1q_u8(dither_matrix, pos_vec);

  // Scale thresholds by strength, then >>4 to normalize the 0..15 matrix range
  uint16x8_t scaled_lo = vshrq_n_u16(vmulq_n_u16(vmovl_u8(vget_low_u8(thresholds)), dither_strength), 4);
  uint16x8_t scaled_hi = vshrq_n_u16(vmulq_n_u16(vmovl_u8(vget_high_u8(thresholds)), dither_strength), 4);
  const uint8x16_t offsets = vcombine_u8(vqmovn_u16(scaled_lo), vqmovn_u16(scaled_hi));

  // Saturating add keeps channel values from wrapping past 255
  return vqaddq_u8(color, offsets);
}
402
// Map 16 RGB pixels to xterm 256-color palette indices.
// Near-gray pixels (max-min < CUBE_GRAY_THRESHOLD) use the 24-step gray ramp
// (232..255); everything else uses the 6x6x6 color cube (16..231).
uint8x16_t palette256_index_dithered_neon(uint8x16_t r, uint8x16_t g, uint8x16_t b, int pixel_offset) {
  // Dithering disabled in speed mode: strength 0 makes these calls identity ops
  r = apply_ordered_dither(r, pixel_offset, 0);
  g = apply_ordered_dither(g, pixel_offset + 1, 0);
  b = apply_ordered_dither(b, pixel_offset + 2, 0);

  // Quantize each channel to the 6-level cube axis (0..5)
  const uint8x16_t r6 = q6_from_u8(r);
  const uint8x16_t g6 = q6_from_u8(g);
  const uint8x16_t b6 = q6_from_u8(b);

  // cube index = 16 + r6*36 + g6*6 + b6, computed in 16-bit lanes to avoid overflow
  uint16x8_t r6_lo = vmovl_u8(vget_low_u8(r6));
  uint16x8_t r6_hi = vmovl_u8(vget_high_u8(r6));
  uint16x8_t g6_lo = vmovl_u8(vget_low_u8(g6));
  uint16x8_t g6_hi = vmovl_u8(vget_high_u8(g6));
  uint16x8_t b6_lo = vmovl_u8(vget_low_u8(b6));
  uint16x8_t b6_hi = vmovl_u8(vget_high_u8(b6));

  uint16x8_t cube_lo = vaddq_u16(vaddq_u16(vmlaq_n_u16(vmulq_n_u16(r6_lo, 36), g6_lo, 6), b6_lo), vdupq_n_u16(16));
  uint16x8_t cube_hi = vaddq_u16(vaddq_u16(vmlaq_n_u16(vmulq_n_u16(r6_hi, 36), g6_hi, 6), b6_hi), vdupq_n_u16(16));

  // Gray decision per lane: channel spread below the threshold
  uint8x16_t chan_max = vmaxq_u8(vmaxq_u8(r, g), b);
  uint8x16_t chan_min = vminq_u8(vminq_u8(r, g), b);
  uint8x16_t is_gray = vcltq_u8(vsubq_u8(chan_max, chan_min), vdupq_n_u8((uint8_t)CUBE_GRAY_THRESHOLD));

  // Gray ramp index = 232 + round(Y*23/255) ≈ 232 + ((Y*23 + 128) >> 8)
  uint8x16_t luma = simd_luma_neon(r, g, b);
  uint16x8_t y_lo = vmovl_u8(vget_low_u8(luma));
  uint16x8_t y_hi = vmovl_u8(vget_high_u8(luma));
  y_lo = vshrq_n_u16(vaddq_u16(vmulq_n_u16(y_lo, 23), vdupq_n_u16(128)), 8);
  y_hi = vshrq_n_u16(vaddq_u16(vmulq_n_u16(y_hi, 23), vdupq_n_u16(128)), 8);
  uint16x8_t gray_lo = vaddq_u16(y_lo, vdupq_n_u16(232));
  uint16x8_t gray_hi = vaddq_u16(y_hi, vdupq_n_u16(232));

  // Narrow both candidates to 8-bit and select gray vs cube per lane
  uint8x16_t idx_cube = vcombine_u8(vqmovn_u16(cube_lo), vqmovn_u16(cube_hi));
  uint8x16_t idx_gray = vcombine_u8(vqmovn_u16(gray_lo), vqmovn_u16(gray_hi));
  return vbslq_u8(is_gray, idx_gray, idx_cube);
}
457
458//=============================================================================
459// Simple Monochrome ASCII Function (matches scalar image_print performance)
460//=============================================================================
461
462char *render_ascii_image_monochrome_neon(const image_t *image, const char *ascii_chars) {
463 if (!image || !image->pixels || !ascii_chars) {
464 return NULL;
465 }
466
467 const int h = image->h;
468 const int w = image->w;
469
470 if (h <= 0 || w <= 0) {
471 return NULL;
472 }
473
474 // Get cached UTF-8 character mappings
475 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
476 if (!utf8_cache) {
477 log_error("Failed to get UTF-8 palette cache");
478 return NULL;
479 }
480
481 // Build NEON lookup tables inline (faster than caching - 30ns rebuild vs 50ns lookup)
482 uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
483 build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
484 &char_byte3_lut);
485
486 // Estimate output buffer size for UTF-8 characters
487 const size_t max_char_bytes = 4; // Max UTF-8 character size
488
489 // Calculate buffer size with overflow checking
490 size_t w_times_bytes;
491 if (checked_size_mul((size_t)w, max_char_bytes, &w_times_bytes) != ASCIICHAT_OK) {
492 log_error("Buffer size overflow: width too large for UTF-8 encoding");
493 return NULL;
494 }
495
496 size_t w_times_bytes_plus_one;
497 if (checked_size_add(w_times_bytes, 1, &w_times_bytes_plus_one) != ASCIICHAT_OK) {
498 log_error("Buffer size overflow: width * bytes + 1 overflow");
499 return NULL;
500 }
501
502 size_t len;
503 if (checked_size_mul((size_t)h, w_times_bytes_plus_one, &len) != ASCIICHAT_OK) {
504 log_error("Buffer size overflow: height * (width * bytes + 1) overflow");
505 return NULL;
506 }
507
508 // Use SIMD-aligned allocation for optimal vectorized write performance
509 char *output = SAFE_MALLOC_SIMD(len, char *);
510 if (output == NULL) {
511 return NULL; // SAFE_MALLOC_SIMD already called FATAL, but satisfy analyzer
512 }
513
514 char *pos = output;
515 const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;
516
517 // Pure NEON processing - no scalar fallbacks
518 for (int y = 0; y < h; y++) {
519 const rgb_pixel_t *row = &pixels[y * w];
520 int x = 0;
521
522 // Process 16 pixels at a time with NEON
523 for (; x + 15 < w; x += 16) {
524 // Load 16 RGB pixels (48 bytes)
525 uint8x16x3_t rgb = vld3q_u8((const uint8_t *)(row + x));
526
527 // Calculate luminance for all 16 pixels: (77*R + 150*G + 29*B + 128) >> 8
528 uint16x8_t luma_lo = vmull_u8(vget_low_u8(rgb.val[0]), vdup_n_u8(LUMA_RED)); // R * 77
529 luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[1]), vdup_n_u8(LUMA_GREEN)); // + G * 150
530 luma_lo = vmlal_u8(luma_lo, vget_low_u8(rgb.val[2]), vdup_n_u8(LUMA_BLUE)); // + B * 29
531 luma_lo = vaddq_u16(luma_lo, vdupq_n_u16(128)); // + 128 (rounding)
532 luma_lo = vshrq_n_u16(luma_lo, 8); // >> 8
533
534 uint16x8_t luma_hi = vmull_u8(vget_high_u8(rgb.val[0]), vdup_n_u8(LUMA_RED));
535 luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[1]), vdup_n_u8(LUMA_GREEN));
536 luma_hi = vmlal_u8(luma_hi, vget_high_u8(rgb.val[2]), vdup_n_u8(LUMA_BLUE));
537 luma_hi = vaddq_u16(luma_hi, vdupq_n_u16(128));
538 luma_hi = vshrq_n_u16(luma_hi, 8);
539
540 // Convert 16-bit luminance back to 8-bit
541 uint8x16_t luminance = vcombine_u8(vmovn_u16(luma_lo), vmovn_u16(luma_hi));
542
543 // NEON optimization: Use vqtbl4q_u8 for fast character index lookup
544 // Convert luminance (0-255) to 6-bit bucket (0-63) to match scalar behavior
545 uint8x16_t luma_buckets = vshrq_n_u8(luminance, 2); // >> 2 to get 0-63 range
546 uint8x16_t char_indices = vqtbl4q_u8(tbl, luma_buckets); // 16 lookups in 1 instruction!
547
548 // VECTORIZED UTF-8 CHARACTER GENERATION: Length-aware compaction
549
550 // Step 1: Get character lengths vectorially
551 uint8x16_t char_lengths = vqtbl4q_u8(length_lut, char_indices);
552
553 // Step 2: Check if all characters have same length (vectorized check)
554 uint8_t uniform_length;
555 if (all_same_length_neon(char_lengths, &uniform_length)) {
556
557 if (uniform_length == 1) {
558 // PURE ASCII PATH: 16 characters = 16 bytes (maximum vectorization)
559 uint8x16_t ascii_output = vqtbl4q_u8(char_lut, char_indices);
560 vst1q_u8((uint8_t *)pos, ascii_output);
561 pos += 16;
562
563 } else if (uniform_length == 4) {
564 // PURE 4-BYTE UTF-8 PATH: 16 characters = 64 bytes
565 // Gather all 4 byte streams in parallel
566 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
567 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
568 uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
569 uint8x16_t byte3_stream = vqtbl4q_u8(char_byte3_lut, char_indices);
570
571 // Interleave bytes: [char0_byte0, char0_byte1, char0_byte2, char0_byte3, char1_byte0, ...]
572 uint8x16x4_t interleaved;
573 interleaved.val[0] = byte0_stream;
574 interleaved.val[1] = byte1_stream;
575 interleaved.val[2] = byte2_stream;
576 interleaved.val[3] = byte3_stream;
577
578 // Store interleaved UTF-8 data: 64 bytes total
579 vst4q_u8((uint8_t *)pos, interleaved);
580 pos += 64;
581
582 } else if (uniform_length == 2) {
583 // PURE 2-BYTE UTF-8 PATH: 16 characters = 32 bytes (vectorized)
584 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
585 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
586
587 // Interleave: [char0_b0, char0_b1, char1_b0, char1_b1, ...]
588 uint8x16x2_t interleaved_2byte;
589 interleaved_2byte.val[0] = byte0_stream;
590 interleaved_2byte.val[1] = byte1_stream;
591
592 vst2q_u8((uint8_t *)pos, interleaved_2byte);
593 pos += 32;
594
595 } else if (uniform_length == 3) {
596 // PURE 3-BYTE UTF-8 PATH: 16 characters = 48 bytes (vectorized)
597 uint8x16_t byte0_stream = vqtbl4q_u8(char_byte0_lut, char_indices);
598 uint8x16_t byte1_stream = vqtbl4q_u8(char_byte1_lut, char_indices);
599 uint8x16_t byte2_stream = vqtbl4q_u8(char_byte2_lut, char_indices);
600
601 // Interleave: [char0_b0, char0_b1, char0_b2, char1_b0, char1_b1, char1_b2, ...]
602 uint8x16x3_t interleaved_3byte;
603 interleaved_3byte.val[0] = byte0_stream;
604 interleaved_3byte.val[1] = byte1_stream;
605 interleaved_3byte.val[2] = byte2_stream;
606
607 vst3q_u8((uint8_t *)pos, interleaved_3byte);
608 pos += 48;
609 }
610
611 } else {
612 // MIXED LENGTH PATH: SIMD shuffle mask optimization
613 // Use vqtbl4q_u8 to gather UTF-8 bytes in 4 passes, then compact with fast scalar
614
615 // Gather all UTF-8 bytes using existing lookup tables with shuffle masks
616 uint8x16_t byte0_vec = vqtbl4q_u8(char_byte0_lut, char_indices);
617 uint8x16_t byte1_vec = vqtbl4q_u8(char_byte1_lut, char_indices);
618 uint8x16_t byte2_vec = vqtbl4q_u8(char_byte2_lut, char_indices);
619 uint8x16_t byte3_vec = vqtbl4q_u8(char_byte3_lut, char_indices);
620
621 // Store gathered bytes to temporary buffers
622 uint8_t byte0_buf[16], byte1_buf[16], byte2_buf[16], byte3_buf[16];
623 vst1q_u8(byte0_buf, byte0_vec);
624 vst1q_u8(byte1_buf, byte1_vec);
625 vst1q_u8(byte2_buf, byte2_vec);
626 vst1q_u8(byte3_buf, byte3_vec);
627
628 // Fast scalar compaction: emit only valid bytes based on character lengths
629 // Store char_indices to buffer for lookup
630 uint8_t char_idx_buf[16];
631 vst1q_u8(char_idx_buf, char_indices);
632
633 for (int i = 0; i < 16; i++) {
634 const uint8_t char_idx = char_idx_buf[i];
635 const uint8_t byte_len = utf8_cache->cache64[char_idx].byte_len;
636
637 // Emit bytes based on character length (1-4 bytes)
638 *pos++ = byte0_buf[i];
639 if (byte_len > 1)
640 *pos++ = byte1_buf[i];
641 if (byte_len > 2)
642 *pos++ = byte2_buf[i];
643 if (byte_len > 3)
644 *pos++ = byte3_buf[i];
645 }
646 }
647 }
648
649 // Handle remaining pixels with optimized scalar code using 64-entry cache
650 for (; x < w; x++) {
651 const rgb_pixel_t pixel = row[x];
652 const uint8_t luminance = (LUMA_RED * pixel.r + LUMA_GREEN * pixel.g + LUMA_BLUE * pixel.b + 128) >> 8;
653 const uint8_t luma_idx = luminance >> 2; // Map 0..255 to 0..63 (same as NEON)
654 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx]; // Direct cache64 access
655 // Optimized: Use direct assignment for single-byte ASCII characters
656 if (char_info->byte_len == 1) {
657 *pos++ = char_info->utf8_bytes[0];
658 } else {
659 // Fallback to full memcpy for multi-byte UTF-8
660 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
661 pos += char_info->byte_len;
662 }
663 }
664
665 // Add newline (except last row)
666 if (y < h - 1) {
667 *pos++ = '\n';
668 }
669 }
670
671 // Null terminate
672 *pos = '\0';
673
674 return output;
675}
676
677//=============================================================================
678// Optimized NEON Color Converter (based on ChatGPT reference)
679//=============================================================================
680
681// Unified optimized NEON converter (foreground/background + 256-color/truecolor)
682char *render_ascii_neon_unified_optimized(const image_t *image, bool use_background, bool use_256color,
683 const char *ascii_chars) {
684 if (!image || !image->pixels) {
685 return NULL;
686 }
687
688 const int width = image->w;
689 const int height = image->h;
690
691 if (width <= 0 || height <= 0) {
692 char *empty;
693 empty = SAFE_MALLOC(1, char *);
694 empty[0] = '\0';
695 return empty;
696 }
697
698 outbuf_t ob = {0};
699 // Estimate buffer size based on mode
700 size_t bytes_per_pixel = use_256color ? 6u : 8u; // 256-color shorter than truecolor
701
702 // Calculate buffer size with overflow checking
703 size_t height_times_width;
704 if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
705 log_error("Buffer size overflow: height * width overflow");
706 return NULL;
707 }
708
709 size_t pixel_data_size;
710 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
711 log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
712 return NULL;
713 }
714
715 size_t height_times_16;
716 if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
717 log_error("Buffer size overflow: height * 16 overflow");
718 return NULL;
719 }
720
721 size_t temp;
722 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
723 log_error("Buffer size overflow: pixel_data + height*16 overflow");
724 return NULL;
725 }
726
727 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
728 log_error("Buffer size overflow: total capacity overflow");
729 return NULL;
730 }
731
732 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
733 if (!ob.buf)
734 return NULL;
735
736 START_TIMER("neon_utf8_cache");
737 // Get cached UTF-8 character mappings (like monochrome function does)
738 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
739 if (!utf8_cache) {
740 log_error("Failed to get UTF-8 palette cache for NEON color");
741 return NULL;
742 }
743 STOP_TIMER_AND_LOG_EVERY(dev, 3 * NS_PER_SEC_INT, 3 * NS_PER_MS_INT, "neon_utf8_cache",
744 "NEON_UTF8_CACHE: Complete (%.2f ms)");
745
746 START_TIMER("neon_lookup_tables");
747 // Build NEON lookup table inline (faster than caching - 30ns rebuild vs 50ns lookup)
748 uint8x16x4_t tbl, char_lut, length_lut, char_byte0_lut, char_byte1_lut, char_byte2_lut, char_byte3_lut;
749 build_neon_lookup_tables(utf8_cache, &tbl, &char_lut, &length_lut, &char_byte0_lut, &char_byte1_lut, &char_byte2_lut,
750 &char_byte3_lut);
751 STOP_TIMER_AND_LOG_EVERY(dev, 3 * NS_PER_SEC_INT, 3 * NS_PER_MS_INT, "neon_lookup_tables",
752 "NEON_LOOKUP_TABLES: Complete (%.2f ms)");
753
754 // Suppress unused variable warnings for color mode
755 (void)char_lut;
756 (void)length_lut;
757 (void)char_byte0_lut;
758 (void)char_byte1_lut;
759 (void)char_byte2_lut;
760 (void)char_byte3_lut;
761
762 START_TIMER("neon_main_loop");
763 uint64_t loop_start_ns = time_get_ns();
764
765 // Track which code path is taken
766 int chunks_256color = 0, chunks_truecolor = 0;
767
768 // PRE-INITIALIZE: Call init once before loop
769 init_neon_decimal_table();
770
771 // Process rows
772 for (int y = 0; y < height; y++) {
773 // Track current color state
774 int curR = -1, curG = -1, curB = -1;
775 int cur_color_idx = -1;
776
777 const rgb_pixel_t *row = &((const rgb_pixel_t *)image->pixels)[y * width];
778 int x = 0;
779
780 // Process 16-pixel chunks with NEON
781 while (x + 16 <= width) {
782 // Load 16 pixels: R,G,B interleaved
783 const uint8_t *p = (const uint8_t *)(row + x);
784 uint8x16x3_t pix = vld3q_u8(p); // 48 bytes
785
786 // Vector luminance: Y ≈ (77*R + 150*G + 29*B + 128) >> 8
787 uint16x8_t ylo = vmull_u8(vget_low_u8(pix.val[0]), vdup_n_u8(LUMA_RED));
788 ylo = vmlal_u8(ylo, vget_low_u8(pix.val[1]), vdup_n_u8(LUMA_GREEN));
789 ylo = vmlal_u8(ylo, vget_low_u8(pix.val[2]), vdup_n_u8(LUMA_BLUE));
790 ylo = vaddq_u16(ylo, vdupq_n_u16(LUMA_THRESHOLD));
791 ylo = vshrq_n_u16(ylo, 8);
792
793 uint16x8_t yhi = vmull_u8(vget_high_u8(pix.val[0]), vdup_n_u8(LUMA_RED));
794 yhi = vmlal_u8(yhi, vget_high_u8(pix.val[1]), vdup_n_u8(LUMA_GREEN));
795 yhi = vmlal_u8(yhi, vget_high_u8(pix.val[2]), vdup_n_u8(LUMA_BLUE));
796 yhi = vaddq_u16(yhi, vdupq_n_u16(LUMA_THRESHOLD));
797 yhi = vshrq_n_u16(yhi, 8);
798
799 uint8x16_t y8 = vcombine_u8(vmovn_u16(ylo), vmovn_u16(yhi));
800 uint8x16_t idx = vshrq_n_u8(y8, 2); // 0..63
801
802 // FAST: Use vqtbl4q_u8 to get character indices from the ramp
803 uint8x16_t char_indices = vqtbl4q_u8(tbl, idx);
804
805 if (use_256color) {
806 chunks_256color++;
807 // 256-color mode: VECTORIZED color quantization
808 uint8_t char_idx_buf[16], color_indices[16];
809 vst1q_u8(char_idx_buf, char_indices); // Character indices from SIMD lookup
810
811 // VECTORIZED: Use existing optimized 256-color quantization
812 uint8x16_t color_indices_vec = palette256_index_dithered_neon(pix.val[0], pix.val[1], pix.val[2], x);
813 vst1q_u8(color_indices, color_indices_vec);
814
815 // Emit with RLE on (UTF-8 character, color) runs using SIMD-derived indices
816 for (int i = 0; i < 16;) {
817 const uint8_t char_idx = char_idx_buf[i]; // From vqtbl4q_u8 lookup
818 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
819 const uint8_t color_idx = color_indices[i];
820
821 // NEON-optimized RLE detection
822 const uint32_t run =
823 (uint32_t)find_rle_run_length_neon(char_idx_buf, color_indices, i, 16, char_idx, color_idx);
824
825 if (color_idx != cur_color_idx) {
826 if (use_background) {
827 emit_set_256_color_bg(&ob, color_idx);
828 } else {
829 emit_set_256_color_fg(&ob, color_idx);
830 }
831 cur_color_idx = color_idx;
832 }
833
834 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
835 if (rep_is_profitable(run)) {
836 emit_rep(&ob, run - 1);
837 } else {
838 for (uint32_t k = 1; k < run; k++) {
839 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
840 }
841 }
842 i += run;
843 }
844 } else {
845 chunks_truecolor++;
846 // VECTORIZED: Truecolor mode with full SIMD pipeline (no scalar spillover)
847 char temp_buffer[16 * 50]; // Temporary buffer for 16 ANSI sequences (up to 50 bytes each)
848 size_t vectorized_length =
849 neon_assemble_truecolor_sequences_true_simd(char_indices, pix.val[0], pix.val[1], pix.val[2], utf8_cache,
850 temp_buffer, sizeof(temp_buffer), use_background);
851
852 // Write vectorized output to main buffer
853 ob_write(&ob, temp_buffer, vectorized_length);
854 }
855 x += 16;
856 }
857
858 // Scalar tail for remaining pixels
859 for (; x < width;) {
860 const rgb_pixel_t *p = &row[x];
861 uint32_t R = p->r, G = p->g, B = p->b;
862 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
863 uint8_t luma_idx = Y >> 2; // 0-63 index (matches SIMD: cache64 is indexed by luminance bucket)
864 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
865
866 if (use_256color) {
867 // 256-color scalar tail
868 uint8_t color_idx = rgb_to_256color((uint8_t)R, (uint8_t)G, (uint8_t)B);
869
870 int j = x + 1;
871 while (j < width) {
872 const rgb_pixel_t *q = &row[j];
873 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
874 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
875 uint8_t luma_idx2 = Y2 >> 2;
876 uint8_t color_idx2 = rgb_to_256color((uint8_t)R2, (uint8_t)G2, (uint8_t)B2);
877 if (luma_idx2 != luma_idx || color_idx2 != color_idx)
878 break;
879 j++;
880 }
881 uint32_t run = (uint32_t)(j - x);
882
883 if (color_idx != cur_color_idx) {
884 if (use_background) {
885 emit_set_256_color_bg(&ob, color_idx);
886 } else {
887 emit_set_256_color_fg(&ob, color_idx);
888 }
889 cur_color_idx = color_idx;
890 }
891
892 // Emit UTF-8 character from cache
893 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
894 if (rep_is_profitable(run)) {
895 emit_rep(&ob, run - 1);
896 } else {
897 for (uint32_t k = 1; k < run; k++) {
898 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
899 }
900 }
901 x = j;
902 } else {
903 // Truecolor scalar tail with UTF-8 characters using cached lookups
904 int j = x + 1;
905 while (j < width) {
906 const rgb_pixel_t *q = &row[j];
907 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
908 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
909 uint8_t luma_idx2 = Y2 >> 2; // Compare luminance buckets (matches SIMD)
910 if (luma_idx2 != luma_idx || R2 != R || G2 != G || B2 != B)
911 break;
912 j++;
913 }
914 uint32_t run = (uint32_t)(j - x);
915
916 if ((int)R != curR || (int)G != curG || (int)B != curB) {
917 if (use_background) {
918 emit_set_truecolor_bg(&ob, (uint8_t)R, (uint8_t)G, (uint8_t)B);
919 } else {
920 emit_set_truecolor_fg(&ob, (uint8_t)R, (uint8_t)G, (uint8_t)B);
921 }
922 curR = (int)R;
923 curG = (int)G;
924 curB = (int)B;
925 }
926
927 // Emit UTF-8 character from cache
928 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
929 if (rep_is_profitable(run)) {
930 emit_rep(&ob, run - 1);
931 } else {
932 for (uint32_t k = 1; k < run; k++) {
933 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
934 }
935 }
936 x = j;
937 }
938 }
939
940 // End row: reset SGR, add newline (except for last row)
941 emit_reset(&ob);
942 if (y < height - 1) {
943 ob_putc(&ob, '\n');
944 }
945 }
946
947 uint64_t loop_end_ns = time_get_ns();
948 uint64_t loop_time_ms = (loop_end_ns - loop_start_ns) / NS_PER_MS_INT;
949 log_dev("NEON_MAIN_LOOP_ACTUAL: %llu ms for %d rows, %d width", loop_time_ms, height, width);
950
951 // Log chunks per mode
952 log_dev(
953 "NEON_MAIN_LOOP processed %d rows x %d width = %d pixels in %llu ms (256color: %d chunks, truecolor: %d chunks)",
954 height, width, height * width, loop_time_ms, chunks_256color, chunks_truecolor);
955
956 STOP_TIMER_AND_LOG_EVERY(dev, 3 * NS_PER_SEC_INT, 5 * NS_PER_MS_INT, "neon_main_loop",
957 "NEON_MAIN_LOOP: Complete (%.2f ms)");
958
959 ob_term(&ob);
960 return ob.buf;
961}
962
963//=============================================================================
964// Optimized NEON Half-block renderer (based on ChatGPT reference)
965//=============================================================================
/**
 * Render an RGB image as ANSI truecolor "half-block" art.
 *
 * Two source pixel rows are collapsed into one output text line: each
 * terminal cell is the upper-half-block glyph U+2580 with the foreground
 * SGR set to the TOP pixel's color and the background SGR set to the
 * BOTTOM pixel's color, doubling vertical resolution.  Runs of cells with
 * identical (top, bottom) color pairs are compressed with the terminal
 * repeat escape (emit_rep) when rep_is_profitable() says it is shorter
 * than literal repetition.  All-black cells are treated as transparent
 * padding: colors are reset and a plain space is emitted instead.
 *
 * @param rgb          Source pixels; assumes tightly packed 8-bit RGB
 *                     triplets, row-major — TODO confirm against callers.
 * @param width        Image width in pixels; <= 0 yields an empty string.
 * @param height       Image height in pixels; <= 0 yields an empty string.
 * @param stride_bytes Bytes per source row; <= 0 means width * 3 (no padding).
 * @return NUL-terminated malloc'd string; caller must free().  NULL only if
 *         the initial buffer allocation fails.
 */
char *rgb_to_truecolor_halfblocks_neon(const uint8_t *rgb, int width, int height, int stride_bytes) {
  /* Main: half-block renderer. Returns NUL-terminated malloc'd string; caller free(). */
  if (width <= 0 || height <= 0)
    return platform_strdup("");
  if (stride_bytes <= 0)
    stride_bytes = width * 3;

  outbuf_t ob = {0};
  // generous guess: per cell ~ 10–14 bytes avg; half the rows + newlines
  // (outbuf is presumed growable beyond this; the estimate just avoids
  // early reallocations — TODO confirm ob_write grows on overflow)
  size_t est_cells = (size_t)width * ((size_t)(height + 1) / 2);
  ob.cap = est_cells * 14u + (size_t)((height + 1) / 2) * 8u + 64u;
  ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
  if (!ob.buf)
    return NULL;

  // current SGR state; -1 means unknown (forces the first emit per line)
  int cur_fr = -1, cur_fg = -1, cur_fb = -1;
  int cur_br = -1, cur_bg = -1, cur_bb = -1;

  // process two source rows per emitted line
  for (int y = 0; y < height; y += 2) {
    const uint8_t *rowT = rgb + (size_t)y * (size_t)stride_bytes;
    const uint8_t *rowB = (y + 1 < height) ? rowT + (size_t)stride_bytes : NULL;

    int x = 0;
    // NEON fast path: 16 cells at a time.
    while (x + 16 <= width) {
      // Load 16 top and bottom pixels (RGB interleaved -> 3 deinterleaved lanes)
      const uint8_t *pT = rowT + (size_t)x * 3u;
      uint8x16x3_t top = vld3q_u8(pT);

      uint8x16x3_t bot;
      if (rowB) {
        const uint8_t *pB = rowB + (size_t)x * 3u;
        bot = vld3q_u8(pB);
      } else {
        // synthesize bottom = top for odd-height last row
        bot.val[0] = top.val[0];
        bot.val[1] = top.val[1];
        bot.val[2] = top.val[2];
      }

      // Spill to small arrays (cheap; enables simple scalar RLE over 16)
      uint8_t Rt[16], Gt[16], Bt[16], Rb[16], Gb[16], Bb[16];
      vst1q_u8(Rt, top.val[0]);
      vst1q_u8(Gt, top.val[1]);
      vst1q_u8(Bt, top.val[2]);
      vst1q_u8(Rb, bot.val[0]);
      vst1q_u8(Gb, bot.val[1]);
      vst1q_u8(Bb, bot.val[2]);

      // RLE over the 16 cells (runs do not extend across chunk boundaries)
      for (int i = 0; i < 16;) {
        uint8_t rT = Rt[i], gT = Gt[i], bT = Bt[i];
        uint8_t rB = Rb[i], gB = Gb[i], bB = Bb[i];

        // Always half-block: U+2580 "▀" (upper half)
        const uint8_t glyph_utf8[3] = {0xE2, 0x96, 0x80};

        // Extend run while next cell has same top+bottom colors
        int j = i + 1;
        for (; j < 16; ++j) {
          if (!(Rt[j] == rT && Gt[j] == gT && Bt[j] == bT && Rb[j] == rB && Gb[j] == gB && Bb[j] == bB))
            break;
        }
        uint32_t run = (uint32_t)(j - i);

        // Check if this is a transparent area (black pixels = padding/background)
        bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);

        if (is_transparent) {
          // Reset colors before transparent areas to prevent color bleeding
          if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
            emit_reset(&ob);
            cur_fr = cur_fg = cur_fb = -1;
            cur_br = cur_bg = cur_bb = -1;
          }
          // For transparent areas, emit space character with no color codes (terminal default)
          ob_write(&ob, " ", 1);
          if (rep_is_profitable(run)) {
            // repeat escape: the glyph was already written once, so run - 1
            emit_rep(&ob, run - 1);
          } else {
            for (uint32_t k = 1; k < run; ++k) {
              ob_write(&ob, " ", 1);
            }
          }
        } else {
          // Normal colored half-blocks - set fg to TOP, bg to BOTTOM if changed
          if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
            emit_set_fg(&ob, rT, gT, bT);
            cur_fr = rT;
            cur_fg = gT;
            cur_fb = bT;
          }
          if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
            emit_set_bg(&ob, rB, gB, bB);
            cur_br = rB;
            cur_bg = gB;
            cur_bb = bB;
          }

          // Emit glyph once, then REP or literals
          ob_write(&ob, (const char *)glyph_utf8, 3);
          if (rep_is_profitable(run)) {
            emit_rep(&ob, run - 1);
          } else {
            for (uint32_t k = 1; k < run; ++k) {
              ob_write(&ob, (const char *)glyph_utf8, 3);
            }
          }
        }

        i = j;
      }
      x += 16;
    }

    // Scalar tail (or full row if no NEON)
    for (; x < width;) {
      const uint8_t *pT = rowT + (size_t)x * 3u;
      const uint8_t *pB = rowB ? rowB + (size_t)x * 3u : NULL;

      uint8_t rT = pT[0], gT = pT[1], bT = pT[2];
      // Odd-height last row: mirror top into bottom, same as the SIMD path.
      uint8_t rB = rT, gB = gT, bB = bT;
      if (pB) {
        rB = pB[0];
        gB = pB[1];
        bB = pB[2];
      }

      // Extend run while top and bottom colors match exactly
      int j = x + 1;
      for (; j < width; ++j) {
        const uint8_t *qT = rowT + (size_t)j * 3u;
        const uint8_t *qB = rowB ? rowB + (size_t)j * 3u : NULL;
        uint8_t rT2 = qT[0], gT2 = qT[1], bT2 = qT[2];
        uint8_t rB2 = qB ? qB[0] : rT2, gB2 = qB ? qB[1] : gT2, bB2 = qB ? qB[2] : bT2;
        if (!((rT2 == rT && gT2 == gT && bT2 == bT) && (rB2 == rB && gB2 == gB && bB2 == bB)))
          break;
      }
      uint32_t run = (uint32_t)(j - x);

      // Check if this is a transparent area (black pixels = padding/background)
      bool is_transparent = (rT == 0 && gT == 0 && bT == 0 && rB == 0 && gB == 0 && bB == 0);

      if (is_transparent) {
        // Reset colors before transparent areas to prevent color bleeding
        if (cur_fr != -1 || cur_fg != -1 || cur_fb != -1 || cur_br != -1 || cur_bg != -1 || cur_bb != -1) {
          emit_reset(&ob);
          cur_fr = cur_fg = cur_fb = -1;
          cur_br = cur_bg = cur_bb = -1;
        }
        // For transparent areas, emit space character with no color codes
        ob_write(&ob, " ", 1);
        if (rep_is_profitable(run)) {
          emit_rep(&ob, run - 1);
        } else {
          for (uint32_t k = 1; k < run; ++k) {
            ob_write(&ob, " ", 1);
          }
        }
      } else {
        // SGR: fg = TOP, bg = BOTTOM for colored areas
        if (cur_fr != rT || cur_fg != gT || cur_fb != bT) {
          emit_set_fg(&ob, rT, gT, bT);
          cur_fr = rT;
          cur_fg = gT;
          cur_fb = bT;
        }
        if (cur_br != rB || cur_bg != gB || cur_bb != bB) {
          emit_set_bg(&ob, rB, gB, bB);
          cur_br = rB;
          cur_bg = gB;
          cur_bb = bB;
        }

        // Always the upper half block "▀" (U+2580)
        static const char HB[3] = {(char)0xE2, (char)0x96, (char)0x80};
        ob_write(&ob, HB, 3);
        if (rep_is_profitable(run)) {
          emit_rep(&ob, run - 1);
        } else {
          for (uint32_t k = 1; k < run; ++k) {
            ob_write(&ob, HB, 3);
          }
        }
      }

      x = j;
    }

    // End emitted line: reset and newline (only for non-final lines)
    emit_reset(&ob);
    // Check if this is the last output line (since we process 2 pixel rows per output line)
    if (y + 2 < height) { // Only add newline if not the last output line
      ob_putc(&ob, '\n');
    }
    // SGR state was reset above; forget the cached colors to match.
    cur_fr = cur_fg = cur_fb = -1;
    cur_br = cur_bg = cur_bb = -1;
  }

  ob_term(&ob);
  return ob.buf;
}
1169
1176void image_flip_horizontal_neon(image_t *image) {
1177 if (!image || !image->pixels || image->w < 2) {
1178 return;
1179 }
1180
1181 // Process each row - swap pixels from both ends using NEON for faster loads/stores
1182 for (int y = 0; y < image->h; y++) {
1183 rgb_pixel_t *row = &image->pixels[y * image->w];
1184 int width = image->w;
1185
1186 // NEON-accelerated swapping: process 4 pixels at a time using uint32 loads
1187 // Each RGB pixel is 3 bytes, so 4 pixels = 12 bytes that can be loaded as 3x u32
1188 int left_pix = 0;
1189 int right_pix = width - 1;
1190
1191 // Fast path: swap 4-pixel groups using uint32 operations
1192 while (left_pix + 3 < right_pix - 3) {
1193 // Load left 4 pixels (12 bytes) as 3 uint32 values using NEON
1194 uint32_t *left_ptr = (uint32_t *)&row[left_pix];
1195 uint32_t *right_ptr = (uint32_t *)&row[right_pix - 3];
1196
1197 uint32x2_t left_0 = vld1_u32(left_ptr); // first 8 bytes
1198 uint32_t left_1 = left_ptr[2]; // last 4 bytes
1199
1200 uint32x2_t right_0 = vld1_u32(right_ptr); // first 8 bytes
1201 uint32_t right_1 = right_ptr[2]; // last 4 bytes
1202
1203 // Store swapped using NEON
1204 vst1_u32(right_ptr, left_0);
1205 right_ptr[2] = left_1;
1206 vst1_u32(left_ptr, right_0);
1207 left_ptr[2] = right_1;
1208
1209 left_pix += 4;
1210 right_pix -= 4;
1211 }
1212
1213 // Scalar cleanup for remaining pixels
1214 while (left_pix < right_pix) {
1215 rgb_pixel_t temp = row[left_pix];
1216 row[left_pix] = row[right_pix];
1217 row[right_pix] = temp;
1218 left_pix++;
1219 right_pix--;
1220 }
1221 }
1222}
1223
1224#endif // SIMD_SUPPORT_NEON
uint8_t rgb_to_256color(uint8_t r, uint8_t g, uint8_t b)
Definition ansi_fast.c:230
global_dec3_cache_t g_dec3_cache
Definition ascii_simd.c:25
void init_dec3(void)
Definition ascii_simd.c:66
#define CUBE_GRAY_THRESHOLD
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
void emit_set_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
void ob_term(outbuf_t *ob)
void ob_putc(outbuf_t *ob, char c)
bool rep_is_profitable(uint32_t runlen)
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_rep(outbuf_t *ob, uint32_t extra)
void ob_write(outbuf_t *ob, const char *s, size_t n)
void emit_reset(outbuf_t *ob)
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_set_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
char * platform_strdup(const char *s)
#define R2(v, w, x, y, z, i)
Definition sha1.c:58
uint64_t time_get_ns(void)
Definition util/time.c:48
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)