ascii-chat 0.6.0
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
sve.c
Go to the documentation of this file.
1
7#if SIMD_SUPPORT_SVE
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <stdint.h>
12#include "sve.h"
13#include "common.h"
14#include "ascii_simd.h" // For LUMA_RED, LUMA_GREEN, LUMA_BLUE, LUMA_THRESHOLD
15#include "video/output_buffer.h" // For outbuf_t, emit_*, ob_*
16
17#include <arm_sve.h>
18
19#include "util/overflow.h"
20
21//=============================================================================
22// Image-based API (matches NEON architecture)
23//=============================================================================
24
25// Simple monochrome ASCII function (matches scalar image_print performance)
26char *render_ascii_image_monochrome_sve(const image_t *image, const char *ascii_chars) {
27 if (!image || !image->pixels) {
28 return NULL;
29 }
30
31 const int h = image->h;
32 const int w = image->w;
33
34 if (h <= 0 || w <= 0) {
35 return NULL;
36 }
37
38 // Get cached UTF-8 character mappings
39 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
40 if (!utf8_cache) {
41 log_error("Failed to get UTF-8 palette cache");
42 return NULL;
43 }
44
45 // Buffer size for UTF-8 characters
46 const size_t max_char_bytes = 4;
47 const size_t len = (size_t)h * ((size_t)w * max_char_bytes + 1);
48
49 char *output = SAFE_MALLOC(len, char *);
50
51 char *pos = output;
52 const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;
53
54 // Pure SVE processing - matches NEON approach but with scalable vectors
55 for (int y = 0; y < h; y++) {
56 const rgb_pixel_t *row = &pixels[y * w];
57 int x = 0;
58
59 // Process pixels with SVE (scalable vector length - typically 128, 256, or 512 bits)
60 svbool_t pg = svptrue_b8(); // Predicate for all lanes
61 (void)pg; // May be unused in some code paths
62
63 while (x < w) {
64 // Calculate how many pixels we can process in this iteration
65 int remaining = w - x;
66 (void)remaining;
67 svbool_t pg_active = svwhilelt_b8_s32(x, w);
68 int vec_len = svcntb_pat(SV_ALL) / 3; // Vector length in RGB pixels (3 bytes per pixel)
69 int process_count = (remaining < vec_len) ? remaining : vec_len;
70
71 // Manual deinterleave RGB components (SVE limitation vs NEON's vld3)
72 uint8_t r_array[64], g_array[64], b_array[64]; // Max SVE vector size
73 for (int j = 0; j < process_count; j++) {
74 if (x + j < w) {
75 r_array[j] = row[x + j].r;
76 g_array[j] = row[x + j].g;
77 b_array[j] = row[x + j].b;
78 }
79 }
80
81 // Load into SVE vectors
82 svuint8_t r_vec = svld1_u8(pg_active, r_array);
83 svuint8_t g_vec = svld1_u8(pg_active, g_array);
84 svuint8_t b_vec = svld1_u8(pg_active, b_array);
85
86 // Convert to 16-bit for arithmetic
87 svuint16_t r_16 = svunpklo_u16(r_vec);
88 svuint16_t g_16 = svunpklo_u16(g_vec);
89 svuint16_t b_16 = svunpklo_u16(b_vec);
90
91 // Calculate luminance: (77*R + 150*G + 29*B + 128) >> 8
92 svuint16_t luma = svmul_n_u16_x(svptrue_b16(), r_16, LUMA_RED);
93 luma = svmla_n_u16_x(svptrue_b16(), luma, g_16, LUMA_GREEN);
94 luma = svmla_n_u16_x(svptrue_b16(), luma, b_16, LUMA_BLUE);
95 luma = svadd_n_u16_x(svptrue_b16(), luma, LUMA_THRESHOLD);
96 luma = svlsr_n_u16_x(svptrue_b16(), luma, 8);
97
98 // Store u16 luminance values (SVE1 compatible - no SVE2 narrowing intrinsics)
99 // After right-shift by 8, values are already in 0-255 range
100 uint16_t luma_temp[64];
101 svst1_u16(svptrue_b16(), luma_temp, luma);
102
103 // Convert to u8 array for ASCII lookup
104 uint8_t luma_array[64];
105 for (int j = 0; j < process_count; j++) {
106 luma_array[j] = (uint8_t)luma_temp[j];
107 }
108
109 for (int j = 0; j < process_count; j++) {
110 if (x + j < w) {
111 const utf8_char_t *char_info = &utf8_cache->cache[luma_array[j]];
112 // Optimized: Use direct assignment for single-byte ASCII characters
113 if (char_info->byte_len == 1) {
114 *pos++ = char_info->utf8_bytes[0];
115 } else {
116 // Fallback to full memcpy for multi-byte UTF-8
117 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
118 pos += char_info->byte_len;
119 }
120 }
121 }
122 x += process_count;
123 }
124
125 // Add newline (except for last row)
126 if (y < h - 1) {
127 *pos++ = '\n';
128 }
129 }
130
131 // Null terminate
132 *pos = '\0';
133
134 return output;
135}
136
/*
 * Map an 8-bit RGB triple to an ANSI 256-color palette index.
 *
 * Each channel is quantized to one of six levels (value / 51, giving 0-5)
 * and the levels are combined as 16 + 36*r + 6*g + b, so the result always
 * lies in the range 16..231.
 */
static inline uint8_t rgb_to_256color_sve(uint8_t r, uint8_t g, uint8_t b) {
  const int red_level = r / 51;
  const int green_level = g / 51;
  const int blue_level = b / 51;

  const int palette_index = 16 + 36 * red_level + 6 * green_level + blue_level;
  return (uint8_t)palette_index;
}
141
142// Unified SVE function for all color modes (full implementation like NEON)
143char *render_ascii_sve_unified_optimized(const image_t *image, bool use_background, bool use_256color,
144 const char *ascii_chars) {
145 if (!image || !image->pixels) {
146 return NULL;
147 }
148
149 const int width = image->w;
150 const int height = image->h;
151
152 if (width <= 0 || height <= 0) {
153 char *empty;
154 empty = SAFE_MALLOC(1, char *);
155 empty[0] = '\0';
156 return empty;
157 }
158
159 // Use monochrome optimization for simple case
160 if (!use_background && !use_256color) {
161 return render_ascii_image_monochrome_sve(image, ascii_chars);
162 }
163
164 outbuf_t ob = {0};
165 // Estimate buffer size based on mode (copied from NEON)
166 size_t bytes_per_pixel = use_256color ? 6u : 8u; // 256-color shorter than truecolor
167
168 // Calculate buffer size with overflow checking
169 size_t height_times_width;
170 if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
171 log_error("Buffer size overflow: height * width overflow");
172 return NULL;
173 }
174
175 size_t pixel_data_size;
176 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
177 log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
178 return NULL;
179 }
180
181 size_t height_times_16;
182 if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
183 log_error("Buffer size overflow: height * 16 overflow");
184 return NULL;
185 }
186
187 size_t temp;
188 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
189 log_error("Buffer size overflow: pixel_data + height*16 overflow");
190 return NULL;
191 }
192
193 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
194 log_error("Buffer size overflow: total capacity overflow");
195 return NULL;
196 }
197
198 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
199 if (!ob.buf)
200 return NULL;
201
202 // Get cached UTF-8 character mappings for color rendering
203 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
204 if (!utf8_cache) {
205 log_error("Failed to get UTF-8 palette cache for SVE color");
206 return NULL;
207 }
208
209 // Track current color state (copied from NEON)
210 int curR = -1, curG = -1, curB = -1;
211 int cur_color_idx = -1;
212
213 for (int y = 0; y < height; y++) {
214 const rgb_pixel_t *row = &((const rgb_pixel_t *)image->pixels)[y * width];
215 int x = 0;
216
217 // Process with SVE scalable vectors (adapts to hardware vector length)
218 while (x < width) {
219 svbool_t pg_active = svwhilelt_b8_s32(x, width);
220 int vec_len = svcntb_pat(SV_ALL) / 3; // Vector length in RGB pixels
221 int remaining = width - x;
222 int process_count = (remaining < vec_len) ? remaining : vec_len;
223
224 // Manual deinterleave RGB components (SVE limitation vs NEON's vld3)
225 uint8_t r_array[64], g_array[64], b_array[64]; // Max SVE vector size
226 for (int j = 0; j < process_count; j++) {
227 if (x + j < width) {
228 r_array[j] = row[x + j].r;
229 g_array[j] = row[x + j].g;
230 b_array[j] = row[x + j].b;
231 }
232 }
233
234 // Load into SVE vectors
235 svuint8_t r_vec = svld1_u8(pg_active, r_array);
236 svuint8_t g_vec = svld1_u8(pg_active, g_array);
237 svuint8_t b_vec = svld1_u8(pg_active, b_array);
238
239 // Convert to 16-bit for arithmetic
240 svuint16_t r_16 = svunpklo_u16(r_vec);
241 svuint16_t g_16 = svunpklo_u16(g_vec);
242 svuint16_t b_16 = svunpklo_u16(b_vec);
243
244 // Calculate luminance: (77*R + 150*G + 29*B + 128) >> 8
245 svuint16_t luma = svmul_n_u16_x(svptrue_b16(), r_16, LUMA_RED);
246 luma = svmla_n_u16_x(svptrue_b16(), luma, g_16, LUMA_GREEN);
247 luma = svmla_n_u16_x(svptrue_b16(), luma, b_16, LUMA_BLUE);
248 luma = svadd_n_u16_x(svptrue_b16(), luma, LUMA_THRESHOLD);
249 luma = svlsr_n_u16_x(svptrue_b16(), luma, 8);
250
251 // Store u16 luminance values (SVE1 compatible - no SVE2 narrowing intrinsics)
252 // After right-shift by 8, values are already in 0-255 range
253 uint16_t luma_temp[64];
254 svst1_u16(svptrue_b16(), luma_temp, luma);
255
256 // Convert to u8 array for ASCII lookup
257 uint8_t luma_array[64];
258 for (int j = 0; j < process_count; j++) {
259 luma_array[j] = (uint8_t)luma_temp[j];
260 }
261
262 // FAST: Use svtbl_u8 to get character indices from the ramp (SVE advantage)
263 // Convert luminance to 0-63 indices
264 svuint8_t luma_vec = svld1_u8(pg_active, luma_array); // Load luminance values
265 svuint8_t luma_idx_vec = svlsr_n_u8_x(svptrue_b8(), luma_vec, 2); // >> 2 for 0-63
266
267 // Use svtbl_u8 for fast character index lookup (scalable!)
268 svuint8_t char_lut_vec = svld1_u8(svptrue_b8(), utf8_cache->char_index_ramp);
269 svuint8_t char_indices_vec = svtbl_u8(char_lut_vec, luma_idx_vec);
270
271 uint8_t gbuf[64]; // Reuse gbuf name for compatibility
272 svst1_u8(pg_active, gbuf, char_indices_vec);
273
274 if (use_256color) {
275 // 256-color mode processing (copied from NEON logic)
276 uint8_t color_indices[64];
277 for (int i = 0; i < process_count; i++) {
278 color_indices[i] = rgb_to_256color_sve(r_array[i], g_array[i], b_array[i]);
279 }
280
281 // Emit with RLE on (glyph, color) runs (copied from NEON)
282 for (int i = 0; i < process_count;) {
283 const uint8_t char_idx = gbuf[i]; // This is now the character index
284 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
285 const uint8_t color_idx = color_indices[i];
286
287 int j = i + 1;
288 while (j < process_count && gbuf[j] == char_idx && color_indices[j] == color_idx) {
289 j++;
290 }
291 const uint32_t run = (uint32_t)(j - i);
292
293 if (color_idx != cur_color_idx) {
294 if (use_background) {
295 emit_set_256_color_bg(&ob, color_idx);
296 } else {
297 emit_set_256_color_fg(&ob, color_idx);
298 }
299 cur_color_idx = color_idx;
300 }
301
302 // Emit UTF-8 character from cache
303 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
304 if (rep_is_profitable(run)) {
305 emit_rep(&ob, run - 1);
306 } else {
307 for (uint32_t k = 1; k < run; k++) {
308 // Emit UTF-8 character from cache
309 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
310 }
311 }
312 i = j;
313 }
314 } else {
315 // Truecolor mode processing (copied from NEON logic)
316 for (int i = 0; i < process_count;) {
317 const uint8_t char_idx = gbuf[i]; // This is now the character index
318 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
319 const uint8_t r = r_array[i];
320 const uint8_t g = g_array[i];
321 const uint8_t b = b_array[i];
322
323 int j = i + 1;
324 while (j < process_count && gbuf[j] == char_idx && r_array[j] == r && g_array[j] == g && b_array[j] == b) {
325 j++;
326 }
327 const uint32_t run = (uint32_t)(j - i);
328
329 if (r != curR || g != curG || b != curB) {
330 if (use_background) {
331 emit_set_truecolor_bg(&ob, r, g, b);
332 } else {
333 emit_set_truecolor_fg(&ob, r, g, b);
334 }
335 curR = r;
336 curG = g;
337 curB = b;
338 }
339
340 // Emit UTF-8 character from cache
341 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
342 if (rep_is_profitable(run)) {
343 emit_rep(&ob, run - 1);
344 } else {
345 for (uint32_t k = 1; k < run; k++) {
346 // Emit UTF-8 character from cache
347 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
348 }
349 }
350 i = j;
351 }
352 }
353 x += process_count;
354 }
355
356 // Scalar tail for any remaining pixels (copied from NEON logic)
357 for (; x < width;) {
358 const rgb_pixel_t *p = &row[x];
359 uint32_t R = p->r, G = p->g, B = p->b;
360 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
361 uint8_t luma_idx = Y >> 2;
362 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
363
364 if (use_256color) {
365 // 256-color scalar tail
366 uint8_t color_idx = rgb_to_256color_sve((uint8_t)R, (uint8_t)G, (uint8_t)B);
367
368 int j = x + 1;
369 while (j < width) {
370 const rgb_pixel_t *q = &row[j];
371 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
372 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
373 uint8_t color_idx2 = rgb_to_256color_sve((uint8_t)R2, (uint8_t)G2, (uint8_t)B2);
374 if (((Y2 >> 2) != (Y >> 2)) || color_idx2 != color_idx)
375 break;
376 j++;
377 }
378 uint32_t run = (uint32_t)(j - x);
379
380 if (color_idx != cur_color_idx) {
381 if (use_background) {
382 emit_set_256_color_bg(&ob, color_idx);
383 } else {
384 emit_set_256_color_fg(&ob, color_idx);
385 }
386 cur_color_idx = color_idx;
387 }
388
389 // Emit UTF-8 character from cache
390 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
391 if (rep_is_profitable(run)) {
392 emit_rep(&ob, run - 1);
393 } else {
394 for (uint32_t k = 1; k < run; k++) {
395 // Emit UTF-8 character from cache
396 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
397 }
398 }
399 x = j;
400 } else {
401 // Truecolor scalar tail
402 int j = x + 1;
403 while (j < width) {
404 const rgb_pixel_t *q = &row[j];
405 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
406 uint8_t Y2 = (uint8_t)((77u * R2 + 150u * G2 + 29u * B2 + 128u) >> 8);
407 if (((Y2 >> 2) != (Y >> 2)) || R2 != R || G2 != G || B2 != B)
408 break;
409 j++;
410 }
411 uint32_t run = (uint32_t)(j - x);
412
413 if ((int)R != curR || (int)G != curG || (int)B != curB) {
414 if (use_background) {
416 } else {
418 }
419 curR = (int)R;
420 curG = (int)G;
421 curB = (int)B;
422 }
423
424 // Emit UTF-8 character from cache
425 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
426 if (rep_is_profitable(run)) {
427 emit_rep(&ob, run - 1);
428 } else {
429 for (uint32_t k = 1; k < run; k++) {
430 // Emit UTF-8 character from cache
431 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
432 }
433 }
434 x = j;
435 }
436 }
437
438 // End row: reset SGR, add newline (except for last row) (copied from NEON)
439 emit_reset(&ob);
440 if (y < height - 1) {
441 ob_putc(&ob, '\n');
442 }
443 curR = curG = curB = -1;
444 cur_color_idx = -1;
445 }
446
447 ob_term(&ob);
448 return ob.buf;
449}
450
/**
 * Shutdown hook for SVE cache resources.
 *
 * The SVE renderers own no caches of their own (they use the shared caches
 * from common.c), so this function frees nothing and only logs a debug
 * message.
 */
void sve_caches_destroy(void) {
  /* Nothing SVE-specific to release; just record that the hook ran. */
  log_debug("SVE_CACHE: SVE caches cleaned up");
}
456
457#endif /* SIMD_SUPPORT_SVE */
SIMD-optimized ASCII conversion interface.
unsigned short uint16_t
Definition common.h:57
unsigned int uint32_t
Definition common.h:58
#define SAFE_MALLOC(size, cast)
Definition common.h:208
unsigned char uint8_t
Definition common.h:56
@ ASCIICHAT_OK
Definition error_codes.h:48
#define log_error(...)
Log an ERROR message.
#define log_debug(...)
Log a DEBUG message.
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
Definition ascii_simd.h:76
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color background ANSI sequence.
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
Definition ascii_simd.h:74
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color foreground ANSI sequence.
void ob_term(outbuf_t *ob)
Append null terminator to buffer.
#define LUMA_THRESHOLD
Luminance threshold for rounding.
Definition ascii_simd.h:78
void ob_putc(outbuf_t *ob, char c)
Append a character to buffer.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor foreground ANSI sequence.
void emit_rep(outbuf_t *ob, uint32_t extra)
Emit run-length encoded sequence.
void ob_write(outbuf_t *ob, const char *s, size_t n)
Append a string to buffer.
void emit_reset(outbuf_t *ob)
Emit ANSI reset sequence.
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor background ANSI sequence.
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
Definition ascii_simd.h:72
Dynamic Output Buffer with ANSI Sequence Support.
✅ Safe Integer Arithmetic and Overflow Detection
Image structure.
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
Dynamic output buffer (auto-expanding)
size_t cap
Buffer capacity in bytes (maximum length before reallocation)
char * buf
Buffer pointer (allocated, owned by caller, must be freed)
UTF-8 character structure.
UTF-8 palette cache structure.
SVE-optimized ASCII rendering functions.