ascii-chat 0.6.0
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
ssse3.c
Go to the documentation of this file.
1
7#if SIMD_SUPPORT_SSSE3
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <stdint.h>
12
13#include <tmmintrin.h>
14
15#include "ssse3.h"
16#include "ascii_simd.h"
17#include "common.h"
18#include "../output_buffer.h"
19#include "util/overflow.h"
20
21//=============================================================================
22// Image-based API (matches NEON architecture)
23//=============================================================================
24
25// Simple monochrome ASCII function (matches scalar image_print performance)
26char *render_ascii_image_monochrome_ssse3(const image_t *image, const char *ascii_chars) {
27 if (!image || !image->pixels || !ascii_chars) {
28 return NULL;
29 }
30
31 const int h = image->h;
32 const int w = image->w;
33
34 if (h <= 0 || w <= 0) {
35 return NULL;
36 }
37
38 // Get cached UTF-8 character mappings
39 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
40 if (!utf8_cache) {
41 log_error("Failed to get UTF-8 palette cache");
42 return NULL;
43 }
44
45 // Buffer size for UTF-8 characters
46 const size_t max_char_bytes = 4;
47
48 // Calculate buffer size with overflow checking
49 size_t w_times_bytes;
50 if (checked_size_mul((size_t)w, max_char_bytes, &w_times_bytes) != ASCIICHAT_OK) {
51 log_error("Buffer size overflow: width too large for UTF-8 encoding");
52 return NULL;
53 }
54
55 size_t w_times_bytes_plus_one;
56 if (checked_size_add(w_times_bytes, 1, &w_times_bytes_plus_one) != ASCIICHAT_OK) {
57 log_error("Buffer size overflow: width * bytes + 1 overflow");
58 return NULL;
59 }
60
61 size_t len;
62 if (checked_size_mul((size_t)h, w_times_bytes_plus_one, &len) != ASCIICHAT_OK) {
63 log_error("Buffer size overflow: height * (width * bytes + 1) overflow");
64 return NULL;
65 }
66
67 char *output;
68 output = SAFE_MALLOC(len, char *);
69
70 char *pos = output;
71 const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;
72
73 // Pure SSSE3 processing - matches NEON approach
74 for (int y = 0; y < h; y++) {
75 const rgb_pixel_t *row = &pixels[y * w];
76 int x = 0;
77
78 // Process 16 pixels at a time with SSSE3 (full 128-bit register capacity)
79 for (; x + 15 < w; x += 16) {
80 // Manual deinterleave RGB components (SSSE3 limitation vs NEON's vld3q_u8)
81 uint8_t r_array[16], g_array[16], b_array[16];
82 for (int j = 0; j < 16; j++) {
83 r_array[j] = row[x + j].r;
84 g_array[j] = row[x + j].g;
85 b_array[j] = row[x + j].b;
86 }
87
88 // Process 16 pixels in two 8-pixel SSSE3 batches (same as SSE2 approach)
89 __m128i r_vec_lo = _mm_loadl_epi64((__m128i *)(r_array + 0));
90 __m128i r_vec_hi = _mm_loadl_epi64((__m128i *)(r_array + 8));
91 __m128i g_vec_lo = _mm_loadl_epi64((__m128i *)(g_array + 0));
92 __m128i g_vec_hi = _mm_loadl_epi64((__m128i *)(g_array + 8));
93 __m128i b_vec_lo = _mm_loadl_epi64((__m128i *)(b_array + 0));
94 __m128i b_vec_hi = _mm_loadl_epi64((__m128i *)(b_array + 8));
95
96 // Process first 8 pixels
97 __m128i r_16_lo = _mm_unpacklo_epi8(r_vec_lo, _mm_setzero_si128());
98 __m128i g_16_lo = _mm_unpacklo_epi8(g_vec_lo, _mm_setzero_si128());
99 __m128i b_16_lo = _mm_unpacklo_epi8(b_vec_lo, _mm_setzero_si128());
100
101 __m128i luma_r_lo = _mm_mullo_epi16(r_16_lo, _mm_set1_epi16(LUMA_RED));
102 __m128i luma_g_lo = _mm_mullo_epi16(g_16_lo, _mm_set1_epi16(LUMA_GREEN));
103 __m128i luma_b_lo = _mm_mullo_epi16(b_16_lo, _mm_set1_epi16(LUMA_BLUE));
104
105 __m128i luma_sum_lo = _mm_add_epi16(luma_r_lo, luma_g_lo);
106 luma_sum_lo = _mm_add_epi16(luma_sum_lo, luma_b_lo);
107 luma_sum_lo = _mm_add_epi16(luma_sum_lo, _mm_set1_epi16(LUMA_THRESHOLD));
108 luma_sum_lo = _mm_srli_epi16(luma_sum_lo, 8);
109
110 // Process second 8 pixels
111 __m128i r_16_hi = _mm_unpacklo_epi8(r_vec_hi, _mm_setzero_si128());
112 __m128i g_16_hi = _mm_unpacklo_epi8(g_vec_hi, _mm_setzero_si128());
113 __m128i b_16_hi = _mm_unpacklo_epi8(b_vec_hi, _mm_setzero_si128());
114
115 __m128i luma_r_hi = _mm_mullo_epi16(r_16_hi, _mm_set1_epi16(LUMA_RED));
116 __m128i luma_g_hi = _mm_mullo_epi16(g_16_hi, _mm_set1_epi16(LUMA_GREEN));
117 __m128i luma_b_hi = _mm_mullo_epi16(b_16_hi, _mm_set1_epi16(LUMA_BLUE));
118
119 __m128i luma_sum_hi = _mm_add_epi16(luma_r_hi, luma_g_hi);
120 luma_sum_hi = _mm_add_epi16(luma_sum_hi, luma_b_hi);
121 luma_sum_hi = _mm_add_epi16(luma_sum_hi, _mm_set1_epi16(LUMA_THRESHOLD));
122 luma_sum_hi = _mm_srli_epi16(luma_sum_hi, 8);
123
124 // Pack and store
125 __m128i luminance_lo = _mm_packus_epi16(luma_sum_lo, _mm_setzero_si128());
126 __m128i luminance_hi = _mm_packus_epi16(luma_sum_hi, _mm_setzero_si128());
127
128 uint8_t luma_array[16];
129 _mm_storel_epi64((__m128i *)(luma_array + 0), luminance_lo);
130 _mm_storel_epi64((__m128i *)(luma_array + 8), luminance_hi);
131
132 for (int j = 0; j < 16; j++) {
133 const utf8_char_t *char_info = &utf8_cache->cache[luma_array[j]];
134 // Optimized: Use direct assignment for single-byte ASCII characters
135 if (char_info->byte_len == 1) {
136 *pos++ = char_info->utf8_bytes[0];
137 } else {
138 // Fallback to full memcpy for multi-byte UTF-8
139 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
140 pos += char_info->byte_len;
141 }
142 }
143 }
144
145 // Handle remaining pixels with optimized scalar code
146 for (; x < w; x++) {
147 const rgb_pixel_t pixel = row[x];
148 const int luminance = (LUMA_RED * pixel.r + LUMA_GREEN * pixel.g + LUMA_BLUE * pixel.b + LUMA_THRESHOLD) >> 8;
149 const utf8_char_t *char_info = &utf8_cache->cache[luminance];
150 // Optimized: Use direct assignment for single-byte ASCII characters
151 if (char_info->byte_len == 1) {
152 *pos++ = char_info->utf8_bytes[0];
153 } else {
154 // Fallback to full memcpy for multi-byte UTF-8
155 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
156 pos += char_info->byte_len;
157 }
158 }
159
160 // Add newline (except for last row)
161 if (y < h - 1) {
162 *pos++ = '\n';
163 }
164 }
165
166 // Null terminate
167 *pos = '\0';
168
169 return output;
170}
171
// Map an RGB triple to an index in the 6x6x6 colour cube of the 256-colour
// palette: each channel is quantised to a level 0..5 (value / 51) and the cube
// starts at palette index 16.
static inline uint8_t rgb_to_256color_ssse3(uint8_t r, uint8_t g, uint8_t b) {
  const unsigned red_level = r / 51u;
  const unsigned green_level = g / 51u;
  const unsigned blue_level = b / 51u;

  // Horner form of 36 * red + 6 * green + blue.
  const unsigned cube_offset = (red_level * 6u + green_level) * 6u + blue_level;

  return (uint8_t)(16u + cube_offset);
}
176
177// Unified SSSE3 function for all color modes (full implementation like NEON)
178char *render_ascii_ssse3_unified_optimized(const image_t *image, bool use_background, bool use_256color,
179 const char *ascii_chars) {
180 if (!image || !image->pixels) {
181 return NULL;
182 }
183
184 const int width = image->w;
185 const int height = image->h;
186
187 if (width <= 0 || height <= 0) {
188 char *empty;
189 empty = SAFE_MALLOC(1, char *);
190 empty[0] = '\0';
191 return empty;
192 }
193
194 // Get cached UTF-8 character mappings for color rendering
195 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
196 if (!utf8_cache) {
197 log_error("Failed to get UTF-8 palette cache for SSSE3 color");
198 return NULL;
199 }
200
201 outbuf_t ob = {0};
202 // Estimate buffer size based on mode (copied from NEON)
203 size_t bytes_per_pixel = use_256color ? 6u : 8u; // 256-color shorter than truecolor
204
205 // Calculate buffer size with overflow checking
206 size_t height_times_width;
207 if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
208 log_error("Buffer size overflow: height * width overflow");
209 return NULL;
210 }
211
212 size_t pixel_data_size;
213 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
214 log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
215 return NULL;
216 }
217
218 size_t height_times_16;
219 if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
220 log_error("Buffer size overflow: height * 16 overflow");
221 return NULL;
222 }
223
224 size_t temp;
225 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
226 log_error("Buffer size overflow: pixel_data + height*16 overflow");
227 return NULL;
228 }
229
230 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
231 log_error("Buffer size overflow: total capacity overflow");
232 return NULL;
233 }
234 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
235 if (!ob.buf)
236 return NULL;
237
238 // Build SSSE3 lookup table for _mm_shuffle_epi8 (uses character indices)
239 __m128i char_lut = _mm_loadu_si128((__m128i *)utf8_cache->char_index_ramp); // Load first 16 indices
240
241 // Track current color state (copied from NEON)
242 int curR = -1, curG = -1, curB = -1;
243 int cur_color_idx = -1;
244
245 for (int y = 0; y < height; y++) {
246 const rgb_pixel_t *row = &((const rgb_pixel_t *)image->pixels)[y * width];
247 int x = 0;
248
249 // Process 16-pixel chunks with SSSE3 (full 128-bit register capacity)
250 while (x + 16 <= width) {
251 // Manual deinterleave RGB components (SSSE3 limitation vs NEON's vld3q_u8)
252 uint8_t r_array[16], g_array[16], b_array[16];
253 for (int j = 0; j < 16; j++) {
254 r_array[j] = row[x + j].r;
255 g_array[j] = row[x + j].g;
256 b_array[j] = row[x + j].b;
257 }
258
259 // Load into SSSE3 registers
260 __m128i r_vec = _mm_loadl_epi64((__m128i *)r_array);
261 __m128i g_vec = _mm_loadl_epi64((__m128i *)g_array);
262 __m128i b_vec = _mm_loadl_epi64((__m128i *)b_array);
263
264 // Convert to 16-bit for arithmetic
265 __m128i r_16 = _mm_unpacklo_epi8(r_vec, _mm_setzero_si128());
266 __m128i g_16 = _mm_unpacklo_epi8(g_vec, _mm_setzero_si128());
267 __m128i b_16 = _mm_unpacklo_epi8(b_vec, _mm_setzero_si128());
268
269 // Calculate luminance: (77*R + 150*G + 29*B + 128) >> 8
270 __m128i luma_r = _mm_mullo_epi16(r_16, _mm_set1_epi16(LUMA_RED));
271 __m128i luma_g = _mm_mullo_epi16(g_16, _mm_set1_epi16(LUMA_GREEN));
272 __m128i luma_b = _mm_mullo_epi16(b_16, _mm_set1_epi16(LUMA_BLUE));
273
274 __m128i luma_sum = _mm_add_epi16(luma_r, luma_g);
275 luma_sum = _mm_add_epi16(luma_sum, luma_b);
276 luma_sum = _mm_add_epi16(luma_sum, _mm_set1_epi16(LUMA_THRESHOLD));
277 luma_sum = _mm_srli_epi16(luma_sum, 8);
278
279 // Pack back to 8-bit and store
280 __m128i luminance = _mm_packus_epi16(luma_sum, _mm_setzero_si128());
281 uint8_t luma_array[8];
282 _mm_storel_epi64((__m128i *)luma_array, luminance);
283
284 // FAST: Use _mm_shuffle_epi8 to get character indices from the ramp (SSSE3 advantage)
285 __m128i luma_vec = _mm_loadl_epi64((__m128i *)luma_array); // Load 8 luminance values
286 __m128i luma_idx_vec = _mm_srli_epi16(_mm_unpacklo_epi8(luma_vec, _mm_setzero_si128()), 2); // >> 2 for 0-63
287 __m128i luma_idx_8bit = _mm_packus_epi16(luma_idx_vec, _mm_setzero_si128()); // Pack back to 8-bit
288
289 // Use _mm_shuffle_epi8 for fast character index lookup
290 __m128i char_indices_vec = _mm_shuffle_epi8(char_lut, luma_idx_8bit);
291
292 uint8_t char_indices[8];
293 _mm_storel_epi64((__m128i *)char_indices, char_indices_vec);
294
295 if (use_256color) {
296 // 256-color mode processing (copied from NEON logic)
297 uint8_t color_indices[8];
298 for (int i = 0; i < 8; i++) {
299 color_indices[i] = rgb_to_256color_ssse3(r_array[i], g_array[i], b_array[i]);
300 }
301
302 // Emit with RLE on (glyph, color) runs (copied from NEON)
303 for (int i = 0; i < 8;) {
304 const uint8_t char_idx = char_indices[i];
305 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
306 const uint8_t color_idx = color_indices[i];
307
308 int j = i + 1;
309 while (j < 8 && char_indices[j] == char_idx && color_indices[j] == color_idx) {
310 j++;
311 }
312 const uint32_t run = (uint32_t)(j - i);
313
314 if (color_idx != cur_color_idx) {
315 if (use_background) {
316 emit_set_256_color_bg(&ob, color_idx);
317 } else {
318 emit_set_256_color_fg(&ob, color_idx);
319 }
320 cur_color_idx = color_idx;
321 }
322
323 // Emit UTF-8 character from cache
324 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
325 if (rep_is_profitable(run)) {
326 emit_rep(&ob, run - 1);
327 } else {
328 for (uint32_t k = 1; k < run; k++) {
329 // Emit UTF-8 character from cache
330 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
331 }
332 }
333 i = j;
334 }
335 } else {
336 // Truecolor mode processing (copied from NEON logic)
337 for (int i = 0; i < 8;) {
338 const uint8_t char_idx = char_indices[i];
339 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
340 const uint8_t r = r_array[i];
341 const uint8_t g = g_array[i];
342 const uint8_t b = b_array[i];
343
344 int j = i + 1;
345 while (j < 8 && char_indices[j] == char_idx && r_array[j] == r && g_array[j] == g && b_array[j] == b) {
346 j++;
347 }
348 const uint32_t run = (uint32_t)(j - i);
349
350 if (r != curR || g != curG || b != curB) {
351 if (use_background) {
352 emit_set_truecolor_bg(&ob, r, g, b);
353 } else {
354 emit_set_truecolor_fg(&ob, r, g, b);
355 }
356 curR = r;
357 curG = g;
358 curB = b;
359 }
360
361 // Emit UTF-8 character from cache
362 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
363 if (rep_is_profitable(run)) {
364 emit_rep(&ob, run - 1);
365 } else {
366 for (uint32_t k = 1; k < run; k++) {
367 // Emit UTF-8 character from cache
368 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
369 }
370 }
371 i = j;
372 }
373 }
374 x += 16;
375 }
376
377 // Scalar tail for remaining pixels (copied from NEON logic)
378 for (; x < width;) {
379 const rgb_pixel_t *p = &row[x];
380 uint32_t R = p->r, G = p->g, B = p->b;
381 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
382 uint8_t luma_idx = Y >> 2;
383 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
384
385 if (use_256color) {
386 // 256-color scalar tail
387 uint8_t color_idx = rgb_to_256color_ssse3((uint8_t)R, (uint8_t)G, (uint8_t)B);
388
389 int j = x + 1;
390 while (j < width) {
391 const rgb_pixel_t *q = &row[j];
392 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
393 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
394 uint8_t color_idx2 = rgb_to_256color_ssse3((uint8_t)R2, (uint8_t)G2, (uint8_t)B2);
395 if (((Y2 >> 2) != (Y >> 2)) || color_idx2 != color_idx)
396 break;
397 j++;
398 }
399 uint32_t run = (uint32_t)(j - x);
400
401 if (color_idx != cur_color_idx) {
402 if (use_background) {
403 emit_set_256_color_bg(&ob, color_idx);
404 } else {
405 emit_set_256_color_fg(&ob, color_idx);
406 }
407 cur_color_idx = color_idx;
408 }
409
410 // Emit UTF-8 character from cache
411 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
412 if (rep_is_profitable(run)) {
413 emit_rep(&ob, run - 1);
414 } else {
415 for (uint32_t k = 1; k < run; k++) {
416 // Emit UTF-8 character from cache
417 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
418 }
419 }
420 x = j;
421 } else {
422 // Truecolor scalar tail
423 int j = x + 1;
424 while (j < width) {
425 const rgb_pixel_t *q = &row[j];
426 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
427 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
428 if (((Y2 >> 2) != (Y >> 2)) || R2 != R || G2 != G || B2 != B)
429 break;
430 j++;
431 }
432 uint32_t run = (uint32_t)(j - x);
433
434 if ((int)R != curR || (int)G != curG || (int)B != curB) {
435 if (use_background) {
437 } else {
439 }
440 curR = (int)R;
441 curG = (int)G;
442 curB = (int)B;
443 }
444
445 // Emit UTF-8 character from cache
446 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
447 if (rep_is_profitable(run)) {
448 emit_rep(&ob, run - 1);
449 } else {
450 for (uint32_t k = 1; k < run; k++) {
451 // Emit UTF-8 character from cache
452 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
453 }
454 }
455 x = j;
456 }
457 }
458
459 // End row: reset SGR, add newline (except for last row) (copied from NEON)
460 emit_reset(&ob);
461 if (y < height - 1) {
462 ob_putc(&ob, '\n');
463 }
464 curR = curG = curB = -1;
465 cur_color_idx = -1;
466 }
467
468 ob_term(&ob);
469 return ob.buf;
470}
471
/**
 * Shutdown hook for SSSE3 cache resources.
 *
 * This function frees nothing itself; it only records a debug log line.
 */
void ssse3_caches_destroy(void) {
  // SSSE3 currently uses shared caches from common.c, so no specific cleanup needed
  log_debug("SSSE3_CACHE: SSSE3 caches cleaned up");
}
477
478#endif /* SIMD_SUPPORT_SSSE3 */
SIMD-optimized ASCII conversion interface.
unsigned int uint32_t
Definition common.h:58
#define SAFE_MALLOC(size, cast)
Definition common.h:208
unsigned char uint8_t
Definition common.h:56
@ ASCIICHAT_OK
Definition error_codes.h:48
#define log_error(...)
Log an ERROR message.
#define log_debug(...)
Log a DEBUG message.
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
Definition ascii_simd.h:76
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color background ANSI sequence.
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
Definition ascii_simd.h:74
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color foreground ANSI sequence.
void ob_term(outbuf_t *ob)
Append null terminator to buffer.
#define LUMA_THRESHOLD
Luminance threshold for rounding.
Definition ascii_simd.h:78
void ob_putc(outbuf_t *ob, char c)
Append a character to buffer.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor foreground ANSI sequence.
void emit_rep(outbuf_t *ob, uint32_t extra)
Emit run-length encoded sequence.
void ob_write(outbuf_t *ob, const char *s, size_t n)
Append a string to buffer.
void emit_reset(outbuf_t *ob)
Emit ANSI reset sequence.
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor background ANSI sequence.
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
Definition ascii_simd.h:72
✅ Safe Integer Arithmetic and Overflow Detection
SSSE3-optimized ASCII rendering functions.
Image structure.
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
Dynamic output buffer (auto-expanding)
size_t cap
Buffer capacity in bytes (maximum length before reallocation)
char * buf
Buffer pointer (allocated, owned by caller, must be freed)
UTF-8 character structure.
UTF-8 palette cache structure.