ascii-chat 0.6.0
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
sse2.c
Go to the documentation of this file.
1
7#if SIMD_SUPPORT_SSE2
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <stdint.h>
12
13#include <emmintrin.h>
14
15#include "sse2.h"
16#include "ascii_simd.h"
17#include "common.h"
18#include "../output_buffer.h"
19#include "util/overflow.h"
20
21//=============================================================================
22// Image-based API (matches NEON architecture)
23//=============================================================================
24
25// Simple monochrome ASCII function (matches scalar image_print performance)
26char *render_ascii_image_monochrome_sse2(const image_t *image, const char *ascii_chars) {
27 if (!image || !image->pixels || !ascii_chars) {
28 return NULL;
29 }
30
31 const int h = image->h;
32 const int w = image->w;
33
34 if (h <= 0 || w <= 0) {
35 return NULL;
36 }
37
38 // Get cached UTF-8 character mappings
39 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
40 if (!utf8_cache) {
41 log_error("Failed to get UTF-8 palette cache");
42 return NULL;
43 }
44
45 // Buffer size for UTF-8 characters
46 const size_t max_char_bytes = 4;
47 const size_t len = (size_t)h * ((size_t)w * max_char_bytes + 1);
48
49 char *output;
50 output = SAFE_MALLOC(len, char *);
51
52 char *pos = output;
53 const rgb_pixel_t *pixels = (const rgb_pixel_t *)image->pixels;
54
55 // Pure SSE2 processing - matches NEON approach
56 for (int y = 0; y < h; y++) {
57 const rgb_pixel_t *row = &pixels[y * w];
58 int x = 0;
59
60 // Process 16 pixels at a time with SSE2 (full 128-bit register capacity)
61 for (; x + 15 < w; x += 16) {
62 // Manual deinterleave RGB components (SSE2 limitation vs NEON's vld3q_u8)
63 uint8_t r_array[16], g_array[16], b_array[16];
64 for (int j = 0; j < 16; j++) {
65 r_array[j] = row[x + j].r;
66 g_array[j] = row[x + j].g;
67 b_array[j] = row[x + j].b;
68 }
69
70 // Load full 16 bytes into SSE2 registers (process in two 8-pixel batches)
71 __m128i r_vec_lo = _mm_loadl_epi64((__m128i *)(r_array + 0)); // First 8 pixels
72 __m128i r_vec_hi = _mm_loadl_epi64((__m128i *)(r_array + 8)); // Second 8 pixels
73 __m128i g_vec_lo = _mm_loadl_epi64((__m128i *)(g_array + 0));
74 __m128i g_vec_hi = _mm_loadl_epi64((__m128i *)(g_array + 8));
75 __m128i b_vec_lo = _mm_loadl_epi64((__m128i *)(b_array + 0));
76 __m128i b_vec_hi = _mm_loadl_epi64((__m128i *)(b_array + 8));
77
78 // Process first 8 pixels
79 __m128i r_16_lo = _mm_unpacklo_epi8(r_vec_lo, _mm_setzero_si128());
80 __m128i g_16_lo = _mm_unpacklo_epi8(g_vec_lo, _mm_setzero_si128());
81 __m128i b_16_lo = _mm_unpacklo_epi8(b_vec_lo, _mm_setzero_si128());
82
83 __m128i luma_r_lo = _mm_mullo_epi16(r_16_lo, _mm_set1_epi16(77));
84 __m128i luma_g_lo = _mm_mullo_epi16(g_16_lo, _mm_set1_epi16(150));
85 __m128i luma_b_lo = _mm_mullo_epi16(b_16_lo, _mm_set1_epi16(29));
86
87 __m128i luma_sum_lo = _mm_add_epi16(luma_r_lo, luma_g_lo);
88 luma_sum_lo = _mm_add_epi16(luma_sum_lo, luma_b_lo);
89 luma_sum_lo = _mm_add_epi16(luma_sum_lo, _mm_set1_epi16(128));
90 luma_sum_lo = _mm_srli_epi16(luma_sum_lo, 8);
91
92 // Process second 8 pixels
93 __m128i r_16_hi = _mm_unpacklo_epi8(r_vec_hi, _mm_setzero_si128());
94 __m128i g_16_hi = _mm_unpacklo_epi8(g_vec_hi, _mm_setzero_si128());
95 __m128i b_16_hi = _mm_unpacklo_epi8(b_vec_hi, _mm_setzero_si128());
96
97 __m128i luma_r_hi = _mm_mullo_epi16(r_16_hi, _mm_set1_epi16(77));
98 __m128i luma_g_hi = _mm_mullo_epi16(g_16_hi, _mm_set1_epi16(150));
99 __m128i luma_b_hi = _mm_mullo_epi16(b_16_hi, _mm_set1_epi16(29));
100
101 __m128i luma_sum_hi = _mm_add_epi16(luma_r_hi, luma_g_hi);
102 luma_sum_hi = _mm_add_epi16(luma_sum_hi, luma_b_hi);
103 luma_sum_hi = _mm_add_epi16(luma_sum_hi, _mm_set1_epi16(128));
104 luma_sum_hi = _mm_srli_epi16(luma_sum_hi, 8);
105
106 // Pack both halves to 8-bit
107 __m128i luminance_lo = _mm_packus_epi16(luma_sum_lo, _mm_setzero_si128());
108 __m128i luminance_hi = _mm_packus_epi16(luma_sum_hi, _mm_setzero_si128());
109
110 // Store and convert to ASCII characters
111 uint8_t luma_array[16];
112 _mm_storel_epi64((__m128i *)(luma_array + 0), luminance_lo);
113 _mm_storel_epi64((__m128i *)(luma_array + 8), luminance_hi);
114
115 // Convert luminance to UTF-8 characters using optimized mappings
116 for (int j = 0; j < 16; j++) {
117 const utf8_char_t *char_info = &utf8_cache->cache[luma_array[j]];
118 // Optimized: Use direct assignment for single-byte ASCII characters
119 if (char_info->byte_len == 1) {
120 *pos++ = char_info->utf8_bytes[0];
121 } else {
122 // Fallback to full memcpy for multi-byte UTF-8
123 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
124 pos += char_info->byte_len;
125 }
126 }
127 }
128
129 // Handle remaining pixels with optimized scalar code
130 for (; x < w; x++) {
131 const rgb_pixel_t pixel = row[x];
132 const int luminance = (LUMA_RED * pixel.r + LUMA_GREEN * pixel.g + LUMA_BLUE * pixel.b + LUMA_THRESHOLD) >> 8;
133 const utf8_char_t *char_info = &utf8_cache->cache[luminance];
134 // Optimized: Use direct assignment for single-byte ASCII characters
135 if (char_info->byte_len == 1) {
136 *pos++ = char_info->utf8_bytes[0];
137 } else {
138 // Fallback to full memcpy for multi-byte UTF-8
139 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
140 pos += char_info->byte_len;
141 }
142 }
143
144 // Add newline (except for last row)
145 if (y < h - 1) {
146 *pos++ = '\n';
147 }
148 }
149
150 // Null terminate
151 *pos = '\0';
152
153 return output;
154}
155
/**
 * @brief Map an 8-bit RGB triple to an index in the ANSI 256-colour palette.
 *
 * Each channel is quantised to a level 0-5 by integer division by 51, and the
 * three levels are combined as 16 + 36*R + 6*G + B, so the result always lies
 * in 16..231.
 *
 * @param r Red component (0-255).
 * @param g Green component (0-255).
 * @param b Blue component (0-255).
 * @return Palette index in the range 16..231.
 */
static inline uint8_t rgb_to_256color_sse2(uint8_t r, uint8_t g, uint8_t b) {
  const unsigned red_level = r / 51u;
  const unsigned green_level = g / 51u;
  const unsigned blue_level = b / 51u;
  const unsigned cube_index = 16u + red_level * 36u + green_level * 6u + blue_level;
  return (uint8_t)cube_index;
}
160
161// Unified SSE2 function for all color modes (full implementation like NEON)
162char *render_ascii_sse2_unified_optimized(const image_t *image, bool use_background, bool use_256color,
163 const char *ascii_chars) {
164 if (!image || !image->pixels) {
165 return NULL;
166 }
167
168 const int width = image->w;
169 const int height = image->h;
170
171 if (width <= 0 || height <= 0) {
172 char *empty;
173 empty = SAFE_MALLOC(1, char *);
174 empty[0] = '\0';
175 return empty;
176 }
177
178 outbuf_t ob = {0};
179 // Estimate buffer size based on mode (copied from NEON)
180 size_t bytes_per_pixel = use_256color ? 6u : 8u; // 256-color shorter than truecolor
181
182 // Calculate buffer size with overflow checking
183 size_t height_times_width;
184 if (checked_size_mul((size_t)height, (size_t)width, &height_times_width) != ASCIICHAT_OK) {
185 log_error("Buffer size overflow: height * width overflow");
186 return NULL;
187 }
188
189 size_t pixel_data_size;
190 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
191 log_error("Buffer size overflow: (height * width) * bytes_per_pixel overflow");
192 return NULL;
193 }
194
195 size_t height_times_16;
196 if (checked_size_mul((size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
197 log_error("Buffer size overflow: height * 16 overflow");
198 return NULL;
199 }
200
201 size_t temp;
202 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
203 log_error("Buffer size overflow: pixel_data + height*16 overflow");
204 return NULL;
205 }
206
207 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
208 log_error("Buffer size overflow: total capacity overflow");
209 return NULL;
210 }
211
212 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
213 if (!ob.buf)
214 return NULL;
215
216 // Get cached UTF-8 character mappings for color rendering
217 utf8_palette_cache_t *utf8_cache = get_utf8_palette_cache(ascii_chars);
218 if (!utf8_cache) {
219 log_error("Failed to get UTF-8 palette cache for SSE2 color");
220 SAFE_FREE(ob.buf);
221 return NULL;
222 }
223
224 // SSE2 doesn't have _mm_shuffle_epi8 (introduced in SSSE3), so use scalar UTF-8 cache lookup
225 // This is still much faster than the old approach since UTF-8 parsing is cached
226
227 // Track current color state (copied from NEON)
228 int curR = -1, curG = -1, curB = -1;
229 int cur_color_idx = -1;
230
231 for (int y = 0; y < height; y++) {
232 const rgb_pixel_t *row = &((const rgb_pixel_t *)image->pixels)[y * width];
233 int x = 0;
234
235 // Process 16-pixel chunks with SSE2 (full 128-bit register capacity)
236 while (x + 16 <= width) {
237 // Manual deinterleave RGB components (SSE2 limitation vs NEON's vld3q_u8)
238 uint8_t r_array[16], g_array[16], b_array[16];
239 for (int j = 0; j < 16; j++) {
240 r_array[j] = row[x + j].r;
241 g_array[j] = row[x + j].g;
242 b_array[j] = row[x + j].b;
243 }
244
245 // Load into SSE2 registers
246 __m128i r_vec = _mm_loadl_epi64((__m128i *)r_array);
247 __m128i g_vec = _mm_loadl_epi64((__m128i *)g_array);
248 __m128i b_vec = _mm_loadl_epi64((__m128i *)b_array);
249
250 // Convert to 16-bit for arithmetic
251 __m128i r_16 = _mm_unpacklo_epi8(r_vec, _mm_setzero_si128());
252 __m128i g_16 = _mm_unpacklo_epi8(g_vec, _mm_setzero_si128());
253 __m128i b_16 = _mm_unpacklo_epi8(b_vec, _mm_setzero_si128());
254
255 // Calculate luminance: (77*R + 150*G + 29*B + 128) >> 8
256 __m128i luma_r = _mm_mullo_epi16(r_16, _mm_set1_epi16(LUMA_RED));
257 __m128i luma_g = _mm_mullo_epi16(g_16, _mm_set1_epi16(LUMA_GREEN));
258 __m128i luma_b = _mm_mullo_epi16(b_16, _mm_set1_epi16(LUMA_BLUE));
259
260 __m128i luma_sum = _mm_add_epi16(luma_r, luma_g);
261 luma_sum = _mm_add_epi16(luma_sum, luma_b);
262 luma_sum = _mm_add_epi16(luma_sum, _mm_set1_epi16(LUMA_THRESHOLD));
263 luma_sum = _mm_srli_epi16(luma_sum, 8);
264
265 // Pack back to 8-bit and store
266 __m128i luminance = _mm_packus_epi16(luma_sum, _mm_setzero_si128());
267 uint8_t luma_array[8];
268 _mm_storel_epi64((__m128i *)luma_array, luminance);
269
270 // Convert to UTF-8 character indices using cached mappings
271 uint8_t char_indices[8];
272 for (int i = 0; i < 8; i++) {
273 const uint8_t luma_idx = luma_array[i] >> 2; // 0-63 index
274 char_indices[i] = luma_idx; // Direct index into cache64
275 }
276
277 if (use_256color) {
278 // 256-color mode processing (copied from NEON logic)
279 uint8_t color_indices[8];
280 for (int i = 0; i < 8; i++) {
281 color_indices[i] = rgb_to_256color_sse2(r_array[i], g_array[i], b_array[i]);
282 }
283
284 // Emit with RLE on (UTF-8 character, color) runs
285 for (int i = 0; i < 8;) { // SSE2 processes 8 pixels, not 16
286 const uint8_t char_idx = char_indices[i];
287 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
288 const uint8_t color_idx = color_indices[i];
289
290 int j = i + 1;
291 while (j < 8 && char_indices[j] == char_idx && color_indices[j] == color_idx) {
292 j++;
293 }
294 const uint32_t run = (uint32_t)(j - i);
295
296 if (color_idx != cur_color_idx) {
297 if (use_background) {
298 emit_set_256_color_bg(&ob, color_idx);
299 } else {
300 emit_set_256_color_fg(&ob, color_idx);
301 }
302 cur_color_idx = color_idx;
303 }
304
305 // Emit UTF-8 character from cache
306 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
307 if (rep_is_profitable(run)) {
308 emit_rep(&ob, run - 1);
309 } else {
310 for (uint32_t k = 1; k < run; k++) {
311 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
312 }
313 }
314 i = j;
315 }
316 } else {
317 // Truecolor mode processing with UTF-8 characters
318 for (int i = 0; i < 8;) { // SSE2 processes 8 pixels
319 const uint8_t char_idx = char_indices[i];
320 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
321 const uint8_t r = r_array[i];
322 const uint8_t g = g_array[i];
323 const uint8_t b = b_array[i];
324
325 int j = i + 1;
326 while (j < 8 && char_indices[j] == char_idx && r_array[j] == r && g_array[j] == g && b_array[j] == b) {
327 j++;
328 }
329 const uint32_t run = (uint32_t)(j - i);
330
331 if (r != curR || g != curG || b != curB) {
332 if (use_background) {
333 emit_set_truecolor_bg(&ob, r, g, b);
334 } else {
335 emit_set_truecolor_fg(&ob, r, g, b);
336 }
337 curR = r;
338 curG = g;
339 curB = b;
340 }
341
342 // Emit UTF-8 character from cache
343 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
344 if (rep_is_profitable(run)) {
345 emit_rep(&ob, run - 1);
346 } else {
347 for (uint32_t k = 1; k < run; k++) {
348 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
349 }
350 }
351 i = j;
352 }
353 }
354 x += 16;
355 }
356
357 // Scalar tail for remaining pixels (copied from NEON logic)
358 for (; x < width;) {
359 const rgb_pixel_t *p = &row[x];
360 uint32_t R = p->r, G = p->g, B = p->b;
361 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
362 uint8_t luma_idx = Y >> 2;
363 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
364
365 if (use_256color) {
366 // 256-color scalar tail with UTF-8
367 uint8_t color_idx = rgb_to_256color_sse2((uint8_t)R, (uint8_t)G, (uint8_t)B);
368
369 int j = x + 1;
370 while (j < width) {
371 const rgb_pixel_t *q = &row[j];
372 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
373 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
374 uint8_t luma_idx2 = Y2 >> 2;
375 uint8_t color_idx2 = rgb_to_256color_sse2((uint8_t)R2, (uint8_t)G2, (uint8_t)B2);
376 if (luma_idx2 != luma_idx || color_idx2 != color_idx)
377 break;
378 j++;
379 }
380 uint32_t run = (uint32_t)(j - x);
381
382 if (color_idx != cur_color_idx) {
383 if (use_background) {
384 emit_set_256_color_bg(&ob, color_idx);
385 } else {
386 emit_set_256_color_fg(&ob, color_idx);
387 }
388 cur_color_idx = color_idx;
389 }
390
391 // Emit UTF-8 character from cache
392 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
393 if (rep_is_profitable(run)) {
394 emit_rep(&ob, run - 1);
395 } else {
396 for (uint32_t k = 1; k < run; k++) {
397 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
398 }
399 }
400 x = j;
401 } else {
402 // Truecolor scalar tail with UTF-8
403 int j = x + 1;
404 while (j < width) {
405 const rgb_pixel_t *q = &row[j];
406 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
407 uint8_t Y2 = (uint8_t)((LUMA_RED * R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
408 uint8_t luma_idx2 = Y2 >> 2;
409 if (luma_idx2 != luma_idx || R2 != R || G2 != G || B2 != B)
410 break;
411 j++;
412 }
413 uint32_t run = (uint32_t)(j - x);
414
415 if ((int)R != curR || (int)G != curG || (int)B != curB) {
416 if (use_background) {
418 } else {
420 }
421 curR = (int)R;
422 curG = (int)G;
423 curB = (int)B;
424 }
425
426 // Emit UTF-8 character from cache
427 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
428 if (rep_is_profitable(run)) {
429 emit_rep(&ob, run - 1);
430 } else {
431 for (uint32_t k = 1; k < run; k++) {
432 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
433 }
434 }
435 x = j;
436 }
437 }
438
439 // End row: reset SGR, add newline (except for last row) (copied from NEON)
440 emit_reset(&ob);
441 if (y < height - 1) {
442 ob_putc(&ob, '\n');
443 }
444 curR = curG = curB = -1;
445 cur_color_idx = -1;
446 }
447
448 ob_term(&ob);
449 return ob.buf;
450}
451
/**
 * @brief Shutdown hook for SSE2-specific cache resources.
 *
 * The SSE2 renderers in this file only use the palette cache returned by
 * get_utf8_palette_cache() and keep no state of their own, so this function
 * releases nothing and merely records a debug log line.
 */
void sse2_caches_destroy(void) {
  // No SSE2-owned allocations exist; only the log entry is produced.
  log_debug("SSE2_CACHE: SSE2 caches cleaned up");
}
457
458#endif /* SIMD_SUPPORT_SSE2 */
SIMD-optimized ASCII conversion interface.
unsigned int uint32_t
Definition common.h:58
#define SAFE_FREE(ptr)
Definition common.h:320
#define SAFE_MALLOC(size, cast)
Definition common.h:208
unsigned char uint8_t
Definition common.h:56
@ ASCIICHAT_OK
Definition error_codes.h:48
#define log_error(...)
Log an ERROR message.
#define log_debug(...)
Log a DEBUG message.
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
Definition ascii_simd.h:76
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color background ANSI sequence.
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
Definition ascii_simd.h:74
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color foreground ANSI sequence.
void ob_term(outbuf_t *ob)
Append null terminator to buffer.
#define LUMA_THRESHOLD
Luminance threshold for rounding.
Definition ascii_simd.h:78
void ob_putc(outbuf_t *ob, char c)
Append a character to buffer.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor foreground ANSI sequence.
void emit_rep(outbuf_t *ob, uint32_t extra)
Emit run-length encoded sequence.
void ob_write(outbuf_t *ob, const char *s, size_t n)
Append a string to buffer.
void emit_reset(outbuf_t *ob)
Emit ANSI reset sequence.
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor background ANSI sequence.
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
Definition ascii_simd.h:72
✅ Safe Integer Arithmetic and Overflow Detection
SSE2-optimized ASCII rendering functions.
Image structure.
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
Dynamic output buffer (auto-expanding)
size_t cap
Buffer capacity in bytes (maximum length before reallocation)
char * buf
Buffer pointer (allocated, owned by caller, must be freed)
UTF-8 character structure.
UTF-8 palette cache structure.