18#include "../output_buffer.h"
// Render `image` as monochrome ASCII text, using 128-bit SSE integer
// intrinsics to compute the luminance of 16 pixels per inner-loop iteration.
//
// Parameters:
//   image       - source image; `image->pixels` is read as an array of
//                 rgb_pixel_t indexed row-major as pixels[y * w + x].
//   ascii_chars - palette string; must be non-NULL (checked below).
//
// NOTE(review): this listing is an excerpt. Numbered source lines between the
// visible ones are missing (including every error/return path and the output
// writing code), so the comments below describe only what is shown. The
// returned pointer's ownership is not visible here - confirm against the
// header.
26char *render_ascii_image_monochrome_ssse3(
const image_t *image,
const char *ascii_chars) {
// Guard: NULL image, NULL pixel buffer, or NULL palette string.
27 if (!image || !image->
pixels || !ascii_chars) {
31 const int h = image->
h;
32 const int w = image->
w;
// Guard: non-positive dimensions.
34 if (h <= 0 || w <= 0) {
// Reported when the UTF-8 palette cache could not be obtained (the lookup
// call itself is not visible in this excerpt).
41 log_error(
"Failed to get UTF-8 palette cache");
// Worst-case bytes per output character: a UTF-8 code point is at most 4 bytes.
46 const size_t max_char_bytes = 4;
// Overflow-checked output size: len = h * (w * max_char_bytes + 1).
// Each step uses checked_size_mul/checked_size_add and logs on failure.
// NOTE(review): the "+ 1" per row is presumably the row separator - confirm.
50 if (checked_size_mul((
size_t)w, max_char_bytes, &w_times_bytes) !=
ASCIICHAT_OK) {
51 log_error(
"Buffer size overflow: width too large for UTF-8 encoding");
55 size_t w_times_bytes_plus_one;
56 if (checked_size_add(w_times_bytes, 1, &w_times_bytes_plus_one) !=
ASCIICHAT_OK) {
57 log_error(
"Buffer size overflow: width * bytes + 1 overflow");
62 if (checked_size_mul((
size_t)h, w_times_bytes_plus_one, &len) !=
ASCIICHAT_OK) {
63 log_error(
"Buffer size overflow: height * (width * bytes + 1) overflow");
71 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->
pixels;
74 for (
int y = 0; y < h; y++) {
// NOTE(review): `y * w` is computed in int (both operands are int) before it
// is used as an index; for very large images the product could overflow even
// though the buffer size above was checked in size_t. Consider
// (size_t)y * (size_t)w - confirm against the maximum supported image size.
75 const rgb_pixel_t *row = &pixels[y * w];
// Vector path: runs while at least 16 pixels remain in the row.
79 for (; x + 15 < w; x += 16) {
// De-interleave 16 packed RGB pixels into one byte array per channel.
81 uint8_t r_array[16], g_array[16], b_array[16];
82 for (
int j = 0; j < 16; j++) {
83 r_array[j] = row[x + j].r;
84 g_array[j] = row[x + j].g;
85 b_array[j] = row[x + j].b;
// Load each channel as two 8-byte halves (pixels 0-7 and 8-15); each load
// fills the low 64 bits of the register.
89 __m128i r_vec_lo = _mm_loadl_epi64((__m128i *)(r_array + 0));
90 __m128i r_vec_hi = _mm_loadl_epi64((__m128i *)(r_array + 8));
91 __m128i g_vec_lo = _mm_loadl_epi64((__m128i *)(g_array + 0));
92 __m128i g_vec_hi = _mm_loadl_epi64((__m128i *)(g_array + 8));
93 __m128i b_vec_lo = _mm_loadl_epi64((__m128i *)(b_array + 0));
94 __m128i b_vec_hi = _mm_loadl_epi64((__m128i *)(b_array + 8));
// Pixels 0-7: zero-extend the 8 bytes to eight 16-bit lanes.
97 __m128i r_16_lo = _mm_unpacklo_epi8(r_vec_lo, _mm_setzero_si128());
98 __m128i g_16_lo = _mm_unpacklo_epi8(g_vec_lo, _mm_setzero_si128());
99 __m128i b_16_lo = _mm_unpacklo_epi8(b_vec_lo, _mm_setzero_si128());
// Fixed-point luminance:
//   (R*LUMA_RED + G*LUMA_GREEN + B*LUMA_BLUE + LUMA_THRESHOLD) >> 8
// The coefficients are documented as 77, 150 and 29 (fractions of 256), so the
// weighted sum is at most 255 * 256 = 65280 and fits an unsigned 16-bit lane
// provided LUMA_THRESHOLD (the rounding term; value not shown here) is small.
101 __m128i luma_r_lo = _mm_mullo_epi16(r_16_lo, _mm_set1_epi16(
LUMA_RED));
102 __m128i luma_g_lo = _mm_mullo_epi16(g_16_lo, _mm_set1_epi16(
LUMA_GREEN));
103 __m128i luma_b_lo = _mm_mullo_epi16(b_16_lo, _mm_set1_epi16(
LUMA_BLUE));
105 __m128i luma_sum_lo = _mm_add_epi16(luma_r_lo, luma_g_lo);
106 luma_sum_lo = _mm_add_epi16(luma_sum_lo, luma_b_lo);
107 luma_sum_lo = _mm_add_epi16(luma_sum_lo, _mm_set1_epi16(
LUMA_THRESHOLD));
108 luma_sum_lo = _mm_srli_epi16(luma_sum_lo, 8);
// Pixels 8-15: identical computation on the second half.
111 __m128i r_16_hi = _mm_unpacklo_epi8(r_vec_hi, _mm_setzero_si128());
112 __m128i g_16_hi = _mm_unpacklo_epi8(g_vec_hi, _mm_setzero_si128());
113 __m128i b_16_hi = _mm_unpacklo_epi8(b_vec_hi, _mm_setzero_si128());
115 __m128i luma_r_hi = _mm_mullo_epi16(r_16_hi, _mm_set1_epi16(
LUMA_RED));
116 __m128i luma_g_hi = _mm_mullo_epi16(g_16_hi, _mm_set1_epi16(
LUMA_GREEN));
117 __m128i luma_b_hi = _mm_mullo_epi16(b_16_hi, _mm_set1_epi16(
LUMA_BLUE));
119 __m128i luma_sum_hi = _mm_add_epi16(luma_r_hi, luma_g_hi);
120 luma_sum_hi = _mm_add_epi16(luma_sum_hi, luma_b_hi);
121 luma_sum_hi = _mm_add_epi16(luma_sum_hi, _mm_set1_epi16(
LUMA_THRESHOLD));
122 luma_sum_hi = _mm_srli_epi16(luma_sum_hi, 8);
// Narrow the 16-bit results back to bytes (unsigned saturation) and store the
// two 8-byte halves into luma_array[0..15].
125 __m128i luminance_lo = _mm_packus_epi16(luma_sum_lo, _mm_setzero_si128());
126 __m128i luminance_hi = _mm_packus_epi16(luma_sum_hi, _mm_setzero_si128());
129 _mm_storel_epi64((__m128i *)(luma_array + 0), luminance_lo);
130 _mm_storel_epi64((__m128i *)(luma_array + 8), luminance_hi);
// Per-pixel pass over the 16 luminance values (loop body not shown).
132 for (
int j = 0; j < 16; j++) {
// Single-pixel read at row[x].
// NOTE(review): presumably the scalar tail for the last (w % 16) pixels of the
// row - the enclosing loop is not visible; confirm.
147 const rgb_pixel_t pixel = row[x];
// Map r/g/b to a palette index of the form 16 + 36*R + 6*G + B, where each
// channel is quantized by integer division by 51 (0..5 for inputs 0..255),
// giving results in 16..231. This matches the layout of the 6x6x6 colour cube
// in the xterm 256-colour palette.
// NOTE(review): the enclosing function header is not visible in this excerpt;
// presumably this is the body of rgb_to_256color_ssse3 (called by the colour
// renderer) with 8-bit channel arguments - confirm.
174 return (
uint8_t)(16 + 36 * (r / 51) + 6 * (g / 51) + (b / 51));
// Render `image` as coloured ASCII text. Luminance (for character selection)
// is computed with 128-bit SSE integer intrinsics; colour escape sequences are
// emitted only when the colour changes, and identical neighbouring cells are
// grouped into runs.
//
// Parameters:
//   image          - source image; pixels are read row-major as rgb_pixel_t.
//   use_background - tested before each colour emission; presumably selects
//                    background vs. foreground escape sequences (the branch
//                    bodies are not visible - confirm).
//   use_256color   - selects the per-pixel size estimate (6 vs. 8 bytes) and,
//                    presumably, the 256-colour vs. truecolour code path.
//   ascii_chars    - palette string.
//
// NOTE(review): this listing is an excerpt. Numbered source lines between the
// visible ones are missing (error returns, buffer allocation, output writes,
// loop increments), so the comments describe only what is shown.
178char *render_ascii_ssse3_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
179 const char *ascii_chars) {
// Guard: NULL image or NULL pixel buffer.
// NOTE(review): unlike render_ascii_image_monochrome_ssse3, `ascii_chars` is
// not NULL-checked here; confirm the palette-cache lookup tolerates NULL.
180 if (!image || !image->
pixels) {
184 const int width = image->
w;
185 const int height = image->
h;
// Guard: non-positive dimensions.
187 if (width <= 0 || height <= 0) {
// Reported when the UTF-8 palette cache could not be obtained (the lookup
// call itself is not visible in this excerpt).
197 log_error(
"Failed to get UTF-8 palette cache for SSSE3 color");
// Estimated output bytes per pixel for the chosen colour mode.
203 size_t bytes_per_pixel = use_256color ? 6u : 8u;
// Overflow-checked capacity estimate:
//   height * width * bytes_per_pixel + height * 16 (+ a final term whose
//   computation is not visible; only its error message is shown).
206 size_t height_times_width;
207 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) !=
ASCIICHAT_OK) {
208 log_error(
"Buffer size overflow: height * width overflow");
212 size_t pixel_data_size;
213 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) !=
ASCIICHAT_OK) {
214 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
218 size_t height_times_16;
219 if (checked_size_mul((
size_t)height, 16u, &height_times_16) !=
ASCIICHAT_OK) {
220 log_error(
"Buffer size overflow: height * 16 overflow");
225 if (checked_size_add(pixel_data_size, height_times_16, &temp) !=
ASCIICHAT_OK) {
226 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
231 log_error(
"Buffer size overflow: total capacity overflow");
// Load the first 16 bytes of the palette's char_index_ramp as a byte lookup
// table for _mm_shuffle_epi8 (the ramp is declared as uint8_t[64]).
239 __m128i char_lut = _mm_loadu_si128((__m128i *)utf8_cache->
char_index_ramp);
// Last colour emitted to the output; -1 means "none yet", which forces the
// first cell to emit a colour sequence.
242 int curR = -1, curG = -1, curB = -1;
243 int cur_color_idx = -1;
245 for (
int y = 0; y < height; y++) {
// NOTE(review): `y * width` is computed in int before indexing; for very large
// images the product could overflow even though the capacity above was checked
// in size_t. Consider (size_t)y * (size_t)width - confirm.
246 const rgb_pixel_t *row = &((
const rgb_pixel_t *)image->
pixels)[y * width];
// Vector path: entered while at least 16 pixels remain in the row.
250 while (x + 16 <= width) {
// De-interleave 16 packed RGB pixels into one byte array per channel.
252 uint8_t r_array[16], g_array[16], b_array[16];
253 for (
int j = 0; j < 16; j++) {
254 r_array[j] = row[x + j].r;
255 g_array[j] = row[x + j].g;
256 b_array[j] = row[x + j].b;
// NOTE(review): _mm_loadl_epi64 loads only 8 bytes, and every loop below runs
// i < 8, so just the first 8 of the 16 copied pixels are processed per
// iteration. The increment of x is not visible in this excerpt - confirm it
// advances by 8 (otherwise 8 pixels per iteration are skipped); if it does,
// the copy above and the loop condition could use 8 instead of 16.
260 __m128i r_vec = _mm_loadl_epi64((__m128i *)r_array);
261 __m128i g_vec = _mm_loadl_epi64((__m128i *)g_array);
262 __m128i b_vec = _mm_loadl_epi64((__m128i *)b_array);
// Zero-extend the 8 bytes of each channel to eight 16-bit lanes.
265 __m128i r_16 = _mm_unpacklo_epi8(r_vec, _mm_setzero_si128());
266 __m128i g_16 = _mm_unpacklo_epi8(g_vec, _mm_setzero_si128());
267 __m128i b_16 = _mm_unpacklo_epi8(b_vec, _mm_setzero_si128());
// Fixed-point luminance:
//   (R*LUMA_RED + G*LUMA_GREEN + B*LUMA_BLUE + LUMA_THRESHOLD) >> 8
// with coefficients documented as 77, 150 and 29 (fractions of 256).
270 __m128i luma_r = _mm_mullo_epi16(r_16, _mm_set1_epi16(
LUMA_RED));
271 __m128i luma_g = _mm_mullo_epi16(g_16, _mm_set1_epi16(
LUMA_GREEN));
272 __m128i luma_b = _mm_mullo_epi16(b_16, _mm_set1_epi16(
LUMA_BLUE));
274 __m128i luma_sum = _mm_add_epi16(luma_r, luma_g);
275 luma_sum = _mm_add_epi16(luma_sum, luma_b);
276 luma_sum = _mm_add_epi16(luma_sum, _mm_set1_epi16(
LUMA_THRESHOLD));
277 luma_sum = _mm_srli_epi16(luma_sum, 8);
// Narrow to 8-bit luminance and spill the 8 values to luma_array.
280 __m128i luminance = _mm_packus_epi16(luma_sum, _mm_setzero_si128());
282 _mm_storel_epi64((__m128i *)luma_array, luminance);
// Character lookup: luminance >> 2 gives a 6-bit ramp index (0..63).
// NOTE(review): _mm_shuffle_epi8 selects from a 16-byte table using only the
// low 4 bits of each index byte (and yields 0 when bit 7 is set). With a
// 6-bit index, values 16..63 therefore alias table entries 0..15 instead of
// reading char_index_ramp[16..63]. The scalar path compares Y >> 2 directly,
// so the two paths may pick different characters - verify, and consider a
// scalar lookup into the 64-entry ramp or four 16-byte tables.
285 __m128i luma_vec = _mm_loadl_epi64((__m128i *)luma_array);
286 __m128i luma_idx_vec = _mm_srli_epi16(_mm_unpacklo_epi8(luma_vec, _mm_setzero_si128()), 2);
287 __m128i luma_idx_8bit = _mm_packus_epi16(luma_idx_vec, _mm_setzero_si128());
290 __m128i char_indices_vec = _mm_shuffle_epi8(char_lut, luma_idx_8bit);
293 _mm_storel_epi64((__m128i *)char_indices, char_indices_vec);
// 256-colour variant (presumably taken when use_256color is set; the selecting
// condition is not visible): quantize each of the 8 pixels to a palette index.
298 for (
int i = 0; i < 8; i++) {
299 color_indices[i] = rgb_to_256color_ssse3(r_array[i], g_array[i], b_array[i]);
// Walk the 8 cells in runs: a run extends while both the character index and
// the palette index stay the same. The loop has no increment clause; the
// advance of i is not visible here.
303 for (
int i = 0; i < 8;) {
304 const uint8_t char_idx = char_indices[i];
306 const uint8_t color_idx = color_indices[i];
309 while (j < 8 && char_indices[j] == char_idx && color_indices[j] == color_idx) {
// Emit a colour sequence only when the palette index differs from the last
// one emitted, then remember it.
314 if (color_idx != cur_color_idx) {
315 if (use_background) {
320 cur_color_idx = color_idx;
// Handles the remaining (run - 1) cells of the run (body not shown).
328 for (
uint32_t k = 1; k < run; k++) {
// Truecolour variant of the same run walk: a run extends while the character
// index and the exact r/g/b values stay the same.
337 for (
int i = 0; i < 8;) {
338 const uint8_t char_idx = char_indices[i];
345 while (j < 8 && char_indices[j] == char_idx && r_array[j] == r && g_array[j] == g && b_array[j] == b) {
// Emit a colour sequence only when r/g/b differ from the last colour emitted.
350 if (r != curR || g != curG || b != curB) {
351 if (use_background) {
366 for (
uint32_t k = 1; k < run; k++) {
// Scalar path, one pixel at a time starting at row[x].
// NOTE(review): presumably handles the pixels left over after the vector loop;
// the enclosing loop is not visible - confirm.
379 const rgb_pixel_t *p = &row[x];
380 uint32_t R = p->r, G = p->g, B = p->b;
// 256-colour scalar run detection: look ahead at row[j] and stop the run when
// either the 6-bit luminance bucket (Y >> 2) or the palette index changes.
391 const rgb_pixel_t *q = &row[j];
392 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
395 if (((Y2 >> 2) != (Y >> 2)) || color_idx2 != color_idx)
401 if (color_idx != cur_color_idx) {
402 if (use_background) {
407 cur_color_idx = color_idx;
415 for (
uint32_t k = 1; k < run; k++) {
// Truecolour scalar run detection: stop the run when the luminance bucket or
// any of the three channels changes.
425 const rgb_pixel_t *q = &row[j];
426 uint32_t R2 = q->r, G2 = q->g, B2 = q->b;
428 if (((Y2 >> 2) != (Y >> 2)) || R2 != R || G2 != G || B2 != B)
434 if ((
int)R != curR || (int)G != curG || (
int)B != curB) {
435 if (use_background) {
450 for (
uint32_t k = 1; k < run; k++) {
// End-of-row handling for every row except the last.
461 if (y < height - 1) {
// Forget the cached truecolour state so the next cell re-emits its colour.
// NOTE(review): the statements around this line (and whether it sits inside
// the `if` above) are not visible in this excerpt.
464 curR = curG = curB = -1;
// Cleanup hook for the SSSE3 renderer's caches. Takes no arguments and returns
// nothing.
// NOTE(review): the only statement visible in this excerpt is the debug log
// below; no cache teardown is shown here - confirm whether any SSSE3-specific
// state actually needs releasing.
473void ssse3_caches_destroy(
void) {
475 log_debug(
"SSSE3_CACHE: SSSE3 caches cleaned up");
SIMD-optimized ASCII conversion interface.
#define SAFE_MALLOC(size, cast)
#define log_error(...)
Log an ERROR message.
#define log_debug(...)
Log a DEBUG message.
#define LUMA_BLUE
Luminance blue coefficient (0.114 * 256 = 29)
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color background ANSI sequence.
#define LUMA_GREEN
Luminance green coefficient (0.587 * 256 = 150)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)
Get or create UTF-8 palette cache.
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
Emit 256-color foreground ANSI sequence.
void ob_term(outbuf_t *ob)
Append null terminator to buffer.
#define LUMA_THRESHOLD
Luminance threshold for rounding.
void ob_putc(outbuf_t *ob, char c)
Append a character to buffer.
bool rep_is_profitable(uint32_t runlen)
Check if run-length encoding is profitable.
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor foreground ANSI sequence.
void emit_rep(outbuf_t *ob, uint32_t extra)
Emit run-length encoded sequence.
uint8_t char_index_ramp[64]
void ob_write(outbuf_t *ob, const char *s, size_t n)
Append a string to buffer.
void emit_reset(outbuf_t *ob)
Emit ANSI reset sequence.
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
Emit truecolor background ANSI sequence.
#define LUMA_RED
Luminance red coefficient (0.299 * 256 = 77)
✅ Safe Integer Arithmetic and Overflow Detection
SSSE3-optimized ASCII rendering functions.
int w
Image width in pixels (must be > 0)
int h
Image height in pixels (must be > 0)
rgb_pixel_t * pixels
Pixel data array (width * height RGB pixels, row-major order)
Dynamic output buffer (auto-expanding)
size_t cap
Buffer capacity in bytes (maximum length before reallocation)
char * buf
Buffer pointer (allocated, owned by caller, must be freed)
UTF-8 character structure.
UTF-8 palette cache structure.