15#include <ascii-chat/video/simd/sse2.h>
16#include <ascii-chat/video/simd/ascii_simd.h>
17#include <ascii-chat/common.h>
18#include <ascii-chat/output_buffer.h>
19#include <ascii-chat/util/overflow.h>
/**
 * Render an image as uncoloured text, one glyph per pixel, computing the
 * luminance of 16 pixels per inner-loop iteration with SSE2 integer math.
 *
 * @param image        Source image; `image->pixels` is read as `rgb_pixel_t`
 *                     with `image->w` pixels per row and `image->h` rows.
 * @param ascii_chars  Palette string; must be non-NULL (checked below).
 * @return             A `char *` buffer obtained from SAFE_MALLOC.
 *                     NOTE(review): the return statements and the failure
 *                     return value are not visible in this excerpt; presumably
 *                     the caller owns and frees the buffer - confirm.
 */
26char *render_ascii_image_monochrome_sse2(
const image_t *image,
const char *ascii_chars) {
 // Argument guard: image, its pixel buffer and the palette are all required.
27 if (!image || !image->pixels || !ascii_chars) {
31 const int h = image->h;
32 const int w = image->w;
 // Non-positive dimensions are handled separately from the main path.
34 if (h <= 0 || w <= 0) {
 // Error report for a missing UTF-8 palette cache (`utf8_cache`, used below).
 // NOTE(review): the lookup itself is outside this excerpt - presumably
 // get_utf8_palette_cache(ascii_chars); confirm.
41 log_error(
"Failed to get UTF-8 palette cache");
 // Worst-case output size: every glyph may need up to 4 bytes, plus one extra
 // byte per row.
 // NOTE(review): this product is not overflow-checked, whereas
 // render_ascii_sse2_unified_optimized in this file uses checked_size_mul /
 // checked_size_add for its buffer size - consider aligning the two.
46 const size_t max_char_bytes = 4;
47 const size_t len = (size_t)h * ((
size_t)w * max_char_bytes + 1);
50 output = SAFE_MALLOC(len,
char *);
53 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->pixels;
56 for (
int y = 0; y < h; y++) {
 // NOTE(review): `y * w` is evaluated in `int` before indexing; for very
 // large images this could overflow - confirm dimensions are bounded upstream.
57 const rgb_pixel_t *row = &pixels[y * w];
 // Vector path: runs while at least 16 pixels remain in the row.
61 for (; x + 15 < w; x += 16) {
 // Split the interleaved pixels into one byte array per channel so each
 // channel can be loaded into a register on its own.
63 uint8_t r_array[16], g_array[16], b_array[16];
64 for (
int j = 0; j < 16; j++) {
65 r_array[j] = row[x + j].r;
66 g_array[j] = row[x + j].g;
67 b_array[j] = row[x + j].b;
 // Each load brings in 8 bytes: "lo" = pixels 0-7, "hi" = pixels 8-15.
71 __m128i r_vec_lo = _mm_loadl_epi64((__m128i *)(r_array + 0));
72 __m128i r_vec_hi = _mm_loadl_epi64((__m128i *)(r_array + 8));
73 __m128i g_vec_lo = _mm_loadl_epi64((__m128i *)(g_array + 0));
74 __m128i g_vec_hi = _mm_loadl_epi64((__m128i *)(g_array + 8));
75 __m128i b_vec_lo = _mm_loadl_epi64((__m128i *)(b_array + 0));
76 __m128i b_vec_hi = _mm_loadl_epi64((__m128i *)(b_array + 8));
 // Interleaving with zero widens the 8 bytes to eight 16-bit lanes.
79 __m128i r_16_lo = _mm_unpacklo_epi8(r_vec_lo, _mm_setzero_si128());
80 __m128i g_16_lo = _mm_unpacklo_epi8(g_vec_lo, _mm_setzero_si128());
81 __m128i b_16_lo = _mm_unpacklo_epi8(b_vec_lo, _mm_setzero_si128());
 // luma = (77*R + 150*G + 29*B + 128) >> 8. The weights sum to 256, so the
 // largest intermediate value is 255*256 + 128 = 65408, which still fits in
 // an unsigned 16-bit lane; the logical right shift then yields 0..255.
83 __m128i luma_r_lo = _mm_mullo_epi16(r_16_lo, _mm_set1_epi16(77));
84 __m128i luma_g_lo = _mm_mullo_epi16(g_16_lo, _mm_set1_epi16(150));
85 __m128i luma_b_lo = _mm_mullo_epi16(b_16_lo, _mm_set1_epi16(29));
87 __m128i luma_sum_lo = _mm_add_epi16(luma_r_lo, luma_g_lo);
88 luma_sum_lo = _mm_add_epi16(luma_sum_lo, luma_b_lo);
89 luma_sum_lo = _mm_add_epi16(luma_sum_lo, _mm_set1_epi16(128));
90 luma_sum_lo = _mm_srli_epi16(luma_sum_lo, 8);
 // Same computation for pixels 8-15.
93 __m128i r_16_hi = _mm_unpacklo_epi8(r_vec_hi, _mm_setzero_si128());
94 __m128i g_16_hi = _mm_unpacklo_epi8(g_vec_hi, _mm_setzero_si128());
95 __m128i b_16_hi = _mm_unpacklo_epi8(b_vec_hi, _mm_setzero_si128());
97 __m128i luma_r_hi = _mm_mullo_epi16(r_16_hi, _mm_set1_epi16(77));
98 __m128i luma_g_hi = _mm_mullo_epi16(g_16_hi, _mm_set1_epi16(150));
99 __m128i luma_b_hi = _mm_mullo_epi16(b_16_hi, _mm_set1_epi16(29));
101 __m128i luma_sum_hi = _mm_add_epi16(luma_r_hi, luma_g_hi);
102 luma_sum_hi = _mm_add_epi16(luma_sum_hi, luma_b_hi);
103 luma_sum_hi = _mm_add_epi16(luma_sum_hi, _mm_set1_epi16(128));
104 luma_sum_hi = _mm_srli_epi16(luma_sum_hi, 8);
 // Narrow the 16-bit results back to bytes; only the low 8 bytes of each
 // packed register carry data (the upper half comes from the zero operand).
107 __m128i luminance_lo = _mm_packus_epi16(luma_sum_lo, _mm_setzero_si128());
108 __m128i luminance_hi = _mm_packus_epi16(luma_sum_hi, _mm_setzero_si128());
 // Spill the 16 luminance bytes to memory for the scalar glyph lookup.
111 uint8_t luma_array[16];
112 _mm_storel_epi64((__m128i *)(luma_array + 0), luminance_lo);
113 _mm_storel_epi64((__m128i *)(luma_array + 8), luminance_hi);
 // Emit one glyph per pixel: the cache is indexed directly by the 0..255
 // luminance. Single-byte glyphs are stored directly; longer ones are copied
 // with memcpy and advance `pos` by their byte length.
116 for (
int j = 0; j < 16; j++) {
117 const utf8_char_t *char_info = &utf8_cache->cache[luma_array[j]];
119 if (char_info->byte_len == 1) {
120 *pos++ = char_info->utf8_bytes[0];
123 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
124 pos += char_info->byte_len;
 // Scalar path for a single pixel at row[x] (the enclosing loop header is not
 // visible here; presumably it handles the < 16 pixels left over per row).
 // NOTE(review): LUMA_RED/GREEN/BLUE/THRESHOLD are defined elsewhere; they
 // presumably equal the 77/150/29/128 literals used above - confirm, otherwise
 // the vector and scalar paths would pick different glyphs.
131 const rgb_pixel_t pixel = row[x];
132 const int luminance = (LUMA_RED * pixel.r + LUMA_GREEN * pixel.g + LUMA_BLUE * pixel.b + LUMA_THRESHOLD) >> 8;
133 const utf8_char_t *char_info = &utf8_cache->cache[luminance];
135 if (char_info->byte_len == 1) {
136 *pos++ = char_info->utf8_bytes[0];
139 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
140 pos += char_info->byte_len;
/**
 * Map a 24-bit RGB colour to an index in the 6x6x6 colour-cube region of a
 * 256-colour palette.
 *
 * Each channel is reduced to a level 0..5 by integer division by 51, and the
 * levels are combined as 16 + 36*r + 6*g + b, so the result lies in 16..231.
 * The division truncates: 255 is the only channel value that reaches level 5
 * (254 / 51 == 4).
 *
 * @param r  Red channel, 0..255.
 * @param g  Green channel, 0..255.
 * @param b  Blue channel, 0..255.
 * @return   Palette index in the range 16..231.
 */
160static inline uint8_t rgb_to_256color_sse2(uint8_t r, uint8_t g, uint8_t b) {
161 return (uint8_t)(16 + 36 * (r / 51) + 6 * (g / 51) + (b / 51));
/**
 * Render an image as coloured text, one glyph per pixel, writing glyphs and
 * colour changes into an `outbuf_t` (`ob`).
 *
 * Glyphs are chosen from `utf8_cache->cache64` using a 6-bit luminance index.
 * Consecutive pixels with the same glyph and colour are grouped into runs so
 * that a colour change is only emitted when the colour differs from the one
 * last emitted.
 *
 * @param image           Source image; `image->pixels` is read as
 *                        `rgb_pixel_t`, `image->w` pixels per row.
 * @param use_background  Selects the background branch when a colour change
 *                        is emitted (the emit calls themselves are not
 *                        visible in this excerpt).
 * @param use_256color    true: colours are quantised with
 *                        rgb_to_256color_sse2 and runs compare palette
 *                        indices; false: runs compare exact R, G, B values.
 * @param ascii_chars     Palette string. NOTE(review): unlike the monochrome
 *                        renderer, the visible guard does not check this for
 *                        NULL - confirm the cache lookup tolerates it.
 * @return                A `char *` buffer. NOTE(review): return statements
 *                        are outside this excerpt; ownership presumably
 *                        passes to the caller - confirm.
 */
165char *render_ascii_sse2_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
166 const char *ascii_chars) {
 // Argument guard: image and its pixel buffer are required.
167 if (!image || !image->pixels) {
171 const int width = image->w;
172 const int height = image->h;
 // Degenerate image: a 1-byte buffer is allocated (presumably to hold an
 // empty string - the lines that fill and return it are not visible).
174 if (width <= 0 || height <= 0) {
176 empty = SAFE_MALLOC(1,
char *);
 // Capacity estimate: 6 bytes per pixel in 256-colour mode, 8 otherwise.
183 size_t bytes_per_pixel = use_256color ? 6u : 8u;
 // cap = height*width*bytes_per_pixel + height*16 + 64, with every
 // multiplication and addition overflow-checked; each failure is logged.
186 size_t height_times_width;
187 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) != ASCIICHAT_OK) {
188 log_error(
"Buffer size overflow: height * width overflow");
192 size_t pixel_data_size;
193 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
194 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
198 size_t height_times_16;
199 if (checked_size_mul((
size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
200 log_error(
"Buffer size overflow: height * 16 overflow");
205 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
206 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
210 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
211 log_error(
"Buffer size overflow: total capacity overflow");
 // Never request a zero-byte allocation.
215 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
 // Error report for a missing UTF-8 palette cache (the lookup that produces
 // `utf8_cache` is outside this excerpt).
222 log_error(
"Failed to get UTF-8 palette cache for SSE2 color");
 // Colour last emitted to the output. -1 can never equal a uint8_t channel or
 // palette index, so the first pixel always triggers a colour change.
231 int curR = -1, curG = -1, curB = -1;
232 int cur_color_idx = -1;
234 for (
int y = 0; y < height; y++) {
 // NOTE(review): `y * width` is an `int` multiplication even though the
 // buffer size above was computed in overflow-checked size_t - confirm
 // dimensions are bounded so this cannot overflow.
235 const rgb_pixel_t *row = &((
const rgb_pixel_t *)image->pixels)[y * width];
 // Vector path: entered while at least 16 pixels remain in the row.
239 while (x + 16 <= width) {
 // Split 16 interleaved pixels into one byte array per channel.
241 uint8_t r_array[16], g_array[16], b_array[16];
242 for (
int j = 0; j < 16; j++) {
243 r_array[j] = row[x + j].r;
244 g_array[j] = row[x + j].g;
245 b_array[j] = row[x + j].b;
 // _mm_loadl_epi64 loads only 8 bytes, so only pixels 0-7 of the 16 gathered
 // above enter the vector math, and every loop below stops at 8.
 // NOTE(review): entries 8-15 are unused in the visible code, and the `x`
 // increment is not visible - confirm it advances by 8, not 16.
249 __m128i r_vec = _mm_loadl_epi64((__m128i *)r_array);
250 __m128i g_vec = _mm_loadl_epi64((__m128i *)g_array);
251 __m128i b_vec = _mm_loadl_epi64((__m128i *)b_array);
 // Interleaving with zero widens the 8 bytes to eight 16-bit lanes.
254 __m128i r_16 = _mm_unpacklo_epi8(r_vec, _mm_setzero_si128());
255 __m128i g_16 = _mm_unpacklo_epi8(g_vec, _mm_setzero_si128());
256 __m128i b_16 = _mm_unpacklo_epi8(b_vec, _mm_setzero_si128());
 // luma = (LUMA_RED*R + LUMA_GREEN*G + LUMA_BLUE*B + LUMA_THRESHOLD) >> 8,
 // the same expression the scalar path below uses.
259 __m128i luma_r = _mm_mullo_epi16(r_16, _mm_set1_epi16(LUMA_RED));
260 __m128i luma_g = _mm_mullo_epi16(g_16, _mm_set1_epi16(LUMA_GREEN));
261 __m128i luma_b = _mm_mullo_epi16(b_16, _mm_set1_epi16(LUMA_BLUE));
263 __m128i luma_sum = _mm_add_epi16(luma_r, luma_g);
264 luma_sum = _mm_add_epi16(luma_sum, luma_b);
265 luma_sum = _mm_add_epi16(luma_sum, _mm_set1_epi16(LUMA_THRESHOLD));
266 luma_sum = _mm_srli_epi16(luma_sum, 8);
 // Narrow to bytes and spill the 8 luminance values to memory.
269 __m128i luminance = _mm_packus_epi16(luma_sum, _mm_setzero_si128());
270 uint8_t luma_array[8];
271 _mm_storel_epi64((__m128i *)luma_array, luminance);
 // Drop the two low bits: 8-bit luminance -> 6-bit index into cache64.
274 uint8_t char_indices[8];
275 for (
int i = 0; i < 8; i++) {
276 const uint8_t luma_idx = luma_array[i] >> 2;
277 char_indices[i] = luma_idx;
 // Palette index for each of the 8 pixels.
282 uint8_t color_indices[8];
283 for (
int i = 0; i < 8; i++) {
284 color_indices[i] = rgb_to_256color_sse2(r_array[i], g_array[i], b_array[i]);
 // 256-colour emission: `i` is advanced run by run, not pixel by pixel.
288 for (
int i = 0; i < 8;) {
289 const uint8_t char_idx = char_indices[i];
290 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
291 const uint8_t color_idx = color_indices[i];
 // Extend the run while glyph index and palette index both match.
294 while (j < 8 && char_indices[j] == char_idx && color_indices[j] == color_idx) {
297 const uint32_t run = (uint32_t)(j - i);
 // Only emit a colour change when the colour differs from the last one sent.
299 if (color_idx != cur_color_idx) {
300 if (use_background) {
305 cur_color_idx = color_idx;
 // Write the glyph once, then once more for each remaining pixel of the run.
 // NOTE(review): the conditions around this loop are not visible (the file
 // also declares rep_is_profitable / emit_rep) - confirm when it is taken.
309 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
313 for (uint32_t k = 1; k < run; k++) {
314 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
 // Truecolour emission: same structure, but a run requires identical glyph
 // index and identical R, G and B.
321 for (
int i = 0; i < 8;) {
322 const uint8_t char_idx = char_indices[i];
323 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
324 const uint8_t r = r_array[i];
325 const uint8_t g = g_array[i];
326 const uint8_t b = b_array[i];
329 while (j < 8 && char_indices[j] == char_idx && r_array[j] == r && g_array[j] == g && b_array[j] == b) {
332 const uint32_t run = (uint32_t)(j - i);
334 if (r != curR || g != curG || b != curB) {
335 if (use_background) {
346 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
350 for (uint32_t k = 1; k < run; k++) {
351 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
 // Scalar path for the pixel at row[x] (the enclosing loop header is not
 // visible; presumably it covers the pixels left after the vector loop).
362 const rgb_pixel_t *p = &row[x];
363 uint32_t R = p->r, G = p->g, B = p->b;
364 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
365 uint8_t luma_idx = Y >> 2;
366 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
 // 256-colour scalar branch: quantise, then scan forward from `j` recomputing
 // glyph and palette index for each following pixel until either differs.
370 uint8_t color_idx = rgb_to_256color_sse2((uint8_t)R, (uint8_t)G, (uint8_t)B);
374 const rgb_pixel_t *q = &row[j];
375 uint32_t
R2 = q->r, G2 = q->g, B2 = q->b;
376 uint8_t Y2 = (uint8_t)((LUMA_RED *
R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
377 uint8_t luma_idx2 = Y2 >> 2;
378 uint8_t color_idx2 = rgb_to_256color_sse2((uint8_t)
R2, (uint8_t)G2, (uint8_t)B2);
379 if (luma_idx2 != luma_idx || color_idx2 != color_idx)
 // Run length is measured from the starting pixel x.
383 uint32_t run = (uint32_t)(j - x);
385 if (color_idx != cur_color_idx) {
386 if (use_background) {
391 cur_color_idx = color_idx;
395 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
399 for (uint32_t k = 1; k < run; k++) {
400 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
 // Truecolour scalar branch: a run requires the same glyph index and exactly
 // equal R, G and B.
408 const rgb_pixel_t *q = &row[j];
409 uint32_t
R2 = q->r, G2 = q->g, B2 = q->b;
410 uint8_t Y2 = (uint8_t)((LUMA_RED *
R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
411 uint8_t luma_idx2 = Y2 >> 2;
412 if (luma_idx2 != luma_idx ||
R2 != R || G2 != G || B2 != B)
416 uint32_t run = (uint32_t)(j - x);
 // R, G and B are uint32_t here; the casts keep the comparison against the
 // int trackers (which may hold -1) in signed arithmetic.
418 if ((
int)R != curR || (
int)G != curG || (
int)B != curB) {
419 if (use_background) {
430 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
434 for (uint32_t k = 1; k < run; k++) {
435 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
 // Row epilogue: something is done for every row except the last (presumably
 // a newline is written - the body is not visible).
447 if (y < height - 1) {
 // Forget the tracked truecolour state so the next row re-emits its colour.
 // NOTE(review): no matching reset of cur_color_idx is visible here - confirm
 // the 256-colour tracker is also reset at the end of each row.
450 curR = curG = curB = -1;
/**
 * Tear-down hook for the SSE2 renderer's caches.
 *
 * The only statement visible in this excerpt is the debug log below.
 * NOTE(review): any actual cache release is outside this excerpt - confirm
 * what, if anything, this function frees.
 */
459void sse2_caches_destroy(
void) {
461 log_debug(
"SSE2_CACHE: SSE2 caches cleaned up");
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
void ob_term(outbuf_t *ob)
void ob_putc(outbuf_t *ob, char c)
bool rep_is_profitable(uint32_t runlen)
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_rep(outbuf_t *ob, uint32_t extra)
void ob_write(outbuf_t *ob, const char *s, size_t n)
void emit_reset(outbuf_t *ob)
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
#define R2(v, w, x, y, z, i)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)