12#include <ascii-chat/video/simd/sve.h>
13#include <ascii-chat/common.h>
14#include <ascii-chat/video/simd/ascii_simd.h>
15#include <ascii-chat/video/output_buffer.h>
19#include <ascii-chat/util/overflow.h>
/**
 * Render an image as monochrome text, one UTF-8 glyph per pixel, using Arm SVE
 * intrinsics to compute per-pixel luminance.
 *
 * @param image       Source image; its pixels are read as rgb_pixel_t values,
 *                    image->w per row and image->h rows.
 * @param ascii_chars Palette string (presumably used to obtain the UTF-8
 *                    palette cache that maps luminance to glyphs -- confirm).
 * @return Buffer obtained from SAFE_MALLOC holding the rendered text
 *         (ownership presumably passes to the caller -- confirm).
 */
26char *render_ascii_image_monochrome_sve(
const image_t *image,
const char *ascii_chars) {
// Guard: a NULL image or a NULL pixel buffer cannot be rendered.
27 if (!image || !image->pixels) {
31 const int h = image->h;
32 const int w = image->w;
// Guard: non-positive dimensions are handled separately from the main path.
34 if (h <= 0 || w <= 0) {
// Reported when the UTF-8 palette cache is unavailable.
41 log_error(
"Failed to get UTF-8 palette cache");
// Worst-case sizing: up to 4 bytes per glyph, plus one extra byte per row
// (presumably the row separator / terminator -- confirm).
// NOTE(review): this product is not overflow-checked, unlike the
// checked_size_mul() sizing in render_ascii_sve_unified_optimized.
46 const size_t max_char_bytes = 4;
47 const size_t len = (size_t)h * ((
size_t)w * max_char_bytes + 1);
49 char *output = SAFE_MALLOC(len,
char *);
52 const rgb_pixel_t *pixels = (
const rgb_pixel_t *)image->pixels;
// Process the image one row at a time.
55 for (
int y = 0; y < h; y++) {
// NOTE(review): y * w is evaluated in int before indexing; confirm that
// image dimensions are bounded so this cannot overflow.
56 const rgb_pixel_t *row = &pixels[y * w];
// All-true predicate for 8-bit lanes.
60 svbool_t pg = svptrue_b8();
// x is the current column within the row; remaining counts pixels left in it.
65 int remaining = w - x;
// Predicate whose lane i is active while x + i < w.
67 svbool_t pg_active = svwhilelt_b8_s32(x, w);
// Pixels handled per iteration: vector length in bytes divided by 3.
68 int vec_len = svcntb_pat(SV_ALL) / 3;
69 int process_count = (remaining < vec_len) ? remaining : vec_len;
// De-interleave the packed RGB pixels into three planar byte arrays.
// NOTE(review): the buffers hold 64 entries; confirm process_count
// (svcntb()/3) cannot exceed 64 on the supported vector lengths.
72 uint8_t r_array[64], g_array[64], b_array[64];
73 for (
int j = 0; j < process_count; j++) {
75 r_array[j] = row[x + j].r;
76 g_array[j] = row[x + j].g;
77 b_array[j] = row[x + j].b;
// Load each colour plane into a vector under pg_active.
// NOTE(review): pg_active can enable more lanes than process_count (up to a
// full vector when many pixels remain), so lanes past process_count read
// array bytes that were not written above; confirm this is harmless and
// stays within the 64-byte arrays.
82 svuint8_t r_vec = svld1_u8(pg_active, r_array);
83 svuint8_t g_vec = svld1_u8(pg_active, g_array);
84 svuint8_t b_vec = svld1_u8(pg_active, b_array);
// Widen the low half of the 8-bit lanes to 16 bits so the weighted sum fits.
87 svuint16_t r_16 = svunpklo_u16(r_vec);
88 svuint16_t g_16 = svunpklo_u16(g_vec);
89 svuint16_t b_16 = svunpklo_u16(b_vec);
// luma = (LUMA_RED*r + LUMA_GREEN*g + LUMA_BLUE*b + LUMA_THRESHOLD) >> 8
92 svuint16_t luma = svmul_n_u16_x(svptrue_b16(), r_16, LUMA_RED);
93 luma = svmla_n_u16_x(svptrue_b16(), luma, g_16, LUMA_GREEN);
94 luma = svmla_n_u16_x(svptrue_b16(), luma, b_16, LUMA_BLUE);
95 luma = svadd_n_u16_x(svptrue_b16(), luma, LUMA_THRESHOLD);
96 luma = svlsr_n_u16_x(svptrue_b16(), luma, 8);
// Spill every 16-bit lane to memory.
// NOTE(review): an all-true store writes one element per 16-bit lane of the
// vector; confirm that count never exceeds the 64 entries of luma_temp.
100 uint16_t luma_temp[64];
101 svst1_u16(svptrue_b16(), luma_temp, luma);
// Narrow the first process_count luminance values back to bytes.
104 uint8_t luma_array[64];
105 for (
int j = 0; j < process_count; j++) {
106 luma_array[j] = (uint8_t)luma_temp[j];
// Emit one glyph per pixel, looked up by the full 8-bit luminance value.
109 for (
int j = 0; j < process_count; j++) {
111 const utf8_char_t *char_info = &utf8_cache->cache[luma_array[j]];
// Single-byte glyphs are stored directly instead of going through memcpy.
113 if (char_info->byte_len == 1) {
114 *pos++ = char_info->utf8_bytes[0];
// Multi-byte glyphs: copy byte_len bytes and advance the write cursor.
117 memcpy(pos, char_info->utf8_bytes, char_info->byte_len);
118 pos += char_info->byte_len;
/**
 * Map an 8-bit RGB triple to a single palette index.
 *
 * Each channel is reduced to one of six levels (value / 51, giving 0..5) and
 * the levels are combined as 16 + 36*r + 6*g + b, so the result lies in
 * 16..231 (presumably the 6x6x6 colour cube of the 256-colour terminal
 * palette -- confirm). Division truncates, so channels are rounded down.
 */
138static inline uint8_t rgb_to_256color_sve(uint8_t r, uint8_t g, uint8_t b) {
139 return (uint8_t)(16 + 36 * (r / 51) + 6 * (g / 51) + (b / 51));
/**
 * Render an image as coloured text using Arm SVE intrinsics for luminance and
 * glyph selection, writing glyphs and colour changes into an output buffer.
 *
 * @param image          Source image; pixels are read as rgb_pixel_t values.
 * @param use_background Selects the background variant when a colour change
 *                       is emitted (otherwise presumably foreground -- confirm).
 * @param use_256color   true: colours are reduced to a palette index via
 *                       rgb_to_256color_sve(); false: exact r/g/b is tracked.
 * @param ascii_chars    Palette string, forwarded as-is to the monochrome
 *                       renderer when that path is taken.
 * @return Rendered text buffer (ownership presumably passes to the caller --
 *         confirm).
 */
143char *render_ascii_sve_unified_optimized(
const image_t *image,
bool use_background,
bool use_256color,
144 const char *ascii_chars) {
// Guard: a NULL image or a NULL pixel buffer cannot be rendered.
145 if (!image || !image->pixels) {
149 const int width = image->w;
150 const int height = image->h;
// Degenerate dimensions: a 1-byte buffer is allocated (presumably returned as
// an empty string -- confirm).
152 if (width <= 0 || height <= 0) {
154 empty = SAFE_MALLOC(1,
char *);
// With neither option set, the work is delegated to the monochrome renderer.
// NOTE(review): this means a truecolor-foreground request (both flags false)
// produces uncoloured output; confirm that is intended.
160 if (!use_background && !use_256color) {
161 return render_ascii_image_monochrome_sve(image, ascii_chars);
// Capacity estimate: 6 bytes per pixel in 256-colour mode, 8 otherwise.
// NOTE(review): presumably the output buffer grows on demand when colour
// sequences exceed this estimate -- verify in ob_write().
166 size_t bytes_per_pixel = use_256color ? 6u : 8u;
// capacity = height * width * bytes_per_pixel + height * 16 + 64, with every
// multiplication and addition checked for size_t overflow.
169 size_t height_times_width;
170 if (checked_size_mul((
size_t)height, (
size_t)width, &height_times_width) != ASCIICHAT_OK) {
171 log_error(
"Buffer size overflow: height * width overflow");
175 size_t pixel_data_size;
176 if (checked_size_mul(height_times_width, bytes_per_pixel, &pixel_data_size) != ASCIICHAT_OK) {
177 log_error(
"Buffer size overflow: (height * width) * bytes_per_pixel overflow");
181 size_t height_times_16;
182 if (checked_size_mul((
size_t)height, 16u, &height_times_16) != ASCIICHAT_OK) {
183 log_error(
"Buffer size overflow: height * 16 overflow");
188 if (checked_size_add(pixel_data_size, height_times_16, &temp) != ASCIICHAT_OK) {
189 log_error(
"Buffer size overflow: pixel_data + height*16 overflow");
193 if (checked_size_add(temp, 64u, &ob.cap) != ASCIICHAT_OK) {
194 log_error(
"Buffer size overflow: total capacity overflow");
// Allocate the buffer; a zero capacity is bumped to 1 byte.
198 ob.buf = SAFE_MALLOC(ob.cap ? ob.cap : 1, char *);
// Reported when the UTF-8 palette cache is unavailable.
205 log_error(
"Failed to get UTF-8 palette cache for SVE color");
// Last colour written to the output; -1 cannot match any real colour, so the
// first pixel always triggers a colour change.
210 int curR = -1, curG = -1, curB = -1;
211 int cur_color_idx = -1;
// Process the image one row at a time.
213 for (
int y = 0; y < height; y++) {
// NOTE(review): y * width is evaluated in int; confirm it cannot overflow.
214 const rgb_pixel_t *row = &((
const rgb_pixel_t *)image->pixels)[y * width];
// Predicate whose lane i is active while x + i < width.
219 svbool_t pg_active = svwhilelt_b8_s32(x, width);
// Pixels handled per iteration: vector length in bytes divided by 3.
220 int vec_len = svcntb_pat(SV_ALL) / 3;
221 int remaining = width - x;
222 int process_count = (remaining < vec_len) ? remaining : vec_len;
// De-interleave the packed RGB pixels into three planar byte arrays.
// NOTE(review): the buffers hold 64 entries; confirm process_count
// (svcntb()/3) cannot exceed 64 on the supported vector lengths.
225 uint8_t r_array[64], g_array[64], b_array[64];
226 for (
int j = 0; j < process_count; j++) {
228 r_array[j] = row[x + j].r;
229 g_array[j] = row[x + j].g;
230 b_array[j] = row[x + j].b;
// Load each colour plane into a vector under pg_active.
// NOTE(review): pg_active can enable more lanes than process_count, so lanes
// past process_count read array bytes that were not written above.
235 svuint8_t r_vec = svld1_u8(pg_active, r_array);
236 svuint8_t g_vec = svld1_u8(pg_active, g_array);
237 svuint8_t b_vec = svld1_u8(pg_active, b_array);
// Widen the low half of the 8-bit lanes to 16 bits so the weighted sum fits.
240 svuint16_t r_16 = svunpklo_u16(r_vec);
241 svuint16_t g_16 = svunpklo_u16(g_vec);
242 svuint16_t b_16 = svunpklo_u16(b_vec);
// luma = (LUMA_RED*r + LUMA_GREEN*g + LUMA_BLUE*b + LUMA_THRESHOLD) >> 8
245 svuint16_t luma = svmul_n_u16_x(svptrue_b16(), r_16, LUMA_RED);
246 luma = svmla_n_u16_x(svptrue_b16(), luma, g_16, LUMA_GREEN);
247 luma = svmla_n_u16_x(svptrue_b16(), luma, b_16, LUMA_BLUE);
248 luma = svadd_n_u16_x(svptrue_b16(), luma, LUMA_THRESHOLD);
249 luma = svlsr_n_u16_x(svptrue_b16(), luma, 8);
// Spill every 16-bit lane to memory.
// NOTE(review): confirm the number of 16-bit lanes never exceeds the 64
// entries of luma_temp.
253 uint16_t luma_temp[64];
254 svst1_u16(svptrue_b16(), luma_temp, luma);
// Narrow the first process_count luminance values back to bytes.
257 uint8_t luma_array[64];
258 for (
int j = 0; j < process_count; j++) {
259 luma_array[j] = (uint8_t)luma_temp[j];
// Reload the luminance bytes and drop the two low bits: 0..255 -> 0..63.
264 svuint8_t luma_vec = svld1_u8(pg_active, luma_array);
265 svuint8_t luma_idx_vec = svlsr_n_u8_x(svptrue_b8(), luma_vec, 2);
// Translate each 6-bit luminance index to a glyph index with a vector table
// lookup over char_index_ramp.
// NOTE(review): the table is only one vector wide. svtbl yields 0 for indices
// at or beyond the lane count, so on vectors shorter than 64 bytes the upper
// indices would not be looked up; on longer vectors the all-true load reads
// past 64 bytes of char_index_ramp. Confirm the vector-length assumption.
268 svuint8_t char_lut_vec = svld1_u8(svptrue_b8(), utf8_cache->char_index_ramp);
269 svuint8_t char_indices_vec = svtbl_u8(char_lut_vec, luma_idx_vec);
// Store the glyph indices for the active lanes into gbuf.
272 svst1_u8(pg_active, gbuf, char_indices_vec);
// 256-colour path: reduce each pixel's RGB to a palette index (scalar).
276 uint8_t color_indices[64];
277 for (
int i = 0; i < process_count; i++) {
278 color_indices[i] = rgb_to_256color_sve(r_array[i], g_array[i], b_array[i]);
// Walk the chunk in runs of identical (glyph, palette index) pairs.
282 for (
int i = 0; i < process_count;) {
283 const uint8_t char_idx = gbuf[i];
284 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
285 const uint8_t color_idx = color_indices[i];
// Extend j while the following pixels share both glyph and palette index.
288 while (j < process_count && gbuf[j] == char_idx && color_indices[j] == color_idx) {
291 const uint32_t run = (uint32_t)(j - i);
// Only emit a colour change when the palette index differs from the last one.
293 if (color_idx != cur_color_idx) {
294 if (use_background) {
299 cur_color_idx = color_idx;
// Write the glyph once for the first pixel of the run.
303 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Write the glyph again for each of the remaining run - 1 pixels.
307 for (uint32_t k = 1; k < run; k++) {
309 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Truecolor path: runs are pixels with identical glyph and exact r/g/b.
316 for (
int i = 0; i < process_count;) {
317 const uint8_t char_idx = gbuf[i];
318 const utf8_char_t *char_info = &utf8_cache->cache64[char_idx];
319 const uint8_t r = r_array[i];
320 const uint8_t g = g_array[i];
321 const uint8_t b = b_array[i];
// Extend j while the following pixels share glyph and all three channels.
324 while (j < process_count && gbuf[j] == char_idx && r_array[j] == r && g_array[j] == g && b_array[j] == b) {
327 const uint32_t run = (uint32_t)(j - i);
// Only emit a colour change when any channel differs from the last colour.
329 if (r != curR || g != curG || b != curB) {
330 if (use_background) {
// Write the glyph once for the first pixel of the run.
341 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Write the glyph again for each of the remaining run - 1 pixels.
345 for (uint32_t k = 1; k < run; k++) {
347 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Scalar path for the pixel at column x: same luminance formula as the
// vector code, then the 6-bit glyph lookup through cache64.
358 const rgb_pixel_t *p = &row[x];
359 uint32_t R = p->r, G = p->g, B = p->b;
360 uint8_t Y = (uint8_t)((LUMA_RED * R + LUMA_GREEN * G + LUMA_BLUE * B + LUMA_THRESHOLD) >> 8);
361 uint8_t luma_idx = Y >> 2;
362 const utf8_char_t *char_info = &utf8_cache->cache64[luma_idx];
// 256-colour scalar variant: reduce the pixel to a palette index.
366 uint8_t color_idx = rgb_to_256color_sve((uint8_t)R, (uint8_t)G, (uint8_t)B);
// Look ahead at pixel j and recompute its luminance and palette index.
370 const rgb_pixel_t *q = &row[j];
371 uint32_t
R2 = q->r, G2 = q->g, B2 = q->b;
372 uint8_t Y2 = (uint8_t)((LUMA_RED *
R2 + LUMA_GREEN * G2 + LUMA_BLUE * B2 + LUMA_THRESHOLD) >> 8);
373 uint8_t color_idx2 = rgb_to_256color_sve((uint8_t)
R2, (uint8_t)G2, (uint8_t)B2);
// The run ends when the luminance bucket or the palette index changes.
374 if (((Y2 >> 2) != (Y >> 2)) || color_idx2 != color_idx)
378 uint32_t run = (uint32_t)(j - x);
// Only emit a colour change when the palette index differs from the last one.
380 if (color_idx != cur_color_idx) {
381 if (use_background) {
386 cur_color_idx = color_idx;
// Write the glyph once, then once more per remaining pixel of the run.
390 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
394 for (uint32_t k = 1; k < run; k++) {
396 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// Truecolor scalar variant: look ahead at pixel j.
// NOTE(review): the weights here are the literals 77/150/29/128 rather than
// the LUMA_RED/LUMA_GREEN/LUMA_BLUE/LUMA_THRESHOLD macros used in the other
// branches; confirm the values are identical.
404 const rgb_pixel_t *q = &row[j];
405 uint32_t
R2 = q->r, G2 = q->g, B2 = q->b;
406 uint8_t Y2 = (uint8_t)((77u *
R2 + 150u * G2 + 29u * B2 + 128u) >> 8);
// The run ends when the luminance bucket or any colour channel changes.
407 if (((Y2 >> 2) != (Y >> 2)) ||
R2 != R || G2 != G || B2 != B)
411 uint32_t run = (uint32_t)(j - x);
// Only emit a colour change when any channel differs from the last colour.
413 if ((
int)R != curR || (int)G != curG || (
int)B != curB) {
414 if (use_background) {
// Write the glyph once, then once more per remaining pixel of the run.
425 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
429 for (uint32_t k = 1; k < run; k++) {
431 ob_write(&ob, char_info->utf8_bytes, char_info->byte_len);
// End-of-row handling applies to every row except the last one.
440 if (y < height - 1) {
// Forget the tracked truecolor value so the next colour comparison fails and
// the colour is emitted again.
443 curR = curG = curB = -1;
/**
 * Teardown hook for the SVE renderer's caches.
 *
 * The visible body only emits a debug log line reporting that cleanup
 * happened; takes no arguments and returns nothing.
 */
452void sve_caches_destroy(
void) {
454 log_debug(
"SVE_CACHE: SVE caches cleaned up");
void emit_set_256_color_bg(outbuf_t *ob, uint8_t color_idx)
void emit_set_256_color_fg(outbuf_t *ob, uint8_t color_idx)
void ob_term(outbuf_t *ob)
void ob_putc(outbuf_t *ob, char c)
bool rep_is_profitable(uint32_t runlen)
void emit_set_truecolor_fg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
void emit_rep(outbuf_t *ob, uint32_t extra)
void ob_write(outbuf_t *ob, const char *s, size_t n)
void emit_reset(outbuf_t *ob)
void emit_set_truecolor_bg(outbuf_t *ob, uint8_t r, uint8_t g, uint8_t b)
#define R2(v, w, x, y, z, i)
utf8_palette_cache_t * get_utf8_palette_cache(const char *ascii_chars)