ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
utf8.c File Reference

🔤 UTF-8 encoding and decoding with multi-byte character support More...

Go to the source code of this file.

Functions

int utf8_decode (const uint8_t *s, uint32_t *codepoint)
 
int utf8_display_width (const char *str)
 
int utf8_display_width_n (const char *str, size_t max_bytes)
 
size_t utf8_char_count (const char *str)
 
bool utf8_is_valid (const char *str)
 
bool utf8_is_ascii_only (const char *str)
 
size_t utf8_to_codepoints (const char *str, uint32_t *out_codepoints, size_t max_codepoints)
 
int utf8_next_char_bytes (const char *str, size_t max_bytes)
 
int utf8_continuation_bytes_needed (unsigned char first_byte)
 
int utf8_read_and_insert_continuation_bytes (char *buffer, size_t *cursor, size_t *len, size_t max_len, int continuation_bytes, int(*read_byte_fn)(void))
 
const char * utf8_strcasestr (const char *haystack, const char *needle)
 Case-insensitive substring search with full Unicode support.
 

Detailed Description

🔤 UTF-8 encoding and decoding with multi-byte character support

Uses utf8proc Unicode library for accurate character-width computation and UTF-8 handling.

Definition in file utf8.c.

Function Documentation

◆ utf8_char_count()

size_t utf8_char_count ( const char *  str)

Definition at line 138 of file utf8.c.

138 {
139 if (!str) {
140 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
141 return -1; // SIZE_MAX
142 }
143
144 size_t count = 0;
145 const uint8_t *p = (const uint8_t *)str;
146 while (*p) {
147 uint32_t codepoint;
148 int decode_len = utf8_decode(p, &codepoint);
149 if (decode_len < 0) {
150 return -1; // SIZE_MAX - Invalid UTF-8
151 }
152 count++;
153 p += decode_len;
154 }
155 return count;
156}
int utf8_decode(const uint8_t *s, uint32_t *codepoint)
Definition utf8.c:18

References utf8_decode().

Referenced by levenshtein(), and utf8_is_valid().

◆ utf8_continuation_bytes_needed()

int utf8_continuation_bytes_needed ( unsigned char  first_byte)

Definition at line 224 of file utf8.c.

/**
 * Report how many continuation bytes must follow a UTF-8 lead byte.
 *
 * Bytes that can never begin a well-formed UTF-8 sequence are rejected:
 *   - 0x80-0xBF: continuation bytes
 *   - 0xC0-0xC1: could only start an overlong 2-byte encoding of ASCII
 *   - 0xF5-0xFF: would encode values above U+10FFFF (or are not lead bytes)
 *
 * @param first_byte First byte of a prospective UTF-8 sequence.
 * @return 0 for ASCII (0xxxxxxx), 1 for a 2-byte lead, 2 for a 3-byte lead,
 *         3 for a 4-byte lead, or -1 if first_byte is not a valid lead byte.
 */
int utf8_continuation_bytes_needed(unsigned char first_byte) {
  if (first_byte < 0x80) {
    return 0; // ASCII - single byte, no continuation needed
  }
  if (first_byte < 0xC2) {
    return -1; // stray continuation byte, or overlong-only lead 0xC0/0xC1
  }
  if (first_byte < 0xE0) {
    return 1; // 110xxxxx - 2 byte sequence
  }
  if (first_byte < 0xF0) {
    return 2; // 1110xxxx - 3 byte sequence
  }
  if (first_byte < 0xF5) {
    return 3; // 11110xxx, limited to 0xF0-0xF4 so the result stays <= U+10FFFF
  }
  return -1; // 0xF5-0xFF never appear in UTF-8
}

◆ utf8_decode()

int utf8_decode ( const uint8_t *  s,
uint32_t *  codepoint 
)

Definition at line 18 of file utf8.c.

/**
 * Decode a single UTF-8 sequence starting at s.
 *
 * Only well-formed sequences are accepted. In addition to checking the
 * continuation-byte pattern (10xxxxxx), the decoder rejects:
 *   - overlong encodings (a value encoded with more bytes than necessary),
 *   - UTF-16 surrogates U+D800..U+DFFF,
 *   - values above U+10FFFF.
 *
 * Bytes are examined strictly left to right and decoding stops at the first
 * byte that is not a continuation byte, so a NUL terminator inside a
 * truncated sequence ends the read.
 *
 * @param s         Pointer to the first byte of the sequence.
 * @param codepoint Receives the decoded value on success; left untouched on
 *                  failure.
 * @return Number of bytes consumed (1-4), or -1 if either pointer is NULL or
 *         the sequence is not well-formed.
 */
int utf8_decode(const uint8_t *s, uint32_t *codepoint) {
  if (s == NULL || codepoint == NULL) {
    return -1;
  }

  const uint8_t lead = s[0];
  if (lead < 0x80) {
    *codepoint = lead;
    return 1;
  }

  int length;
  uint32_t value;
  uint32_t min_value; // smallest value that legitimately needs `length` bytes
  if ((lead & 0xE0) == 0xC0) {
    length = 2;
    value = (uint32_t)(lead & 0x1F);
    min_value = 0x80;
  } else if ((lead & 0xF0) == 0xE0) {
    length = 3;
    value = (uint32_t)(lead & 0x0F);
    min_value = 0x800;
  } else if ((lead & 0xF8) == 0xF0) {
    length = 4;
    value = (uint32_t)(lead & 0x07);
    min_value = 0x10000;
  } else {
    return -1; // continuation byte or 0xF8-0xFF used as a lead byte
  }

  for (int i = 1; i < length; i++) {
    if ((s[i] & 0xC0) != 0x80) {
      return -1; // not a continuation byte (this also catches NUL)
    }
    value = (value << 6) | (uint32_t)(s[i] & 0x3F);
  }

  if (value < min_value) {
    return -1; // overlong encoding
  }
  if (value >= 0xD800 && value <= 0xDFFF) {
    return -1; // surrogate halves are not valid scalar values
  }
  if (value > 0x10FFFF) {
    return -1; // beyond the Unicode code space
  }

  *codepoint = value;
  return length;
}

Referenced by digital_rain_apply(), interactive_grep_get_match_info(), utf8_char_count(), utf8_to_codepoints(), and validate_shell_safe().

◆ utf8_display_width()

int utf8_display_width ( const char *  str)

Definition at line 46 of file utf8.c.

46 {
47 if (!str) {
48 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
49 return 0;
50 }
51
52 int width = 0;
53 const utf8proc_uint8_t *p = (const utf8proc_uint8_t *)str;
54
55 while (*p) {
56 // Skip ANSI escape sequences (ESC [ ... m)
57 // Pattern: \x1b [ [0-9;]* m
58 if (p[0] == 0x1b && p[1] == '[') {
59 // Found ANSI escape sequence start
60 p += 2; // Skip ESC[
61 // Skip digits and semicolons until we find 'm'
62 while (*p && *p != 'm') {
63 p++;
64 }
65 if (*p == 'm') {
66 p++; // Skip the 'm'
67 }
68 continue;
69 }
70
71 utf8proc_int32_t codepoint;
72 utf8proc_ssize_t len = utf8proc_iterate(p, -1, &codepoint);
73
74 if (len <= 0) {
75 // End of string or invalid UTF-8 sequence, stop processing
76 break;
77 }
78
79 // Get display width of this codepoint
80 int char_width = utf8proc_charwidth(codepoint);
81 if (char_width < 0) {
82 // Control character or unprintable - treat as 0 width
83 char_width = 0;
84 }
85 width += char_width;
86 p += len;
87 }
88
89 return width;
90}

Referenced by display_width(), layout_print_two_column_row(), options_config_calculate_max_col_width(), prompt_password(), and validate_palette_chars().

◆ utf8_display_width_n()

int utf8_display_width_n ( const char *  str,
size_t  max_bytes 
)

Definition at line 92 of file utf8.c.

92 {
93 if (!str || max_bytes == 0) {
94 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL or max_bytes is 0");
95 return 0;
96 }
97
98 int width = 0;
99 const utf8proc_uint8_t *p = (const utf8proc_uint8_t *)str;
100 const utf8proc_uint8_t *end = p + max_bytes;
101
102 while (p < end && *p) {
103 // Skip ANSI escape sequences (ESC [ ... m)
104 if (p + 1 < end && p[0] == 0x1b && p[1] == '[') {
105 // Found ANSI escape sequence start
106 p += 2; // Skip ESC[
107 // Skip digits and semicolons until we find 'm'
108 while (p < end && *p && *p != 'm') {
109 p++;
110 }
111 if (p < end && *p == 'm') {
112 p++; // Skip the 'm'
113 }
114 continue;
115 }
116
117 utf8proc_int32_t codepoint;
118 utf8proc_ssize_t len = utf8proc_iterate(p, end - p, &codepoint);
119
120 if (len <= 0) {
121 // End of string or invalid UTF-8 sequence
122 break;
123 }
124
125 // Get display width of this codepoint
126 int char_width = utf8proc_charwidth(codepoint);
127 if (char_width < 0) {
128 // Control character or unprintable - treat as 0 width
129 char_width = 0;
130 }
131 width += char_width;
132 p += len;
133 }
134
135 return width;
136}

Referenced by utf8_palette_create().

◆ utf8_is_ascii_only()

bool utf8_is_ascii_only ( const char *  str)

Definition at line 167 of file utf8.c.

167 {
168 if (!str) {
169 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
170 return false;
171 }
172
173 // Fast path: Check if all bytes are in ASCII range (0x00-0x7F)
174 // ASCII characters are 0x00-0x7F (single byte, high bit clear)
175 // Multi-byte UTF-8 has continuation bytes with high bit set (10xxxxxx pattern)
176 // So if all bytes have high bit clear, it's guaranteed ASCII-only
177 const unsigned char *p = (const unsigned char *)str;
178 while (*p) {
179 if ((*p & ~0x7F) != 0) {
180 return false; // Non-ASCII byte found (high bit set)
181 }
182 p++;
183 }
184 return true;
185}

Referenced by is_session_string(), and options_init().

◆ utf8_is_valid()

bool utf8_is_valid ( const char *  str)

Definition at line 158 of file utf8.c.

158 {
159 if (!str) {
160 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
161 return false;
162 }
163 // Reuse utf8_char_count to validate without duplicating loop
164 return utf8_char_count(str) != (size_t)-1;
165}
size_t utf8_char_count(const char *str)
Definition utf8.c:138

References utf8_char_count().

Referenced by options_init(), prompt_password(), prompt_password_simple(), validate_private_key(), validate_public_key(), and validate_shell_safe().

◆ utf8_next_char_bytes()

int utf8_next_char_bytes ( const char *  str,
size_t  max_bytes 
)

Definition at line 207 of file utf8.c.

207 {
208 if (!str || max_bytes == 0) {
209 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL or max_bytes is 0");
210 return -1;
211 }
212
213 // Use utf8proc_iterate to get byte length of next character
214 utf8proc_int32_t codepoint;
215 utf8proc_ssize_t len = utf8proc_iterate((const utf8proc_uint8_t *)str, (utf8proc_ssize_t)max_bytes, &codepoint);
216
217 if (len <= 0) {
218 return -1; // Invalid UTF-8 or end of string
219 }
220
221 return (int)len;
222}

Referenced by layout_print_wrapped_description().

◆ utf8_read_and_insert_continuation_bytes()

int utf8_read_and_insert_continuation_bytes ( char *  buffer,
size_t *  cursor,
size_t *  len,
size_t  max_len,
int  continuation_bytes,
int(*)(void)  read_byte_fn 
)

Definition at line 240 of file utf8.c.

241 {
242 if (!buffer || !cursor || !len || continuation_bytes <= 0 || !read_byte_fn) {
243 SET_ERRNO(ERROR_INVALID_PARAM, "invalid params");
244 return -1;
245 }
246
247 for (int i = 0; i < continuation_bytes && *len < max_len - 1; i++) {
248 int next_byte = read_byte_fn();
249 if (next_byte == EOF) {
250 return -1; // EOF reached
251 }
252
253 // Shift characters right to make room
254 memmove(&buffer[*cursor + 1], &buffer[*cursor], *len - *cursor);
255 buffer[*cursor] = (char)next_byte;
256 (*len)++;
257 (*cursor)++;
258 }
259
260 return 0;
261}

◆ utf8_strcasestr()

const char * utf8_strcasestr ( const char *  haystack,
const char *  needle 
)

Case-insensitive substring search with full Unicode support.

Uses utf8proc for Unicode case folding according to the Unicode standard. This properly handles all Unicode scripts including Greek, Cyrillic, accented characters, and more.

Definition at line 274 of file utf8.c.

274 {
275 if (!haystack || !needle) {
276 SET_ERRNO(ERROR_INVALID_PARAM, "invalid params");
277 return NULL;
278 }
279
280 // Empty needle matches at start of haystack
281 if (needle[0] == '\0') {
282 return haystack;
283 }
284
285 // Get lengths
286 size_t haystack_len = strlen(haystack);
287 size_t needle_len = strlen(needle);
288
289 if (needle_len > haystack_len) {
290 return NULL;
291 }
292
293 // Case-fold both strings using utf8proc
294 // UTF8PROC_CASEFOLD performs Unicode case folding
295 // UTF8PROC_STABLE ensures stable output
296 // UTF8PROC_COMPOSE normalizes composed characters
297 utf8proc_option_t options = UTF8PROC_CASEFOLD | UTF8PROC_STABLE | UTF8PROC_COMPOSE;
298
299 // Case-fold the needle (pattern to search for)
300 utf8proc_uint8_t *needle_folded = NULL;
301 utf8proc_ssize_t needle_folded_len =
302 utf8proc_map((const utf8proc_uint8_t *)needle, (utf8proc_ssize_t)needle_len, &needle_folded, options);
303
304 if (needle_folded_len < 0 || !needle_folded) {
305 // Invalid UTF-8 in needle
306 if (needle_folded) {
307 free(needle_folded);
308 }
309 return NULL;
310 }
311
312 // Try each position in haystack
313 const char *haystack_pos = haystack;
314 while (*haystack_pos != '\0') {
315 // Calculate remaining haystack length
316 size_t remaining = haystack_len - (size_t)(haystack_pos - haystack);
317
318 if (remaining < needle_len) {
319 // Not enough characters left to match
320 break;
321 }
322
323 // Case-fold the current haystack window
324 utf8proc_uint8_t *haystack_folded = NULL;
325 utf8proc_ssize_t haystack_folded_len =
326 utf8proc_map((const utf8proc_uint8_t *)haystack_pos, (utf8proc_ssize_t)needle_len, &haystack_folded, options);
327
328 if (haystack_folded_len >= 0 && haystack_folded) {
329 // Compare case-folded strings
330 if ((size_t)haystack_folded_len == (size_t)needle_folded_len &&
331 memcmp(haystack_folded, needle_folded, (size_t)needle_folded_len) == 0) {
332 // Match found!
333 free(haystack_folded);
334 free(needle_folded);
335 return haystack_pos;
336 }
337 free(haystack_folded);
338 }
339
340 // Move to next UTF-8 character in haystack
341 utf8proc_int32_t codepoint;
342 utf8proc_ssize_t bytes = utf8proc_iterate((const utf8proc_uint8_t *)haystack_pos, -1, &codepoint);
343 if (bytes <= 0) {
344 // Invalid UTF-8, move by one byte
345 haystack_pos++;
346 } else {
347 haystack_pos += bytes;
348 }
349 }
350
351 free(needle_folded);
352 return NULL;
353}

Referenced by grep_highlight_colored(), grep_should_output(), interactive_grep_gather_and_filter_logs(), and interactive_grep_get_match_info().

◆ utf8_to_codepoints()

size_t utf8_to_codepoints ( const char *  str,
uint32_t *  out_codepoints,
size_t  max_codepoints 
)

Definition at line 187 of file utf8.c.

187 {
188 if (!str || !out_codepoints || max_codepoints == 0) {
189 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL or out_codepoints is NULL or max_codepoints is 0");
190 return 0;
191 }
192
193 size_t count = 0;
194 const uint8_t *p = (const uint8_t *)str;
195 while (*p && count < max_codepoints) {
196 uint32_t codepoint;
197 int decode_len = utf8_decode(p, &codepoint);
198 if (decode_len < 0) {
199 return -1; // SIZE_MAX - Invalid UTF-8
200 }
201 out_codepoints[count++] = codepoint;
202 p += decode_len;
203 }
204 return count;
205}

References utf8_decode().

Referenced by levenshtein(), and validate_palette_chars().