ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
utf8.c
Go to the documentation of this file.
1
10#include "ascii-chat/asciichat_errno.h"
11#include "ascii-chat/common/error_codes.h"
12#include <ascii-chat/util/utf8.h>
13#include <ascii-chat-deps/utf8proc/utf8proc.h>
14#include <stdbool.h>
15#include <string.h>
16#include <stdio.h>
17
/**
 * Decode one UTF-8 encoded code point from the start of a byte sequence.
 *
 * Continuation bytes are examined left to right and checking stops at the
 * first byte that is not of the form 10xxxxxx, so the terminating NUL of a
 * truncated sequence is never read past.
 *
 * Rejected as ill-formed (RFC 3629): stray continuation bytes, lead bytes
 * 0xF8-0xFF, overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and
 * values above U+10FFFF.
 *
 * @param s          Bytes to decode; must be readable up to the end of the
 *                   sequence or up to a terminating NUL.
 * @param codepoint  Receives the decoded value; left untouched on failure.
 * @return Number of bytes consumed (1-4), or -1 if an argument is NULL or the
 *         input is not a well-formed UTF-8 sequence.
 */
int utf8_decode(const uint8_t *s, uint32_t *codepoint) {
  if (!s || !codepoint) {
    return -1;
  }

  const uint8_t lead = s[0];
  if (lead < 0x80) {
    *codepoint = lead;
    return 1;
  }

  int extra;          // number of continuation bytes that must follow
  uint32_t value;     // payload bits accumulated so far
  uint32_t min_value; // smallest value that legitimately needs this length
  if ((lead & 0xE0) == 0xC0) {
    extra = 1;
    value = (uint32_t)(lead & 0x1F);
    min_value = 0x80;
  } else if ((lead & 0xF0) == 0xE0) {
    extra = 2;
    value = (uint32_t)(lead & 0x0F);
    min_value = 0x800;
  } else if ((lead & 0xF8) == 0xF0) {
    extra = 3;
    value = (uint32_t)(lead & 0x07);
    min_value = 0x10000;
  } else {
    return -1; // continuation byte in lead position, or 0xF8-0xFF
  }

  for (int i = 1; i <= extra; i++) {
    // Bail out before touching s[i + 1]: s[i] may be the NUL terminator.
    if ((s[i] & 0xC0) != 0x80) {
      return -1;
    }
    value = (value << 6) | (uint32_t)(s[i] & 0x3F);
  }

  if (value < min_value) {
    return -1; // overlong encoding of a smaller code point
  }
  if (value >= 0xD800 && value <= 0xDFFF) {
    return -1; // UTF-16 surrogate halves are not Unicode scalar values
  }
  if (value > 0x10FFFF) {
    return -1; // beyond the Unicode code space
  }

  *codepoint = value;
  return extra + 1;
}
45
46int utf8_display_width(const char *str) {
47 if (!str) {
48 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
49 return 0;
50 }
51
52 int width = 0;
53 const utf8proc_uint8_t *p = (const utf8proc_uint8_t *)str;
54
55 while (*p) {
56 // Skip ANSI escape sequences (ESC [ ... m)
57 // Pattern: \x1b [ [0-9;]* m
58 if (p[0] == 0x1b && p[1] == '[') {
59 // Found ANSI escape sequence start
60 p += 2; // Skip ESC[
61 // Skip digits and semicolons until we find 'm'
62 while (*p && *p != 'm') {
63 p++;
64 }
65 if (*p == 'm') {
66 p++; // Skip the 'm'
67 }
68 continue;
69 }
70
71 utf8proc_int32_t codepoint;
72 utf8proc_ssize_t len = utf8proc_iterate(p, -1, &codepoint);
73
74 if (len <= 0) {
75 // End of string or invalid UTF-8 sequence, stop processing
76 break;
77 }
78
79 // Get display width of this codepoint
80 int char_width = utf8proc_charwidth(codepoint);
81 if (char_width < 0) {
82 // Control character or unprintable - treat as 0 width
83 char_width = 0;
84 }
85 width += char_width;
86 p += len;
87 }
88
89 return width;
90}
91
92int utf8_display_width_n(const char *str, size_t max_bytes) {
93 if (!str || max_bytes == 0) {
94 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL or max_bytes is 0");
95 return 0;
96 }
97
98 int width = 0;
99 const utf8proc_uint8_t *p = (const utf8proc_uint8_t *)str;
100 const utf8proc_uint8_t *end = p + max_bytes;
101
102 while (p < end && *p) {
103 // Skip ANSI escape sequences (ESC [ ... m)
104 if (p + 1 < end && p[0] == 0x1b && p[1] == '[') {
105 // Found ANSI escape sequence start
106 p += 2; // Skip ESC[
107 // Skip digits and semicolons until we find 'm'
108 while (p < end && *p && *p != 'm') {
109 p++;
110 }
111 if (p < end && *p == 'm') {
112 p++; // Skip the 'm'
113 }
114 continue;
115 }
116
117 utf8proc_int32_t codepoint;
118 utf8proc_ssize_t len = utf8proc_iterate(p, end - p, &codepoint);
119
120 if (len <= 0) {
121 // End of string or invalid UTF-8 sequence
122 break;
123 }
124
125 // Get display width of this codepoint
126 int char_width = utf8proc_charwidth(codepoint);
127 if (char_width < 0) {
128 // Control character or unprintable - treat as 0 width
129 char_width = 0;
130 }
131 width += char_width;
132 p += len;
133 }
134
135 return width;
136}
137
138size_t utf8_char_count(const char *str) {
139 if (!str) {
140 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
141 return -1; // SIZE_MAX
142 }
143
144 size_t count = 0;
145 const uint8_t *p = (const uint8_t *)str;
146 while (*p) {
147 uint32_t codepoint;
148 int decode_len = utf8_decode(p, &codepoint);
149 if (decode_len < 0) {
150 return -1; // SIZE_MAX - Invalid UTF-8
151 }
152 count++;
153 p += decode_len;
154 }
155 return count;
156}
157
158bool utf8_is_valid(const char *str) {
159 if (!str) {
160 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
161 return false;
162 }
163 // Reuse utf8_char_count to validate without duplicating loop
164 return utf8_char_count(str) != (size_t)-1;
165}
166
167bool utf8_is_ascii_only(const char *str) {
168 if (!str) {
169 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL");
170 return false;
171 }
172
173 // Fast path: Check if all bytes are in ASCII range (0x00-0x7F)
174 // ASCII characters are 0x00-0x7F (single byte, high bit clear)
175 // Multi-byte UTF-8 has continuation bytes with high bit set (10xxxxxx pattern)
176 // So if all bytes have high bit clear, it's guaranteed ASCII-only
177 const unsigned char *p = (const unsigned char *)str;
178 while (*p) {
179 if ((*p & ~0x7F) != 0) {
180 return false; // Non-ASCII byte found (high bit set)
181 }
182 p++;
183 }
184 return true;
185}
186
187size_t utf8_to_codepoints(const char *str, uint32_t *out_codepoints, size_t max_codepoints) {
188 if (!str || !out_codepoints || max_codepoints == 0) {
189 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL or out_codepoints is NULL or max_codepoints is 0");
190 return 0;
191 }
192
193 size_t count = 0;
194 const uint8_t *p = (const uint8_t *)str;
195 while (*p && count < max_codepoints) {
196 uint32_t codepoint;
197 int decode_len = utf8_decode(p, &codepoint);
198 if (decode_len < 0) {
199 return -1; // SIZE_MAX - Invalid UTF-8
200 }
201 out_codepoints[count++] = codepoint;
202 p += decode_len;
203 }
204 return count;
205}
206
207int utf8_next_char_bytes(const char *str, size_t max_bytes) {
208 if (!str || max_bytes == 0) {
209 SET_ERRNO(ERROR_INVALID_PARAM, "str is NULL or max_bytes is 0");
210 return -1;
211 }
212
213 // Use utf8proc_iterate to get byte length of next character
214 utf8proc_int32_t codepoint;
215 utf8proc_ssize_t len = utf8proc_iterate((const utf8proc_uint8_t *)str, (utf8proc_ssize_t)max_bytes, &codepoint);
216
217 if (len <= 0) {
218 return -1; // Invalid UTF-8 or end of string
219 }
220
221 return (int)len;
222}
223
/**
 * Get the number of continuation bytes that must follow a UTF-8 lead byte.
 *
 * Only bytes that can start a well-formed sequence are accepted. 0xC0 and
 * 0xC1 (which could only begin overlong two-byte forms) and 0xF5-0xF7 (which
 * could only encode values above U+10FFFF) are reported as invalid, together
 * with continuation bytes 0x80-0xBF and 0xF8-0xFF.
 *
 * @param first_byte  First byte of a candidate UTF-8 sequence.
 * @return 0 for ASCII, 1-3 for a multi-byte lead byte, -1 if first_byte
 *         cannot start a UTF-8 sequence.
 */
int utf8_continuation_bytes_needed(unsigned char first_byte) {
  if (first_byte < 0x80) {
    return 0; // ASCII (0xxxxxxx) - single byte, no continuation needed
  }
  if (first_byte >= 0xC2 && first_byte <= 0xDF) {
    return 1; // 110xxxxx - 2 byte sequence (0xC0/0xC1 excluded: overlong)
  }
  if (first_byte >= 0xE0 && first_byte <= 0xEF) {
    return 2; // 1110xxxx - 3 byte sequence
  }
  if (first_byte >= 0xF0 && first_byte <= 0xF4) {
    return 3; // 11110xxx - 4 byte sequence (0xF5-0xF7 excluded: > U+10FFFF)
  }
  return -1; // Invalid UTF-8 start byte
}
239
240int utf8_read_and_insert_continuation_bytes(char *buffer, size_t *cursor, size_t *len, size_t max_len,
241 int continuation_bytes, int (*read_byte_fn)(void)) {
242 if (!buffer || !cursor || !len || continuation_bytes <= 0 || !read_byte_fn) {
243 SET_ERRNO(ERROR_INVALID_PARAM, "invalid params");
244 return -1;
245 }
246
247 for (int i = 0; i < continuation_bytes && *len < max_len - 1; i++) {
248 int next_byte = read_byte_fn();
249 if (next_byte == EOF) {
250 return -1; // EOF reached
251 }
252
253 // Shift characters right to make room
254 memmove(&buffer[*cursor + 1], &buffer[*cursor], *len - *cursor);
255 buffer[*cursor] = (char)next_byte;
256 (*len)++;
257 (*cursor)++;
258 }
259
260 return 0;
261}
262
263/* ============================================================================
264 * UTF-8 String Search Functions
265 * ========================================================================== */
266
274const char *utf8_strcasestr(const char *haystack, const char *needle) {
275 if (!haystack || !needle) {
276 SET_ERRNO(ERROR_INVALID_PARAM, "invalid params");
277 return NULL;
278 }
279
280 // Empty needle matches at start of haystack
281 if (needle[0] == '\0') {
282 return haystack;
283 }
284
285 // Get lengths
286 size_t haystack_len = strlen(haystack);
287 size_t needle_len = strlen(needle);
288
289 if (needle_len > haystack_len) {
290 return NULL;
291 }
292
293 // Case-fold both strings using utf8proc
294 // UTF8PROC_CASEFOLD performs Unicode case folding
295 // UTF8PROC_STABLE ensures stable output
296 // UTF8PROC_COMPOSE normalizes composed characters
297 utf8proc_option_t options = UTF8PROC_CASEFOLD | UTF8PROC_STABLE | UTF8PROC_COMPOSE;
298
299 // Case-fold the needle (pattern to search for)
300 utf8proc_uint8_t *needle_folded = NULL;
301 utf8proc_ssize_t needle_folded_len =
302 utf8proc_map((const utf8proc_uint8_t *)needle, (utf8proc_ssize_t)needle_len, &needle_folded, options);
303
304 if (needle_folded_len < 0 || !needle_folded) {
305 // Invalid UTF-8 in needle
306 if (needle_folded) {
307 free(needle_folded);
308 }
309 return NULL;
310 }
311
312 // Try each position in haystack
313 const char *haystack_pos = haystack;
314 while (*haystack_pos != '\0') {
315 // Calculate remaining haystack length
316 size_t remaining = haystack_len - (size_t)(haystack_pos - haystack);
317
318 if (remaining < needle_len) {
319 // Not enough characters left to match
320 break;
321 }
322
323 // Case-fold the current haystack window
324 utf8proc_uint8_t *haystack_folded = NULL;
325 utf8proc_ssize_t haystack_folded_len =
326 utf8proc_map((const utf8proc_uint8_t *)haystack_pos, (utf8proc_ssize_t)needle_len, &haystack_folded, options);
327
328 if (haystack_folded_len >= 0 && haystack_folded) {
329 // Compare case-folded strings
330 if ((size_t)haystack_folded_len == (size_t)needle_folded_len &&
331 memcmp(haystack_folded, needle_folded, (size_t)needle_folded_len) == 0) {
332 // Match found!
333 free(haystack_folded);
334 free(needle_folded);
335 return haystack_pos;
336 }
337 free(haystack_folded);
338 }
339
340 // Move to next UTF-8 character in haystack
341 utf8proc_int32_t codepoint;
342 utf8proc_ssize_t bytes = utf8proc_iterate((const utf8proc_uint8_t *)haystack_pos, -1, &codepoint);
343 if (bytes <= 0) {
344 // Invalid UTF-8, move by one byte
345 haystack_pos++;
346 } else {
347 haystack_pos += bytes;
348 }
349 }
350
351 free(needle_folded);
352 return NULL;
353}
int utf8_display_width_n(const char *str, size_t max_bytes)
Definition utf8.c:92
bool utf8_is_ascii_only(const char *str)
Definition utf8.c:167
bool utf8_is_valid(const char *str)
Definition utf8.c:158
int utf8_decode(const uint8_t *s, uint32_t *codepoint)
Definition utf8.c:18
size_t utf8_to_codepoints(const char *str, uint32_t *out_codepoints, size_t max_codepoints)
Definition utf8.c:187
int utf8_display_width(const char *str)
Definition utf8.c:46
int utf8_read_and_insert_continuation_bytes(char *buffer, size_t *cursor, size_t *len, size_t max_len, int continuation_bytes, int(*read_byte_fn)(void))
Definition utf8.c:240
int utf8_next_char_bytes(const char *str, size_t max_bytes)
Definition utf8.c:207
const char * utf8_strcasestr(const char *haystack, const char *needle)
Case-insensitive substring search with full Unicode support.
Definition utf8.c:274
int utf8_continuation_bytes_needed(unsigned char first_byte)
Definition utf8.c:224
size_t utf8_char_count(const char *str)
Definition utf8.c:138