ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
url.c
Go to the documentation of this file.
1
11#include "ascii-chat/common/error_codes.h"
12#include <ascii-chat/util/url.h>
13#include <ascii-chat/common.h>
14#include <ascii-chat/util/pcre2.h>
15#include <pcre2.h>
16#include <string.h>
17#include <stdlib.h>
18#include <stdio.h>
19
/* ═══════════════════════════════════════════════════════════════════════════
 * PRODUCTION-GRADE URL REGEX (Diego Perini, MIT License, extended for WebSocket)
 *
 * Supports: http/https/ws/wss, dotted-quad IPv4 (private and loopback ranges
 *           included), bracketed IPv6 with %25-encoded zone IDs, hostnames,
 *           localhost
 * Rejects:  IPv4 multicast (224-239.x.x.x) and 255.255.255.255, schemeless URLs,
 *           non-http(s)/ws(s) schemes
 *
 * Named groups: scheme, userinfo, host, ipv6, port, path_query_fragment.
 * The pattern is anchored at both ends (^...$), so the whole subject must match.
 * ═══════════════════════════════════════════════════════════════════════════ */

static const char *URL_REGEX_PATTERN =
    // SCHEME: http, https, ws, or wss (case-insensitivity comes from the compile flags,
    // not from the pattern), followed by an optional "user[:password]@" userinfo part
    "^(?<scheme>https?|wss?)://(?:(?<userinfo>\\S+(?::\\S*)?)@)?"
    // HOST: one of three alternatives below
    "(?<host>"
    "(?:"
    // IPv4 ADDRESS: e.g. 192.168.1.1
    // Negative lookaheads reject multicast (224-239) and broadcast (255.255.255.255)
    "(?!(?:22[4-9]|23\\d)(?:\\.\\d{1,3}){3})(?!255\\.255\\.255\\.255)"
    // First octet: 0-223 or 240-255 (224-239 is already excluded by the lookahead above)
    "(?:[0-9]\\d?|1\\d\\d|2[01]\\d|22[0-3]|24\\d|25[0-5])"
    // Second and third octets: 0-255 (repeated twice)
    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}"
    // Fourth octet: 0-255
    "(?:\\.(?:[0-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-5]))"
    ")"
    // IPv6 ADDRESS: e.g. [::1] or [fe80::1%25eth0]
    // The brackets are part of the "host" group; "ipv6" captures what is inside them.
    // Only the character set (hex digits, ':' and '.') is checked, not the address structure.
    // A zone ID must use the percent-encoded separator "%25" (e.g. %25eth0 for link-local addresses)
    "|(?:\\[(?<ipv6>[a-fA-F0-9:.]+(?:%25[a-zA-Z0-9._~!$&'()*+,;=-]+)?)\\])"
    // HOSTNAME: e.g. example.com, localhost, or international domain names
    // Negative lookahead rejects bare IP notation (digits.digits.digits format), so an
    // address refused by the IPv4 alternative cannot be accepted as a hostname instead.
    // Each label is 1-64 characters; a single trailing dot is allowed.
    "|(?!\\d+(?:\\.\\d+)*(?:[:/"
    "?#]|$))(?:[a-z0-9_\\x{00a1}-\\x{ffff}][a-z0-9\\x{00a1}-\\x{ffff}_-]{0,62})?[a-z0-9_\\x{00a1}-\\x{ffff}](?:\\.(?:["
    "a-z0-9_\\x{00a1}-\\x{ffff}][a-z0-9\\x{00a1}-\\x{ffff}_-]{0,62})?[a-z0-9_\\x{00a1}-\\x{ffff}])*\\.?"
    ")"
    // PORT: optional :port (1-5 digits, e.g. :8080, :443)
    // The pattern does not enforce the 65535 upper bound; values up to 99999 match.
    "(?::(?<port>\\d{1,5}))?"
    // PATH/QUERY/FRAGMENT: optional /path, ?query, or #fragment (captured as one group)
    "(?<path_query_fragment>[/?#]\\S*)?$";
56
57/* ═══════════════════════════════════════════════════════════════════════════
58 * PCRE2 REGEX VALIDATOR STATE
59 *
60 * Global singleton with lazy initialization via centralized PCRE2 module.
61 * Compiled regex is read-only after initialization, safe for concurrent reads.
62 * ═══════════════════════════════════════════════════════════════════════════ */
63
64static pcre2_singleton_t *g_url_regex = NULL;
65
70static pcre2_code *url_regex_get(void) {
71 if (g_url_regex == NULL) {
72 g_url_regex = asciichat_pcre2_singleton_compile(URL_REGEX_PATTERN, PCRE2_CASELESS | PCRE2_UCP | PCRE2_UTF);
73 }
74 return asciichat_pcre2_singleton_get_code(g_url_regex);
75}
76
77/* ═══════════════════════════════════════════════════════════════════════════
78 * PUBLIC API IMPLEMENTATION
79 * ═══════════════════════════════════════════════════════════════════════════ */
80
81bool url_is_valid(const char *url) {
82 if (!url || !*url) {
83 SET_ERRNO(ERROR_INVALID_PARAM, "URL is NULL or empty");
84 return false;
85 }
86
87 pcre2_code *regex = url_regex_get();
88 if (!regex) {
89 return false;
90 }
91
92 /* Check if URL needs http:// prefix (bare hostname or IP) */
93 char url_with_scheme[2048];
94 const char *url_to_match = url;
95
96 if (!strstr(url, "://")) {
97 /* No scheme - check if it looks like a bare hostname/IP */
98
99 /* Reject bare scheme words like "http", "https", "ftp" */
100 if (strcmp(url, "http") == 0 || strcmp(url, "https") == 0 || strcmp(url, "ftp") == 0 || strcmp(url, "ftps") == 0) {
101 return false;
102 }
103
104 /* Reject URLs that look like malformed schemes (http/ instead of http://) */
105 if (strncmp(url, "http/", 5) == 0 || strncmp(url, "https/", 6) == 0) {
106 return false;
107 }
108
109 /* Reject if it contains @ (email-like) */
110 if (strchr(url, '@')) {
111 return false;
112 }
113
114 /* Reject pure hex strings (raw keys, not hostnames) */
115 if (strlen(url) == 64) {
116 bool all_hex = true;
117 for (const char *p = url; *p && all_hex; p++) {
118 if (!((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f') || (*p >= 'A' && *p <= 'F'))) {
119 all_hex = false;
120 }
121 }
122 if (all_hex) {
123 return false; /* Looks like raw hex key, not hostname */
124 }
125 }
126
127 /* Check colon handling */
128 const char *colon_pos = strchr(url, ':');
129 if (colon_pos) {
130 /* Has colon - reject unless colon is followed by port number */
131 const char *after_colon = colon_pos + 1;
132 bool looks_like_port = true;
133 for (const char *p = after_colon; *p && *p != '/'; p++) {
134 if (!(*p >= '0' && *p <= '9')) {
135 looks_like_port = false;
136 break;
137 }
138 }
139 if (!looks_like_port) {
140 return false; /* Colon but not a port number */
141 }
142 }
143
144 /* Looks like a bare hostname/IP - prepend http:// */
145 int result = snprintf(url_with_scheme, sizeof(url_with_scheme), "http://%s", url);
146 if (result < 0 || result >= (int)sizeof(url_with_scheme)) {
147 return false; /* URL too long */
148 }
149 url_to_match = url_with_scheme;
150 }
151
152 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(regex, NULL);
153 if (!match_data) {
154 return false;
155 }
156
157 /* Perform JIT match (falls back to interpreted if JIT unavailable) */
158 int rc = pcre2_jit_match(regex, (PCRE2_SPTR)url_to_match, strlen(url_to_match), 0, /* startoffset */
159 0, /* options */
160 match_data, NULL); /* mcontext */
161
162 pcre2_match_data_free(match_data);
163 return rc >= 0; /* rc >= 0 means successful match */
164}
165
166asciichat_error_t url_parse(const char *url, url_parts_t *parts_out) {
167 if (!url || !*url) {
168 return SET_ERRNO(ERROR_INVALID_PARAM, "URL is NULL or empty");
169 }
170
171 if (!parts_out) {
172 return SET_ERRNO(ERROR_INVALID_PARAM, "parts_out is NULL");
173 }
174
175 /* Clear output structure */
176 memset(parts_out, 0, sizeof(*parts_out));
177
178 pcre2_code *regex = url_regex_get();
179 if (!regex) {
180 return SET_ERRNO(ERROR_CONFIG, "URL validator not initialized");
181 }
182
183 /* Check if URL needs http:// prefix (bare hostname or IP) */
184 // allocate twice the "safe limit" of 2048 for website URLs, even though modern browsers can handle up
185 // to 80k character URLs in some cases.
186 char url_with_scheme[4096];
187 const char *url_to_match = url;
188 const char *original_url = url;
189
190 if (!strstr(url, "://")) {
191 /* No scheme - check if it looks like a bare hostname/IP */
192
193 /* Reject bare scheme words like "http", "https", "ftp" */
194 if (strcmp(url, "http") == 0 || strcmp(url, "https") == 0 || strcmp(url, "ftp") == 0 || strcmp(url, "ftps") == 0) {
195 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format: %s", url);
196 }
197
198 /* Reject URLs that look like malformed schemes (http/ instead of http://) */
199 if (strncmp(url, "http/", 5) == 0 || strncmp(url, "https/", 6) == 0) {
200 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format (looks like malformed scheme): %s", url);
201 }
202
203 /* Reject pure hex strings (raw keys, not hostnames) */
204 if (strlen(url) == 64) {
205 bool all_hex = true;
206 for (const char *p = url; *p && all_hex; p++) {
207 if (!((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f') || (*p >= 'A' && *p <= 'F'))) {
208 all_hex = false;
209 }
210 }
211 if (all_hex) {
212 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL: appears to be raw hex data, not a URL");
213 }
214 }
215
216 const char *colon_pos = strchr(url, ':');
217 if (colon_pos) {
218 /* Has colon - check if what follows is numeric (port) */
219 const char *after_colon = colon_pos + 1;
220 bool looks_like_port = true;
221 for (const char *p = after_colon; *p && *p != '/'; p++) {
222 if (!(*p >= '0' && *p <= '9')) {
223 looks_like_port = false;
224 break;
225 }
226 }
227 if (!looks_like_port) {
228 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format (invalid scheme): %s", url);
229 }
230 }
231
232 /* Looks like a bare hostname/IP - prepend http:// */
233 int result = snprintf(url_with_scheme, sizeof(url_with_scheme), "http://%s", url);
234 if (result < 0 || result >= (int)sizeof(url_with_scheme)) {
235 return SET_ERRNO(ERROR_INVALID_PARAM, "URL too long");
236 }
237 url_to_match = url_with_scheme;
238 }
239
240 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(regex, NULL);
241 if (!match_data) {
242 return SET_ERRNO(ERROR_MEMORY, "Failed to create match data");
243 }
244
245 /* Perform JIT match (falls back to interpreted if JIT unavailable) */
246 int rc = pcre2_jit_match(regex, (PCRE2_SPTR)url_to_match, strlen(url_to_match), 0, /* startoffset */
247 0, /* options */
248 match_data, NULL); /* mcontext */
249
250 if (rc < 0) {
251 pcre2_match_data_free(match_data);
252 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format: %s", original_url);
253 }
254
255 /* Extract named groups */
256 parts_out->scheme = asciichat_pcre2_extract_named_group(regex, match_data, "scheme", url_to_match);
257 parts_out->userinfo = asciichat_pcre2_extract_named_group(regex, match_data, "userinfo", url_to_match);
258 parts_out->host = asciichat_pcre2_extract_named_group(regex, match_data, "host", url_to_match);
259 parts_out->ipv6 = asciichat_pcre2_extract_named_group(regex, match_data, "ipv6", url_to_match);
260 parts_out->path = asciichat_pcre2_extract_named_group(regex, match_data, "path_query_fragment", url_to_match);
261
262 /* Extract port number */
263 parts_out->port = 0;
264 char *port_str = asciichat_pcre2_extract_named_group(regex, match_data, "port", url_to_match);
265 if (port_str) {
266 parts_out->port = (int)strtol(port_str, NULL, 10);
267 SAFE_FREE(port_str);
268 }
269
270 pcre2_match_data_free(match_data);
271
272 /* Verify we got required fields */
273 if (!parts_out->scheme || !parts_out->host) {
274 url_parts_destroy(parts_out);
275 return SET_ERRNO(ERROR_INVALID_PARAM, "Missing required URL components");
276 }
277
278 return ASCIICHAT_OK;
279}
280
281void url_parts_destroy(url_parts_t *parts) {
282 if (!parts) {
283 SET_ERRNO(ERROR_INVALID_PARAM, "parts is NULL");
284 return;
285 }
286
287 SAFE_FREE(parts->scheme);
288 SAFE_FREE(parts->userinfo);
289 SAFE_FREE(parts->host);
290 SAFE_FREE(parts->ipv6);
291 SAFE_FREE(parts->path);
292 parts->port = 0;
293
294 memset(parts, 0, sizeof(*parts));
295}
296
297bool url_is_websocket_scheme(const char *scheme) {
298 if (!scheme || !*scheme) {
299 SET_ERRNO(ERROR_INVALID_PARAM, "scheme is NULL or empty");
300 return false;
301 }
302
303 /* Case-insensitive comparison for "ws" or "wss" */
304 return (strcasecmp(scheme, "ws") == 0 || strcasecmp(scheme, "wss") == 0);
305}
306
307bool url_is_websocket(const char *url) {
308 if (!url || !*url) {
309 SET_ERRNO(ERROR_INVALID_PARAM, "url is NULL or empty");
310 return false;
311 }
312
313 /* Parse URL to validate and check scheme */
314 url_parts_t parts = {0};
315 asciichat_error_t result = url_parse(url, &parts);
316 bool is_ws = (result == ASCIICHAT_OK && url_is_websocket_scheme(parts.scheme));
317 url_parts_destroy(&parts);
318
319 return is_ws;
320}
321
322bool url_looks_like_websocket(const char *url) {
323 if (!url || !*url) {
324 SET_ERRNO(ERROR_INVALID_PARAM, "url is NULL or empty");
325 return false;
326 }
327
328 /* Quick check for ws:// or wss:// prefix (case-insensitive) */
329 return (strncasecmp(url, "ws://", 5) == 0 || strncasecmp(url, "wss://", 6) == 0);
330}
pcre2_code * asciichat_pcre2_singleton_get_code(pcre2_singleton_t *singleton)
Get the compiled pcre2_code from a singleton handle.
Definition pcre2.c:95
char * asciichat_pcre2_extract_named_group(pcre2_code *regex, pcre2_match_data *match_data, const char *group_name, const char *subject)
Extract named substring from PCRE2 match data.
Definition pcre2.c:252
Represents a thread-safe compiled PCRE2 regex singleton.
Definition pcre2.c:21
void url_parts_destroy(url_parts_t *parts)
Definition url.c:281
bool url_is_valid(const char *url)
Definition url.c:81
bool url_is_websocket_scheme(const char *scheme)
Definition url.c:297
bool url_is_websocket(const char *url)
Definition url.c:307
asciichat_error_t url_parse(const char *url, url_parts_t *parts_out)
Definition url.c:166
bool url_looks_like_websocket(const char *url)
Definition url.c:322