ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
url.c File Reference

Production-grade URL parsing and validation using PCRE2. More...

Go to the source code of this file.

Functions

bool url_is_valid (const char *url)
 
asciichat_error_t url_parse (const char *url, url_parts_t *parts_out)
 
void url_parts_destroy (url_parts_t *parts)
 
bool url_is_websocket_scheme (const char *scheme)
 
bool url_is_websocket (const char *url)
 
bool url_looks_like_websocket (const char *url)
 

Detailed Description

Production-grade URL parsing and validation using PCRE2.

Implements robust HTTP(S) URL validation using the production-grade regex by Diego Perini (MIT License), compiled with PCRE2 and JIT for high performance.

Definition in file url.c.

Function Documentation

◆ url_is_valid()

bool url_is_valid ( const char *  url)

Definition at line 81 of file url.c.

81 {
82 if (!url || !*url) {
83 SET_ERRNO(ERROR_INVALID_PARAM, "URL is NULL or empty");
84 return false;
85 }
86
87 pcre2_code *regex = url_regex_get();
88 if (!regex) {
89 return false;
90 }
91
92 /* Check if URL needs http:// prefix (bare hostname or IP) */
93 char url_with_scheme[2048];
94 const char *url_to_match = url;
95
96 if (!strstr(url, "://")) {
97 /* No scheme - check if it looks like a bare hostname/IP */
98
99 /* Reject bare scheme words like "http", "https", "ftp" */
100 if (strcmp(url, "http") == 0 || strcmp(url, "https") == 0 || strcmp(url, "ftp") == 0 || strcmp(url, "ftps") == 0) {
101 return false;
102 }
103
104 /* Reject URLs that look like malformed schemes (http/ instead of http://) */
105 if (strncmp(url, "http/", 5) == 0 || strncmp(url, "https/", 6) == 0) {
106 return false;
107 }
108
109 /* Reject if it contains @ (email-like) */
110 if (strchr(url, '@')) {
111 return false;
112 }
113
114 /* Reject pure hex strings (raw keys, not hostnames) */
115 if (strlen(url) == 64) {
116 bool all_hex = true;
117 for (const char *p = url; *p && all_hex; p++) {
118 if (!((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f') || (*p >= 'A' && *p <= 'F'))) {
119 all_hex = false;
120 }
121 }
122 if (all_hex) {
123 return false; /* Looks like raw hex key, not hostname */
124 }
125 }
126
127 /* Check colon handling */
128 const char *colon_pos = strchr(url, ':');
129 if (colon_pos) {
130 /* Has colon - reject unless colon is followed by port number */
131 const char *after_colon = colon_pos + 1;
132 bool looks_like_port = true;
133 for (const char *p = after_colon; *p && *p != '/'; p++) {
134 if (!(*p >= '0' && *p <= '9')) {
135 looks_like_port = false;
136 break;
137 }
138 }
139 if (!looks_like_port) {
140 return false; /* Colon but not a port number */
141 }
142 }
143
144 /* Looks like a bare hostname/IP - prepend http:// */
145 int result = snprintf(url_with_scheme, sizeof(url_with_scheme), "http://%s", url);
146 if (result < 0 || result >= (int)sizeof(url_with_scheme)) {
147 return false; /* URL too long */
148 }
149 url_to_match = url_with_scheme;
150 }
151
152 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(regex, NULL);
153 if (!match_data) {
154 return false;
155 }
156
157 /* Perform JIT match (falls back to interpreted if JIT unavailable) */
158 int rc = pcre2_jit_match(regex, (PCRE2_SPTR)url_to_match, strlen(url_to_match), 0, /* startoffset */
159 0, /* options */
160 match_data, NULL); /* mcontext */
161
162 pcre2_match_data_free(match_data);
163 return rc >= 0; /* rc >= 0 means successful match */
164}

Referenced by ffmpeg_decoder_create(), options_init(), parse_public_key(), and yt_dlp_extract_stream_url().

◆ url_is_websocket()

bool url_is_websocket ( const char *  url)

Definition at line 307 of file url.c.

307 {
308 if (!url || !*url) {
309 SET_ERRNO(ERROR_INVALID_PARAM, "url is NULL or empty");
310 return false;
311 }
312
313 /* Parse URL to validate and check scheme */
314 url_parts_t parts = {0};
315 asciichat_error_t result = url_parse(url, &parts);
316 bool is_ws = (result == ASCIICHAT_OK && url_is_websocket_scheme(parts.scheme));
317 url_parts_destroy(&parts);
318
319 return is_ws;
320}
void url_parts_destroy(url_parts_t *parts)
Definition url.c:281
bool url_is_websocket_scheme(const char *scheme)
Definition url.c:297
asciichat_error_t url_parse(const char *url, url_parts_t *parts_out)
Definition url.c:166

References url_is_websocket_scheme(), url_parse(), and url_parts_destroy().

Referenced by client_main(), connection_attempt_tcp(), server_connection_establish(), and session_client_like_run().

◆ url_is_websocket_scheme()

bool url_is_websocket_scheme ( const char *  scheme)

Definition at line 297 of file url.c.

297 {
298 if (!scheme || !*scheme) {
299 SET_ERRNO(ERROR_INVALID_PARAM, "scheme is NULL or empty");
300 return false;
301 }
302
303 /* Case-insensitive comparison for "ws" or "wss" */
304 return (strcasecmp(scheme, "ws") == 0 || strcasecmp(scheme, "wss") == 0);
305}

Referenced by url_is_websocket().

◆ url_looks_like_websocket()

bool url_looks_like_websocket ( const char *  url)

Definition at line 322 of file url.c.

322 {
323 if (!url || !*url) {
324 SET_ERRNO(ERROR_INVALID_PARAM, "url is NULL or empty");
325 return false;
326 }
327
328 /* Quick check for ws:// or wss:// prefix (case-insensitive) */
329 return (strncasecmp(url, "ws://", 5) == 0 || strncasecmp(url, "wss://", 6) == 0);
330}

◆ url_parse()

asciichat_error_t url_parse ( const char *  url,
url_parts_t *  parts_out 
)

Definition at line 166 of file url.c.

166 {
167 if (!url || !*url) {
168 return SET_ERRNO(ERROR_INVALID_PARAM, "URL is NULL or empty");
169 }
170
171 if (!parts_out) {
172 return SET_ERRNO(ERROR_INVALID_PARAM, "parts_out is NULL");
173 }
174
175 /* Clear output structure */
176 memset(parts_out, 0, sizeof(*parts_out));
177
178 pcre2_code *regex = url_regex_get();
179 if (!regex) {
180 return SET_ERRNO(ERROR_CONFIG, "URL validator not initialized");
181 }
182
183 /* Check if URL needs http:// prefix (bare hostname or IP) */
184 // allocate twice the "safe limit" of 2048 for website URLs, even though modern browsers can handle up
185 // to 80k character URLs in some cases.
186 char url_with_scheme[4096];
187 const char *url_to_match = url;
188 const char *original_url = url;
189
190 if (!strstr(url, "://")) {
191 /* No scheme - check if it looks like a bare hostname/IP */
192
193 /* Reject bare scheme words like "http", "https", "ftp" */
194 if (strcmp(url, "http") == 0 || strcmp(url, "https") == 0 || strcmp(url, "ftp") == 0 || strcmp(url, "ftps") == 0) {
195 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format: %s", url);
196 }
197
198 /* Reject URLs that look like malformed schemes (http/ instead of http://) */
199 if (strncmp(url, "http/", 5) == 0 || strncmp(url, "https/", 6) == 0) {
200 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format (looks like malformed scheme): %s", url);
201 }
202
203 /* Reject pure hex strings (raw keys, not hostnames) */
204 if (strlen(url) == 64) {
205 bool all_hex = true;
206 for (const char *p = url; *p && all_hex; p++) {
207 if (!((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f') || (*p >= 'A' && *p <= 'F'))) {
208 all_hex = false;
209 }
210 }
211 if (all_hex) {
212 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL: appears to be raw hex data, not a URL");
213 }
214 }
215
216 const char *colon_pos = strchr(url, ':');
217 if (colon_pos) {
218 /* Has colon - check if what follows is numeric (port) */
219 const char *after_colon = colon_pos + 1;
220 bool looks_like_port = true;
221 for (const char *p = after_colon; *p && *p != '/'; p++) {
222 if (!(*p >= '0' && *p <= '9')) {
223 looks_like_port = false;
224 break;
225 }
226 }
227 if (!looks_like_port) {
228 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format (invalid scheme): %s", url);
229 }
230 }
231
232 /* Looks like a bare hostname/IP - prepend http:// */
233 int result = snprintf(url_with_scheme, sizeof(url_with_scheme), "http://%s", url);
234 if (result < 0 || result >= (int)sizeof(url_with_scheme)) {
235 return SET_ERRNO(ERROR_INVALID_PARAM, "URL too long");
236 }
237 url_to_match = url_with_scheme;
238 }
239
240 pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(regex, NULL);
241 if (!match_data) {
242 return SET_ERRNO(ERROR_MEMORY, "Failed to create match data");
243 }
244
245 /* Perform JIT match (falls back to interpreted if JIT unavailable) */
246 int rc = pcre2_jit_match(regex, (PCRE2_SPTR)url_to_match, strlen(url_to_match), 0, /* startoffset */
247 0, /* options */
248 match_data, NULL); /* mcontext */
249
250 if (rc < 0) {
251 pcre2_match_data_free(match_data);
252 return SET_ERRNO(ERROR_INVALID_PARAM, "Invalid URL format: %s", original_url);
253 }
254
255 /* Extract named groups */
256 parts_out->scheme = asciichat_pcre2_extract_named_group(regex, match_data, "scheme", url_to_match);
257 parts_out->userinfo = asciichat_pcre2_extract_named_group(regex, match_data, "userinfo", url_to_match);
258 parts_out->host = asciichat_pcre2_extract_named_group(regex, match_data, "host", url_to_match);
259 parts_out->ipv6 = asciichat_pcre2_extract_named_group(regex, match_data, "ipv6", url_to_match);
260 parts_out->path = asciichat_pcre2_extract_named_group(regex, match_data, "path_query_fragment", url_to_match);
261
262 /* Extract port number */
263 parts_out->port = 0;
264 char *port_str = asciichat_pcre2_extract_named_group(regex, match_data, "port", url_to_match);
265 if (port_str) {
266 parts_out->port = (int)strtol(port_str, NULL, 10);
267 SAFE_FREE(port_str);
268 }
269
270 pcre2_match_data_free(match_data);
271
272 /* Verify we got required fields */
273 if (!parts_out->scheme || !parts_out->host) {
274 url_parts_destroy(parts_out);
275 return SET_ERRNO(ERROR_INVALID_PARAM, "Missing required URL components");
276 }
277
278 return ASCIICHAT_OK;
279}
char * asciichat_pcre2_extract_named_group(pcre2_code *regex, pcre2_match_data *match_data, const char *group_name, const char *subject)
Extract named substring from PCRE2 match data.
Definition pcre2.c:252

References asciichat_pcre2_extract_named_group(), and url_parts_destroy().

Referenced by connection_attempt_tcp(), parse_public_key(), server_connection_establish(), and url_is_websocket().

◆ url_parts_destroy()

void url_parts_destroy ( url_parts_t *  parts)

Definition at line 281 of file url.c.

281 {
282 if (!parts) {
283 SET_ERRNO(ERROR_INVALID_PARAM, "parts is NULL");
284 return;
285 }
286
287 SAFE_FREE(parts->scheme);
288 SAFE_FREE(parts->userinfo);
289 SAFE_FREE(parts->host);
290 SAFE_FREE(parts->ipv6);
291 SAFE_FREE(parts->path);
292 parts->port = 0;
293
294 memset(parts, 0, sizeof(*parts));
295}

Referenced by connection_attempt_tcp(), parse_public_key(), server_connection_establish(), url_is_websocket(), and url_parse().