VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/misc/uri.cpp

Last change on this file was 101657, checked in by vboxsync, 7 months ago

Runtime/uri.cpp: Make it build on linux.arm64 with -Werror, bugref:10541

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 40.6 KB
Line 
1/* $Id: uri.cpp 101657 2023-10-30 13:17:13Z vboxsync $ */
2/** @file
3 * IPRT - Uniform Resource Identifier handling.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#include <iprt/uri.h>
42
43#include <iprt/assert.h>
44#include <iprt/ctype.h>
45#include <iprt/err.h>
46#include <iprt/path.h>
47#include <iprt/string.h>
48
49
50/*********************************************************************************************************************************
51* Defined Constants And Macros *
52*********************************************************************************************************************************/
53/** Internal magic value we use to check if a RTURIPARSED structure has made it thru RTUriParse. */
54#define RTURIPARSED_MAGIC UINT32_C(0x439e0745)
55
56
/* General URI format:

      foo://example.com:8042/over/there?name=ferret#nose
      \_/   \______________/\_________/ \_________/ \__/
       |           |            |            |        |
    scheme     authority       path        query   fragment
       |   _____________________|__
      / \ /                        \
      urn:example:animal:ferret:nose
*/
67
68
/**
 * Tests whether a character has to be % escaped in a URI.
 *
 * The excluded set is:
 *   - control = 00-1F
 *   - space   = ' '
 *   - delims  = '<' , '>' , '#' , '%' , '"'
 *   - unwise  = '{' , '}' , '|' , '\' , '^' , '[' , ']' , '`'
 *
 * @note The argument is expanded several times, so it must be free of side
 *       effects.
 * @note ARM defines char as unsigned by default in the AAPCS(64), so an
 *       explicit lower bound check against zero would trigger a compiler
 *       warning/error there.  Apple decided to ignore that and declares char
 *       as signed like on the other platforms.  This is why only the
 *       control/space range test differs between the two variants below.
 */

/** The part of the excluded set that lies above the control/space range (common to both URI_EXCLUDED variants). */
#define URI_EXCLUDED_ABOVE_SPACE(a) \
    (   ((a) >= 0x5B && (a) <= 0x5E) \
     || ((a) >= 0x7B && (a) <= 0x7D) \
     || (a) == '<' || (a) == '>' || (a) == '#' \
     || (a) == '%' || (a) == '"' || (a) == '`' )

#if defined(RT_OS_LINUX) \
 && (defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32))
/* char is unsigned here: no lower bound test. */
# define URI_EXCLUDED(a)    ( ((a) <= 0x20) || URI_EXCLUDED_ABOVE_SPACE(a) )
#else
/* char is signed here: negative values (bytes >= 0x80) must not match the control range. */
# define URI_EXCLUDED(a)    ( ((a) >= 0x0 && (a) <= 0x20) || URI_EXCLUDED_ABOVE_SPACE(a) )
#endif
96
97static char *rtUriPercentEncodeN(const char *pszString, size_t cchMax)
98{
99 if (!pszString)
100 return NULL;
101
102 int rc = VINF_SUCCESS;
103
104 size_t cbLen = RT_MIN(strlen(pszString), cchMax);
105 /* The new string can be max 3 times in size of the original string. */
106 char *pszNew = RTStrAlloc(cbLen * 3 + 1);
107 if (!pszNew)
108 return NULL;
109
110 char *pszRes = NULL;
111 size_t iIn = 0;
112 size_t iOut = 0;
113 while (iIn < cbLen)
114 {
115 if (URI_EXCLUDED(pszString[iIn]))
116 {
117 char szNum[3] = { 0, 0, 0 };
118 RTStrFormatU8(&szNum[0], 3, pszString[iIn++], 16, 2, 2, RTSTR_F_CAPITAL | RTSTR_F_ZEROPAD);
119 pszNew[iOut++] = '%';
120 pszNew[iOut++] = szNum[0];
121 pszNew[iOut++] = szNum[1];
122 }
123 else
124 pszNew[iOut++] = pszString[iIn++];
125 }
126 if (RT_SUCCESS(rc))
127 {
128 pszNew[iOut] = '\0';
129 if (iOut != iIn)
130 {
131 /* If the source and target strings have different size, recreate
132 * the target string with the correct size. */
133 pszRes = RTStrDupN(pszNew, iOut);
134 RTStrFree(pszNew);
135 }
136 else
137 pszRes = pszNew;
138 }
139 else
140 RTStrFree(pszNew);
141
142 return pszRes;
143}
144
145
/**
 * Works out how long a string becomes once it has been percent-encoded.
 *
 * @returns Number of chars (excluding the terminator); zero if @a pszString
 *          is NULL.
 * @param   pszString           The string to encode.
 * @param   cchMax              The maximum string length (e.g. RTSTR_MAX).
 * @param   fEncodeDosSlash     Whether to encode DOS slashes or not.
 */
static size_t rtUriCalcEncodedLength(const char *pszString, size_t cchMax, bool fEncodeDosSlash)
{
    if (!pszString)
        return 0;

    size_t const cchSrc    = RTStrNLen(pszString, cchMax);
    size_t       cchResult = 0;
    for (size_t i = 0; i < cchSrc; i++)
    {
        char const ch = pszString[i];

        /* Excluded chars become "%XX" (3 chars), except for the backslash
           when the caller wants DOS slashes left alone. */
        bool const fEscape = URI_EXCLUDED(ch) && (ch != '\\' || fEncodeDosSlash);
        cchResult += fEscape ? 3 : 1;
    }
    return cchResult;
}
171
172
173/**
174 * Encodes an URI into a caller allocated buffer.
175 *
176 * @returns IPRT status code.
177 * @param pszString The string to encode.
178 * @param cchMax The maximum string length (e.g. RTSTR_MAX).
179 * @param fEncodeDosSlash Whether to encode DOS slashes or not.
180 * @param pszDst The destination buffer.
181 * @param cbDst The size of the destination buffer.
182 */
183static int rtUriEncodeIntoBuffer(const char *pszString, size_t cchMax, bool fEncodeDosSlash, char *pszDst, size_t cbDst)
184{
185 AssertReturn(pszString, VERR_INVALID_POINTER);
186 AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
187
188 /*
189 * We do buffer size checking up front and every time we encode a special
190 * character. That's faster than checking for each char.
191 */
192 size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
193 AssertMsgReturn(cbDst > cchSrcLeft, ("cbDst=%zu cchSrcLeft=%zu\n", cbDst, cchSrcLeft), VERR_BUFFER_OVERFLOW);
194 cbDst -= cchSrcLeft;
195
196 while (cchSrcLeft-- > 0)
197 {
198 char const ch = *pszString++;
199 if (!URI_EXCLUDED(ch) || (ch == '\\' && !fEncodeDosSlash))
200 *pszDst++ = ch;
201 else
202 {
203 AssertReturn(cbDst >= 3, VERR_BUFFER_OVERFLOW); /* 2 extra bytes + zero terminator. */
204 cbDst -= 2;
205
206 *pszDst++ = '%';
207 ssize_t cchTmp = RTStrFormatU8(pszDst, 3, (unsigned char)ch, 16, 2, 2, RTSTR_F_CAPITAL | RTSTR_F_ZEROPAD);
208 Assert(cchTmp == 2); NOREF(cchTmp);
209 pszDst += 2;
210 }
211 }
212
213 *pszDst = '\0';
214 return VINF_SUCCESS;
215}
216
217
/**
 * Percent-decodes a sub-string into a newly allocated buffer.
 *
 * Malformed escape sequences trigger an assertion and the '%' is copied
 * through unchanged.
 *
 * @returns The decoded, zero terminated string.  NULL on allocation failure,
 *          if @a pszString is bad, or if a terminator is found within the
 *          first @a cchString bytes.
 * @param   pszString   Start of the text to decode.
 * @param   cchString   Number of bytes to decode.
 */
static char *rtUriPercentDecodeN(const char *pszString, size_t cchString)
{
    AssertPtrReturn(pszString, NULL);
    AssertReturn(memchr(pszString, '\0', cchString) == NULL, NULL);

    /*
     * The new string can only get smaller, so use the input length as the
     * starting buffer size.
     */
    char *pszDecoded = RTStrAlloc(cchString + 1);
    if (!pszDecoded)
        return NULL;

    /*
     * Knowing that the pszString itself is valid UTF-8, we only have to
     * validate the escape sequences.
     */
    const char *pchCur       = pszString;
    size_t      cchRemaining = cchString;
    char       *pchOut       = pszDecoded;
    while (cchRemaining > 0)
    {
        const char *pchPct = (const char *)memchr(pchCur, '%', cchRemaining);
        if (!pchPct)
        {
            /* No more escapes, copy the tail and be done. */
            memcpy(pchOut, pchCur, cchRemaining);
            pchOut += cchRemaining;
            break;
        }

        /* Copy the plain text in front of the '%'. */
        size_t const cchPlain = pchPct - pchCur;
        if (cchPlain)
        {
            memcpy(pchOut, pchCur, cchPlain);
            pchOut       += cchPlain;
            pchCur       += cchPlain;
            cchRemaining -= cchPlain;
        }

        /* Try decode the "%XX" sequence we are now positioned at. */
        bool fDecoded = false;
        if (cchRemaining >= 3)
        {
            char const chHigh = pchCur[1];
            char const chLow  = pchCur[2];
            if (RT_C_IS_XDIGIT(chHigh) && RT_C_IS_XDIGIT(chLow))
            {
                /* Clearing bit 5 folds 'a'..'f' onto 'A'..'F'. */
                uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
                b <<= 4;
                b |= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
                *pchOut++ = (char)b;
                pchCur       += 3;
                cchRemaining -= 3;
                fDecoded = true;
            }
        }
        if (!fDecoded)
        {
            /* Bad or truncated escape sequence: pass the '%' thru as-is. */
            AssertFailed();
            *pchOut++ = *pchCur++;
            cchRemaining--;
        }
    }

    *pchOut = '\0';

    /*
     * If we've got a lot of spare room in the result string, reallocate it.
     */
    size_t const cchDecoded = pchOut - pszDecoded;
    Assert(cchDecoded <= cchString);
    if (cchString - cchDecoded > 64)
        RTStrRealloc(&pszDecoded, cchDecoded + 1);
    return pszDecoded;
}
292
293
/**
 * Works out how long a string becomes once it has been percent-decoded.
 *
 * Only well formed "%XX" sequences (two hex digits) shrink the result; a '%'
 * that is not followed by two hex digits counts as an ordinary character.
 *
 * @returns Number of chars (excluding the terminator); zero if @a pszString
 *          is NULL.
 * @param   pszString   The string to decode.
 * @param   cchMax      The maximum string length (e.g. RTSTR_MAX).
 */
static size_t rtUriCalcDecodedLength(const char *pszString, size_t cchMax)
{
    if (!pszString)
        return 0;

    size_t const cchSrc     = RTStrNLen(pszString, cchMax);
    size_t       cchDecoded = cchSrc;
    size_t       off        = 0;
    while (off < cchSrc)
    {
        if (   pszString[off] == '%'
            && cchSrc - off >= 3
            && RT_C_IS_XDIGIT(pszString[off + 1])
            && RT_C_IS_XDIGIT(pszString[off + 2]))
        {
            /* Three source chars collapse into a single decoded one. */
            cchDecoded -= 2;
            off        += 3;
        }
        else
            off += 1;
    }
    return cchDecoded;
}
326
327
328/**
329 * Decodes a string into a buffer.
330 *
331 * @returns IPRT status code.
332 * @param pchSrc The source string.
333 * @param cchSrc The max number of bytes to decode in the source string.
334 * @param pszDst The destination buffer.
335 * @param cbDst The size of the buffer (including terminator).
336 */
337static int rtUriDecodeIntoBuffer(const char *pchSrc, size_t cchSrc, char *pszDst, size_t cbDst)
338{
339 AssertPtrReturn(pchSrc, VERR_INVALID_POINTER);
340 AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
341
342 /*
343 * Knowing that the pszString itself is valid UTF-8, we only have to
344 * validate the escape sequences.
345 */
346 cchSrc = RTStrNLen(pchSrc, cchSrc);
347 while (cchSrc > 0)
348 {
349 const char *pchPct = (const char *)memchr(pchSrc, '%', cchSrc);
350 if (pchPct)
351 {
352 size_t cchBefore = pchPct - pchSrc;
353 AssertReturn(cchBefore + 1 < cbDst, VERR_BUFFER_OVERFLOW);
354 if (cchBefore)
355 {
356 memcpy(pszDst, pchSrc, cchBefore);
357 pszDst += cchBefore;
358 cbDst -= cchBefore;
359 pchSrc += cchBefore;
360 cchSrc -= cchBefore;
361 }
362
363 char chHigh, chLow;
364 if ( cchSrc >= 3
365 && RT_C_IS_XDIGIT(chHigh = pchSrc[1])
366 && RT_C_IS_XDIGIT(chLow = pchSrc[2]))
367 {
368 uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
369 b <<= 4;
370 b |= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
371 *pszDst++ = (char)b;
372 pchSrc += 3;
373 cchSrc -= 3;
374 }
375 else
376 {
377 AssertFailed();
378 *pszDst++ = *pchSrc++;
379 cchSrc--;
380 }
381 cbDst -= 1;
382 }
383 else
384 {
385 AssertReturn(cchSrc < cbDst, VERR_BUFFER_OVERFLOW);
386 memcpy(pszDst, pchSrc, cchSrc);
387 pszDst += cchSrc;
388 cbDst -= cchSrc;
389 pchSrc += cchSrc;
390 cchSrc = 0;
391 break;
392 }
393 }
394
395 AssertReturn(cbDst > 0, VERR_BUFFER_OVERFLOW);
396 *pszDst = '\0';
397 return VINF_SUCCESS;
398}
399
400
401
402static int rtUriParse(const char *pszUri, PRTURIPARSED pParsed)
403{
404 /*
405 * Validate the input and clear the output.
406 */
407 AssertPtrReturn(pParsed, VERR_INVALID_POINTER);
408 RT_ZERO(*pParsed);
409 pParsed->uAuthorityPort = UINT32_MAX;
410
411 AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
412
413 size_t const cchUri = strlen(pszUri);
414 if (RT_LIKELY(cchUri >= 3)) { /* likely */ }
415 else return cchUri ? VERR_URI_TOO_SHORT : VERR_URI_EMPTY;
416
417 /*
418 * Validating escaped text sequences is much simpler if we know that
419 * that the base URI string is valid. Also, we don't necessarily trust
420 * the developer calling us to remember to do this.
421 */
422 int rc = RTStrValidateEncoding(pszUri);
423 AssertRCReturn(rc, rc);
424
425 /*
426 * RFC-3986, section 3.1:
427 * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
428 *
429 * The scheme ends with a ':', which we also skip here.
430 */
431 size_t off = 0;
432 char ch = pszUri[off++];
433 if (RT_LIKELY(RT_C_IS_ALPHA(ch))) { /* likely */ }
434 else return VERR_URI_INVALID_SCHEME;
435 for (;;)
436 {
437 ch = pszUri[off];
438 if (ch == ':')
439 break;
440 if (RT_LIKELY(RT_C_IS_ALNUM(ch) || ch == '.' || ch == '-' || ch == '+')) { /* likely */ }
441 else return VERR_URI_INVALID_SCHEME;
442 off++;
443 }
444 pParsed->cchScheme = off;
445
446 /* Require the scheme length to be at least two chars so we won't confuse
447 it with a path starting with a DOS drive letter specification. */
448 if (RT_LIKELY(off >= 2)) { /* likely */ }
449 else return VERR_URI_INVALID_SCHEME;
450
451 off++; /* (skip colon) */
452
453 /*
454 * Find the end of the path, we'll need this several times.
455 * Also, while we're potentially scanning the whole thing, check for '%'.
456 */
457 size_t const offHash = RTStrOffCharOrTerm(&pszUri[off], '#') + off;
458 size_t const offQuestionMark = RTStrOffCharOrTerm(&pszUri[off], '?') + off;
459
460 if (memchr(pszUri, '%', cchUri) != NULL)
461 pParsed->fFlags |= RTURIPARSED_F_CONTAINS_ESCAPED_CHARS;
462
463 /*
464 * RFC-3986, section 3.2:
465 * The authority component is preceeded by a double slash ("//")...
466 */
467 if ( pszUri[off] == '/'
468 && pszUri[off + 1] == '/')
469 {
470 off += 2;
471 pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
472 pParsed->fFlags |= RTURIPARSED_F_HAS_AUTHORITY;
473
474 /*
475 * RFC-3986, section 3.2:
476 * ...and is terminated by the next slash ("/"), question mark ("?"),
477 * or number sign ("#") character, or by the end of the URI.
478 */
479 const char *pszAuthority = &pszUri[off];
480 size_t cchAuthority = RTStrOffCharOrTerm(pszAuthority, '/');
481 cchAuthority = RT_MIN(cchAuthority, offHash - off);
482 cchAuthority = RT_MIN(cchAuthority, offQuestionMark - off);
483 pParsed->cchAuthority = cchAuthority;
484
485 /* The Authority can be empty, like for: file:///usr/bin/grep */
486 if (cchAuthority > 0)
487 {
488 pParsed->cchAuthorityHost = cchAuthority;
489
490 /*
491 * If there is a userinfo part, it is ended by a '@'.
492 */
493 const char *pszAt = (const char *)memchr(pszAuthority, '@', cchAuthority);
494 if (pszAt)
495 {
496 size_t cchTmp = pszAt - pszAuthority;
497 pParsed->offAuthorityHost += cchTmp + 1;
498 pParsed->cchAuthorityHost -= cchTmp + 1;
499
500 /* If there is a password part, it's separated from the username with a colon. */
501 const char *pszColon = (const char *)memchr(pszAuthority, ':', cchTmp);
502 if (pszColon)
503 {
504 pParsed->cchAuthorityUsername = pszColon - pszAuthority;
505 pParsed->offAuthorityPassword = &pszColon[1] - pszUri;
506 pParsed->cchAuthorityPassword = pszAt - &pszColon[1];
507 }
508 else
509 {
510 pParsed->cchAuthorityUsername = cchTmp;
511 pParsed->offAuthorityPassword = off + cchTmp;
512 }
513 }
514
515 /*
516 * If there is a port part, its after the last colon in the host part.
517 */
518 const char *pszColon = (const char *)memrchr(&pszUri[pParsed->offAuthorityHost], ':', pParsed->cchAuthorityHost);
519 if (pszColon)
520 {
521 size_t cchTmp = &pszUri[pParsed->offAuthorityHost + pParsed->cchAuthorityHost] - &pszColon[1];
522 pParsed->cchAuthorityHost -= cchTmp + 1;
523 pParsed->fFlags |= RTURIPARSED_F_HAS_PORT;
524 if (cchTmp > 0)
525 {
526 pParsed->uAuthorityPort = 0;
527 while (cchTmp-- > 0)
528 {
529 ch = *++pszColon;
530 if ( RT_C_IS_DIGIT(ch)
531 && pParsed->uAuthorityPort < UINT32_MAX / UINT32_C(10))
532 {
533 pParsed->uAuthorityPort *= 10;
534 pParsed->uAuthorityPort += ch - '0';
535 }
536 else
537 return VERR_URI_INVALID_PORT_NUMBER;
538 }
539 }
540 }
541 }
542
543 /* Skip past the authority. */
544 off += cchAuthority;
545 }
546 else
547 pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
548
549 /*
550 * RFC-3986, section 3.3: Path
551 * The path is terminated by the first question mark ("?")
552 * or number sign ("#") character, or by the end of the URI.
553 */
554 pParsed->offPath = off;
555 pParsed->cchPath = RT_MIN(offHash, offQuestionMark) - off;
556 off += pParsed->cchPath;
557
558 /*
559 * RFC-3986, section 3.4: Query
560 * The query component is indicated by the first question mark ("?")
561 * character and terminated by a number sign ("#") character or by the
562 * end of the URI.
563 */
564 if ( off == offQuestionMark
565 && off < cchUri)
566 {
567 Assert(pszUri[offQuestionMark] == '?');
568 pParsed->offQuery = ++off;
569 pParsed->cchQuery = offHash - off;
570 off = offHash;
571 }
572 else
573 {
574 Assert(!pszUri[offQuestionMark]);
575 pParsed->offQuery = off;
576 }
577
578 /*
579 * RFC-3986, section 3.5: Fragment
580 * A fragment identifier component is indicated by the presence of a
581 * number sign ("#") character and terminated by the end of the URI.
582 */
583 if ( off == offHash
584 && off < cchUri)
585 {
586 pParsed->offFragment = ++off;
587 pParsed->cchFragment = cchUri - off;
588 }
589 else
590 {
591 Assert(!pszUri[offHash]);
592 pParsed->offFragment = off;
593 }
594
595 /*
596 * If there are any escape sequences, validate them.
597 *
598 * This is reasonably simple as we already know that the string is valid UTF-8
599 * before they get decoded. Thus we only have to validate the escaped sequences.
600 */
601 if (pParsed->fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
602 {
603 const char *pchSrc = (const char *)memchr(pszUri, '%', cchUri);
604 AssertReturn(pchSrc, VERR_INTERNAL_ERROR);
605 do
606 {
607 char szUtf8Seq[8];
608 unsigned cchUtf8Seq = 0;
609 unsigned cchNeeded = 0;
610 size_t cchLeft = &pszUri[cchUri] - pchSrc;
611 do
612 {
613 if (cchLeft >= 3)
614 {
615 char chHigh = pchSrc[1];
616 char chLow = pchSrc[2];
617 if ( RT_C_IS_XDIGIT(chHigh)
618 && RT_C_IS_XDIGIT(chLow))
619 {
620 uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
621 b <<= 4;
622 b |= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
623
624 if (!(b & 0x80))
625 {
626 /* We don't want the string to be terminated prematurely. */
627 if (RT_LIKELY(b != 0)) { /* likely */ }
628 else return VERR_URI_ESCAPED_ZERO;
629
630 /* Check that we're not expecting more UTF-8 bytes. */
631 if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
632 else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
633 }
634 /* Are we waiting UTF-8 bytes? */
635 else if (cchNeeded > 0)
636 {
637 if (RT_LIKELY(!(b & 0x40))) { /* likely */ }
638 else return VERR_URI_INVALID_ESCAPED_UTF8_CONTINUATION_BYTE;
639
640 szUtf8Seq[cchUtf8Seq++] = (char)b;
641 if (--cchNeeded == 0)
642 {
643 szUtf8Seq[cchUtf8Seq] = '\0';
644 rc = RTStrValidateEncoding(szUtf8Seq);
645 if (RT_FAILURE(rc))
646 return VERR_URI_ESCAPED_CHARS_NOT_VALID_UTF8;
647 cchUtf8Seq = 0;
648 }
649 }
650 /* Start a new UTF-8 sequence. */
651 else
652 {
653 if ((b & 0xf8) == 0xf0)
654 cchNeeded = 3;
655 else if ((b & 0xf0) == 0xe0)
656 cchNeeded = 2;
657 else if ((b & 0xe0) == 0xc0)
658 cchNeeded = 1;
659 else
660 return VERR_URI_INVALID_ESCAPED_UTF8_LEAD_BYTE;
661 szUtf8Seq[0] = (char)b;
662 cchUtf8Seq = 1;
663 }
664 pchSrc += 3;
665 cchLeft -= 3;
666 }
667 else
668 return VERR_URI_INVALID_ESCAPE_SEQ;
669 }
670 else
671 return VERR_URI_INVALID_ESCAPE_SEQ;
672 } while (cchLeft > 0 && pchSrc[0] == '%');
673
674 /* Check that we're not expecting more UTF-8 bytes. */
675 if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
676 else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
677
678 /* next */
679 pchSrc = (const char *)memchr(pchSrc, '%', cchLeft);
680 } while (pchSrc);
681 }
682
683 pParsed->u32Magic = RTURIPARSED_MAGIC;
684 return VINF_SUCCESS;
685}
686
687
688RTDECL(int) RTUriParse(const char *pszUri, PRTURIPARSED pParsed)
689{
690 return rtUriParse(pszUri, pParsed);
691}
692
693
694RTDECL(char *) RTUriParsedScheme(const char *pszUri, PCRTURIPARSED pParsed)
695{
696 AssertPtrReturn(pszUri, NULL);
697 AssertPtrReturn(pParsed, NULL);
698 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
699 return RTStrDupN(pszUri, pParsed->cchScheme);
700}
701
702
703RTDECL(char *) RTUriParsedAuthority(const char *pszUri, PCRTURIPARSED pParsed)
704{
705 AssertPtrReturn(pszUri, NULL);
706 AssertPtrReturn(pParsed, NULL);
707 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
708 if (pParsed->cchAuthority || (pParsed->fFlags & RTURIPARSED_F_HAS_AUTHORITY))
709 return rtUriPercentDecodeN(&pszUri[pParsed->offAuthority], pParsed->cchAuthority);
710 return NULL;
711}
712
713
714RTDECL(char *) RTUriParsedAuthorityUsername(const char *pszUri, PCRTURIPARSED pParsed)
715{
716 AssertPtrReturn(pszUri, NULL);
717 AssertPtrReturn(pParsed, NULL);
718 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
719 if (pParsed->cchAuthorityUsername)
720 return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityUsername], pParsed->cchAuthorityUsername);
721 return NULL;
722}
723
724
725RTDECL(char *) RTUriParsedAuthorityPassword(const char *pszUri, PCRTURIPARSED pParsed)
726{
727 AssertPtrReturn(pszUri, NULL);
728 AssertPtrReturn(pParsed, NULL);
729 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
730 if (pParsed->cchAuthorityPassword)
731 return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityPassword], pParsed->cchAuthorityPassword);
732 return NULL;
733}
734
735
736RTDECL(char *) RTUriParsedAuthorityHost(const char *pszUri, PCRTURIPARSED pParsed)
737{
738 AssertPtrReturn(pszUri, NULL);
739 AssertPtrReturn(pParsed, NULL);
740 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
741 if (pParsed->cchAuthorityHost)
742 return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityHost], pParsed->cchAuthorityHost);
743 return NULL;
744}
745
746
747RTDECL(uint32_t) RTUriParsedAuthorityPort(const char *pszUri, PCRTURIPARSED pParsed)
748{
749 AssertPtrReturn(pszUri, UINT32_MAX);
750 AssertPtrReturn(pParsed, UINT32_MAX);
751 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, UINT32_MAX);
752 return pParsed->uAuthorityPort;
753}
754
755
756RTDECL(char *) RTUriParsedPath(const char *pszUri, PCRTURIPARSED pParsed)
757{
758 AssertPtrReturn(pszUri, NULL);
759 AssertPtrReturn(pParsed, NULL);
760 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
761 if (pParsed->cchPath)
762 return rtUriPercentDecodeN(&pszUri[pParsed->offPath], pParsed->cchPath);
763 return NULL;
764}
765
766
767RTDECL(char *) RTUriParsedQuery(const char *pszUri, PCRTURIPARSED pParsed)
768{
769 AssertPtrReturn(pszUri, NULL);
770 AssertPtrReturn(pParsed, NULL);
771 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
772 if (pParsed->cchQuery)
773 return rtUriPercentDecodeN(&pszUri[pParsed->offQuery], pParsed->cchQuery);
774 return NULL;
775}
776
777
778RTDECL(char *) RTUriParsedFragment(const char *pszUri, PCRTURIPARSED pParsed)
779{
780 AssertPtrReturn(pszUri, NULL);
781 AssertPtrReturn(pParsed, NULL);
782 AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
783 if (pParsed->cchFragment)
784 return rtUriPercentDecodeN(&pszUri[pParsed->offFragment], pParsed->cchFragment);
785 return NULL;
786}
787
788
789RTDECL(char *) RTUriCreate(const char *pszScheme, const char *pszAuthority, const char *pszPath, const char *pszQuery,
790 const char *pszFragment)
791{
792 if (!pszScheme) /* Scheme is minimum requirement */
793 return NULL;
794
795 char *pszResult = 0;
796 char *pszAuthority1 = 0;
797 char *pszPath1 = 0;
798 char *pszQuery1 = 0;
799 char *pszFragment1 = 0;
800
801 do
802 {
803 /* Create the percent encoded strings and calculate the necessary uri
804 * length. */
805 size_t cbSize = strlen(pszScheme) + 1 + 1; /* plus zero byte */
806 if (pszAuthority)
807 {
808 pszAuthority1 = rtUriPercentEncodeN(pszAuthority, RTSTR_MAX);
809 if (!pszAuthority1)
810 break;
811 cbSize += strlen(pszAuthority1) + 2;
812 }
813 if (pszPath)
814 {
815 pszPath1 = rtUriPercentEncodeN(pszPath, RTSTR_MAX);
816 if (!pszPath1)
817 break;
818 cbSize += strlen(pszPath1);
819 }
820 if (pszQuery)
821 {
822 pszQuery1 = rtUriPercentEncodeN(pszQuery, RTSTR_MAX);
823 if (!pszQuery1)
824 break;
825 cbSize += strlen(pszQuery1) + 1;
826 }
827 if (pszFragment)
828 {
829 pszFragment1 = rtUriPercentEncodeN(pszFragment, RTSTR_MAX);
830 if (!pszFragment1)
831 break;
832 cbSize += strlen(pszFragment1) + 1;
833 }
834
835 char *pszTmp = pszResult = (char *)RTStrAlloc(cbSize);
836 if (!pszResult)
837 break;
838 RT_BZERO(pszTmp, cbSize);
839
840 /* Compose the target uri string. */
841 RTStrCatP(&pszTmp, &cbSize, pszScheme);
842 RTStrCatP(&pszTmp, &cbSize, ":");
843 if (pszAuthority1)
844 {
845 RTStrCatP(&pszTmp, &cbSize, "//");
846 RTStrCatP(&pszTmp, &cbSize, pszAuthority1);
847 }
848 if (pszPath1)
849 {
850 RTStrCatP(&pszTmp, &cbSize, pszPath1);
851 }
852 if (pszQuery1)
853 {
854 RTStrCatP(&pszTmp, &cbSize, "?");
855 RTStrCatP(&pszTmp, &cbSize, pszQuery1);
856 }
857 if (pszFragment1)
858 {
859 RTStrCatP(&pszTmp, &cbSize, "#");
860 RTStrCatP(&pszTmp, &cbSize, pszFragment1);
861 }
862 } while (0);
863
864 /* Cleanup */
865 if (pszAuthority1)
866 RTStrFree(pszAuthority1);
867 if (pszPath1)
868 RTStrFree(pszPath1);
869 if (pszQuery1)
870 RTStrFree(pszQuery1);
871 if (pszFragment1)
872 RTStrFree(pszFragment1);
873
874 return pszResult;
875}
876
877
878RTDECL(bool) RTUriIsSchemeMatch(const char *pszUri, const char *pszScheme)
879{
880 AssertPtrReturn(pszUri, false);
881 size_t const cchScheme = strlen(pszScheme);
882 return RTStrNICmp(pszUri, pszScheme, cchScheme) == 0
883 && pszUri[cchScheme] == ':';
884}
885
886
887RTDECL(int) RTUriFileCreateEx(const char *pszPath, uint32_t fPathStyle, char **ppszUri, size_t cbUri, size_t *pcchUri)
888{
889 /*
890 * Validate and adjust input. (RTPathParse check pszPath out for us)
891 */
892 if (pcchUri)
893 {
894 AssertPtrReturn(pcchUri, VERR_INVALID_POINTER);
895 *pcchUri = ~(size_t)0;
896 }
897 AssertPtrReturn(ppszUri, VERR_INVALID_POINTER);
898 AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
899 if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
900 fPathStyle = RTPATH_STYLE;
901
902 /*
903 * Let the RTPath code parse the stuff (no reason to duplicate path parsing
904 * and get it slightly wrong here).
905 */
906 union
907 {
908 RTPATHPARSED ParsedPath;
909 uint8_t abPadding[sizeof(RTPATHPARSED)];
910 } u;
911 int rc = RTPathParse(pszPath, &u.ParsedPath, sizeof(u.ParsedPath), fPathStyle);
912 if (RT_SUCCESS(rc) || rc == VERR_BUFFER_OVERFLOW)
913 {
914 /* Skip leading slashes. */
915 if (u.ParsedPath.fProps & RTPATH_PROP_ROOT_SLASH)
916 {
917 if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
918 while (pszPath[0] == '/' || pszPath[0] == '\\')
919 pszPath++;
920 else
921 while (pszPath[0] == '/')
922 pszPath++;
923 }
924 const size_t cchPath = strlen(pszPath);
925
926 /*
927 * Calculate the encoded length and figure destination buffering.
928 */
929 static const char s_szPrefix[] = "file:///";
930 size_t const cchPrefix = sizeof(s_szPrefix) - (u.ParsedPath.fProps & RTPATH_PROP_UNC ? 2 : 1);
931 size_t cchEncoded = rtUriCalcEncodedLength(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS);
932
933 if (pcchUri)
934 *pcchUri = cchEncoded;
935
936 char *pszDst;
937 char *pszFreeMe = NULL;
938 if (!cbUri || *ppszUri == NULL)
939 {
940 cbUri = RT_MAX(cbUri, cchPrefix + cchEncoded + 1);
941 *ppszUri = pszFreeMe = pszDst = RTStrAlloc(cbUri);
942 AssertReturn(pszDst, VERR_NO_STR_MEMORY);
943 }
944 else if (cchEncoded < cbUri)
945 pszDst = *ppszUri;
946 else
947 return VERR_BUFFER_OVERFLOW;
948
949 /*
950 * Construct the URI.
951 */
952 memcpy(pszDst, s_szPrefix, cchPrefix);
953 pszDst[cchPrefix] = '\0';
954 rc = rtUriEncodeIntoBuffer(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS, &pszDst[cchPrefix], cbUri - cchPrefix);
955 if (RT_SUCCESS(rc))
956 {
957 Assert(strlen(pszDst) == cbUri - 1);
958 if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
959 RTPathChangeToUnixSlashes(pszDst, true /*fForce*/);
960 return VINF_SUCCESS;
961 }
962
963 AssertRC(rc); /* Impossible! rtUriCalcEncodedLength or something above is busted! */
964 if (pszFreeMe)
965 RTStrFree(pszFreeMe);
966 }
967 return rc;
968}
969
970
971RTDECL(char *) RTUriFileCreate(const char *pszPath)
972{
973 char *pszUri = NULL;
974 int rc = RTUriFileCreateEx(pszPath, RTPATH_STR_F_STYLE_HOST, &pszUri, 0 /*cbUri*/, NULL /*pcchUri*/);
975 if (RT_SUCCESS(rc))
976 return pszUri;
977 return NULL;
978}
979
980
981RTDECL(int) RTUriFilePathEx(const char *pszUri, uint32_t fPathStyle, char **ppszPath, size_t cbPath, size_t *pcchPath)
982{
983 /*
984 * Validate and adjust input.
985 */
986 if (pcchPath)
987 {
988 AssertPtrReturn(pcchPath, VERR_INVALID_POINTER);
989 *pcchPath = ~(size_t)0;
990 }
991 AssertPtrReturn(ppszPath, VERR_INVALID_POINTER);
992 AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
993 if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
994 fPathStyle = RTPATH_STYLE;
995 AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
996
997 /*
998 * Check that this is a file URI.
999 */
1000 if (RTStrNICmp(pszUri, RT_STR_TUPLE("file:")) == 0)
1001 { /* likely */ }
1002 else
1003 return VERR_URI_NOT_FILE_SCHEME;
1004
1005 /*
1006 * We may have a number of variations here, mostly thanks to
1007 * various windows software. First the canonical variations:
1008 * - file:///C:/Windows/System32/kernel32.dll
1009 * - file:///C|/Windows/System32/kernel32.dll
1010 * - file:///C:%5CWindows%5CSystem32%5Ckernel32.dll
1011 * - file://localhost/C:%5CWindows%5CSystem32%5Ckernel32.dll
1012 * - file://cifsserver.dev/systemshare%5CWindows%5CSystem32%5Ckernel32.dll
1013 * - file://cifsserver.dev:139/systemshare%5CWindows%5CSystem32%5Ckernel32.dll (not quite sure here, but whatever)
1014 *
1015 * Legacy variant without any slashes after the schema:
1016 * - file:C:/Windows/System32/kernel32.dll
1017 * - file:C|/Windows/System32%5Ckernel32.dll
1018 * - file:~/.bashrc
1019 * \--path-/
1020 *
1021 * Legacy variant with exactly one slashes after the schema:
1022 * - file:/C:/Windows/System32%5Ckernel32.dll
1023 * - file:/C|/Windows/System32/kernel32.dll
1024 * - file:/usr/bin/env
1025 * \---path---/
1026 *
1027 * Legacy variant with two slashes after the schema and an unescaped DOS path:
1028 * - file://C:/Windows/System32\kernel32.dll (**)
1029 * - file://C|/Windows/System32\kernel32.dll
1030 * \---path---------------------/
1031 * -- authority, with ':' as non-working port separator
1032 *
1033 * Legacy variant with exactly four slashes after the schema and an unescaped DOS path.
1034 * - file:////C:/Windows\System32\user32.dll
1035 *
1036 * Legacy variant with four or more slashes after the schema and an unescaped UNC path:
1037 * - file:////cifsserver.dev/systemshare/System32%\kernel32.dll
1038 * - file://///cifsserver.dev/systemshare/System32\kernel32.dll
1039 * \---path--------------------------------------------/
1040 *
1041 * The two unescaped variants shouldn't be handed to rtUriParse, which
1042 * is good as we cannot actually handle the one marked by (**). So, handle
1043 * those two special when parsing.
1044 */
1045 RTURIPARSED Parsed;
1046 int rc;
1047 size_t cSlashes = 0;
1048 while (pszUri[5 + cSlashes] == '/')
1049 cSlashes++;
1050 if ( (cSlashes == 2 || cSlashes == 4)
1051 && RT_C_IS_ALPHA(pszUri[5 + cSlashes])
1052 && (pszUri[5 + cSlashes + 1] == ':' || pszUri[5 + cSlashes + 1] == '|'))
1053 {
1054 RT_ZERO(Parsed); /* RTURIPARSED_F_CONTAINS_ESCAPED_CHARS is now clear. */
1055 Parsed.offPath = 5 + cSlashes;
1056 Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1057 rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1058 }
1059 else if (cSlashes >= 4)
1060 {
1061 RT_ZERO(Parsed);
1062 Parsed.fFlags = cSlashes > 4 ? RTURIPARSED_F_CONTAINS_ESCAPED_CHARS : 0;
1063 Parsed.offPath = 5 + cSlashes - 2;
1064 Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1065 rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1066 }
1067 else
1068 rc = rtUriParse(pszUri, &Parsed);
1069 if (RT_SUCCESS(rc))
1070 {
1071 /*
1072 * Ignore localhost as hostname (it's implicit).
1073 */
1074 static char const s_szLocalhost[] = "localhost";
1075 if ( Parsed.cchAuthorityHost == sizeof(s_szLocalhost) - 1U
1076 && RTStrNICmp(&pszUri[Parsed.offAuthorityHost], RT_STR_TUPLE(s_szLocalhost)) == 0)
1077 {
1078 Parsed.cchAuthorityHost = 0;
1079 Parsed.cchAuthority = 0;
1080 }
1081
1082 /*
1083 * Ignore leading path slash/separator if we detect a DOS drive letter
1084 * and we don't have a host name.
1085 */
1086 if ( Parsed.cchPath >= 3
1087 && Parsed.cchAuthorityHost == 0
1088 && pszUri[Parsed.offPath] == '/' /* Leading path slash/separator. */
1089 && ( pszUri[Parsed.offPath + 2] == ':' /* Colon after drive letter. */
1090 || pszUri[Parsed.offPath + 2] == '|') /* Colon alternative. */
1091 && RT_C_IS_ALPHA(pszUri[Parsed.offPath + 1]) ) /* Drive letter. */
1092 {
1093 Parsed.offPath++;
1094 Parsed.cchPath--;
1095 }
1096
1097 /*
1098 * Calculate the size of the encoded result.
1099 *
1100 * Since we're happily returning "C:/Windows/System32/kernel.dll"
1101 * style paths when the caller requested UNIX style paths, we will
1102 * return straight UNC paths too ("//cifsserver/share/dir/file").
1103 */
1104 size_t cchDecodedHost = 0;
1105 size_t cbResult;
1106 if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1107 {
1108 cchDecodedHost = rtUriCalcDecodedLength(&pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1109 cbResult = cchDecodedHost + rtUriCalcDecodedLength(&pszUri[Parsed.offPath], Parsed.cchPath) + 1;
1110 }
1111 else
1112 {
1113 cchDecodedHost = 0;
1114 cbResult = Parsed.cchAuthorityHost + Parsed.cchPath + 1;
1115 }
1116 if (pcchPath)
1117 *pcchPath = cbResult - 1;
1118 if (cbResult > 1)
1119 {
1120 /*
1121 * Prepare the necessary buffer space for the result.
1122 */
1123 char *pszDst;
1124 char *pszFreeMe = NULL;
1125 if (!cbPath || *ppszPath == NULL)
1126 {
1127 cbPath = RT_MAX(cbPath, cbResult);
1128 *ppszPath = pszFreeMe = pszDst = RTStrAlloc(cbPath);
1129 AssertReturn(pszDst, VERR_NO_STR_MEMORY);
1130 }
1131 else if (cbResult <= cbPath)
1132 pszDst = *ppszPath;
1133 else
1134 return VERR_BUFFER_OVERFLOW;
1135
1136 /*
1137 * Compose the result.
1138 */
1139 if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1140 {
1141 rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offAuthorityHost],Parsed.cchAuthorityHost,
1142 pszDst, cchDecodedHost + 1);
1143 Assert(RT_SUCCESS(rc) && strlen(pszDst) == cchDecodedHost);
1144 if (RT_SUCCESS(rc))
1145 rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offPath], Parsed.cchPath,
1146 &pszDst[cchDecodedHost], cbResult - cchDecodedHost);
1147 Assert(RT_SUCCESS(rc) && strlen(pszDst) == cbResult - 1);
1148 }
1149 else
1150 {
1151 memcpy(pszDst, &pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1152 memcpy(&pszDst[Parsed.cchAuthorityHost], &pszUri[Parsed.offPath], Parsed.cchPath);
1153 pszDst[cbResult - 1] = '\0';
1154 }
1155 if (RT_SUCCESS(rc))
1156 {
1157 /*
1158 * Convert the DOS drive letter colon alternative ('|') into a real colon.
1159 * We do this regardless of the desired path style.
1160 */
1161 if ( RT_C_IS_ALPHA(pszDst[0])
1162 && pszDst[1] == '|')
1163 pszDst[1] = ':';
1164
1165 /*
1166 * Fix slashes.
1167 */
1168 if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
1169 RTPathChangeToDosSlashes(pszDst, true);
1170 else if (fPathStyle == RTPATH_STR_F_STYLE_UNIX)
1171 RTPathChangeToUnixSlashes(pszDst, true); /** @todo not quite sure how this actually makes sense... */
1172 else
1173 AssertFailed();
1174 return rc;
1175 }
1176
1177 /* bail out */
1178 RTStrFree(pszFreeMe);
1179 }
1180 else
1181 rc = VERR_PATH_ZERO_LENGTH;
1182 }
1183 return rc;
1184}
1185
1186
1187RTDECL(char *) RTUriFilePath(const char *pszUri)
1188{
1189 char *pszPath = NULL;
1190 int rc = RTUriFilePathEx(pszUri, RTPATH_STR_F_STYLE_HOST, &pszPath, 0 /*cbPath*/, NULL /*pcchPath*/);
1191 if (RT_SUCCESS(rc))
1192 return pszPath;
1193 return NULL;
1194}
1195
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
ContactPrivacy policyTerms of Use