VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp

Last change on this file was 98103, checked in by vboxsync, 16 months ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id Revision
File size: 36.3 KB
Line 
1/* $Id: utf-16.cpp 98103 2023-01-17 14:15:46Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * The contents of this file may alternatively be used under the terms
26 * of the Common Development and Distribution License Version 1.0
27 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28 * in the VirtualBox distribution, in which case the provisions of the
29 * CDDL are applicable instead of those of the GPL.
30 *
31 * You may elect to license modified versions of this file under the
32 * terms and conditions of either the GPL or the CDDL or both.
33 *
34 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35 */
36
37
38/*********************************************************************************************************************************
39* Header Files *
40*********************************************************************************************************************************/
41#include <iprt/utf16.h>
42#include "internal/iprt.h"
43
44#include <iprt/uni.h>
45#include <iprt/asm.h>
46#include <iprt/mem.h>
47#include <iprt/assert.h>
48#include <iprt/err.h>
49#include "internal/string.h"
50
51
52/**
53 * Get get length in code points of an UTF-16 encoded string, validating the
54 * string while doing so.
55 *
56 * @returns IPRT status code.
57 * @param pwsz Pointer to the UTF-16 string.
58 * @param cwc The max length of the string in UTF-16 units. Use
59 * RTSTR_MAX if all of the string is to be examined.
60 * @param pcuc Where to store the length in unicode code points.
61 * @param pcwcActual Where to store the actual size of the UTF-16 string
62 * on success. Optional.
63 */
64static int rtUtf16Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcuc, size_t *pcwcActual)
65{
66 PCRTUTF16 pwszStart = pwsz;
67 size_t cCodePoints = 0;
68 while (cwc > 0)
69 {
70 RTUTF16 wc = *pwsz;
71 if (!wc)
72 break;
73 if (wc < 0xd800 || wc > 0xdfff)
74 {
75 cCodePoints++;
76 pwsz++;
77 cwc--;
78 }
79 /* Surrogate pair: */
80 else if (wc >= 0xdc00)
81 {
82 RTStrAssertMsgFailed(("Lone UTF-16 trail surrogate: %#x (%.*Rhxs)\n", wc, RT_MIN(cwc * 2, 10), pwsz));
83 return VERR_INVALID_UTF16_ENCODING;
84 }
85 else if (cwc < 2)
86 {
87 RTStrAssertMsgFailed(("Lone UTF-16 lead surrogate: %#x\n", wc));
88 return VERR_INVALID_UTF16_ENCODING;
89 }
90 else
91 {
92 RTUTF16 wcTrail = pwsz[1];
93 if (wcTrail < 0xdc00 || wcTrail > 0xdfff)
94 {
95 RTStrAssertMsgFailed(("Invalid UTF-16 trail surrogate: %#x (lead %#x)\n", wcTrail, wc));
96 return VERR_INVALID_UTF16_ENCODING;
97 }
98
99 cCodePoints++;
100 pwsz += 2;
101 cwc -= 2;
102 }
103 }
104
105 /* done */
106 *pcuc = cCodePoints;
107 if (pcwcActual)
108 *pcwcActual = pwsz - pwszStart;
109 return VINF_SUCCESS;
110}
111
112
113RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag)
114{
115 if (cb > sizeof(RTUTF16))
116 cb = RT_ALIGN_Z(cb, sizeof(RTUTF16));
117 else
118 cb = sizeof(RTUTF16);
119 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
120 if (pwsz)
121 *pwsz = '\0';
122 return pwsz;
123}
124RT_EXPORT_SYMBOL(RTUtf16AllocTag);
125
126
127RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag)
128{
129 PRTUTF16 pwszOld = *ppwsz;
130 cbNew = RT_ALIGN_Z(cbNew, sizeof(RTUTF16));
131 if (!cbNew)
132 {
133 RTMemFree(pwszOld);
134 *ppwsz = NULL;
135 }
136 else if (pwszOld)
137 {
138 PRTUTF16 pwszNew = (PRTUTF16)RTMemReallocTag(pwszOld, cbNew, pszTag);
139 if (!pwszNew)
140 return VERR_NO_STR_MEMORY;
141 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
142 *ppwsz = pwszNew;
143 }
144 else
145 {
146 PRTUTF16 pwszNew = (PRTUTF16)RTMemAllocTag(cbNew, pszTag);
147 if (!pwszNew)
148 return VERR_NO_UTF16_MEMORY;
149 pwszNew[0] = '\0';
150 pwszNew[cbNew / sizeof(RTUTF16) - 1] = '\0';
151 *ppwsz = pwszNew;
152 }
153 return VINF_SUCCESS;
154}
155RT_EXPORT_SYMBOL(RTUtf16ReallocTag);
156
157
158RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
159{
160 if (pwszString)
161 RTMemTmpFree(pwszString);
162}
163RT_EXPORT_SYMBOL(RTUtf16Free);
164
165
166RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
167{
168 Assert(pwszString);
169 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
170 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
171 if (pwsz)
172 memcpy(pwsz, pwszString, cb);
173 return pwsz;
174}
175RT_EXPORT_SYMBOL(RTUtf16DupTag);
176
177
178RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
179{
180 Assert(pwszString);
181 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
182 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
183 if (pwsz)
184 {
185 memcpy(pwsz, pwszString, cb);
186 *ppwszString = pwsz;
187 return VINF_SUCCESS;
188 }
189 return VERR_NO_MEMORY;
190}
191RT_EXPORT_SYMBOL(RTUtf16DupExTag);
192
193
194RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
195{
196 if (!pwszString)
197 return 0;
198
199 PCRTUTF16 pwsz = pwszString;
200 while (*pwsz)
201 pwsz++;
202 return pwsz - pwszString;
203}
204RT_EXPORT_SYMBOL(RTUtf16Len);
205
206
207RTDECL(int) RTUtf16Cmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2)
208{
209 if (pwsz1 == pwsz2)
210 return 0;
211 if (!pwsz1)
212 return -1;
213 if (!pwsz2)
214 return 1;
215
216 for (;;)
217 {
218 RTUTF16 wcs = *pwsz1;
219 int iDiff = wcs - *pwsz2;
220 if (iDiff || !wcs)
221 return iDiff;
222 pwsz1++;
223 pwsz2++;
224 }
225}
226RT_EXPORT_SYMBOL(RTUtf16Cmp);
227
228
229RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2)
230{
231 /*
232 * NULL and empty strings are all the same.
233 */
234 if (!pwsz1)
235 return !psz2 || !*psz2 ? 0 : -1;
236 if (!psz2)
237 return !*pwsz1 ? 0 : 1;
238
239 /*
240 * Compare with a UTF-8 string by enumerating them char by char.
241 */
242 for (;;)
243 {
244 RTUNICP uc1;
245 int rc = RTUtf16GetCpEx(&pwsz1, &uc1);
246 AssertRCReturn(rc, 1);
247
248 RTUNICP uc2;
249 rc = RTStrGetCpEx(&psz2, &uc2);
250 AssertRCReturn(rc, -1);
251 if (uc1 == uc2)
252 {
253 if (uc1)
254 continue;
255 return 0;
256 }
257 return uc1 < uc2 ? -1 : 1;
258 }
259}
260RT_EXPORT_SYMBOL(RTUtf16CmpUtf8);
261
262
263RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz)
264{
265 return RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
266}
267RT_EXPORT_SYMBOL(RTUtf16ValidateEncoding);
268
269
270RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags)
271{
272 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED | RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)),
273 VERR_INVALID_PARAMETER);
274 AssertPtr(pwsz);
275
276 /*
277 * Use rtUtf16Length for the job.
278 */
279 size_t cwcActual = 0; /* Shut up cc1plus. */
280 size_t cCpsIgnored;
281 int rc = rtUtf16Length(pwsz, cwc, &cCpsIgnored, &cwcActual);
282 if (RT_SUCCESS(rc))
283 {
284 if (fFlags & RTSTR_VALIDATE_ENCODING_EXACT_LENGTH)
285 {
286 if (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
287 cwcActual++;
288 if (cwcActual == cwc)
289 rc = VINF_SUCCESS;
290 else if (cwcActual < cwc)
291 rc = VERR_BUFFER_UNDERFLOW;
292 else
293 rc = VERR_BUFFER_OVERFLOW;
294 }
295 else if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
296 && cwcActual >= cwc)
297 rc = VERR_BUFFER_OVERFLOW;
298 }
299 return rc;
300}
301RT_EXPORT_SYMBOL(RTUtf16ValidateEncodingEx);
302
303
304RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz)
305{
306 int rc = RTUtf16ValidateEncodingEx(pwsz, RTSTR_MAX, 0);
307 return RT_SUCCESS(rc);
308}
309RT_EXPORT_SYMBOL(RTUtf16IsValidEncoding);
310
311
312/**
313 * Helper for RTUtf16PurgeComplementSet.
314 *
315 * @returns true if @a Cp is valid, false if not.
316 * @param Cp The code point to validate.
317 * @param puszValidPairs Pair of valid code point sets.
318 * @param cValidPairs Number of pairs.
319 */
320DECLINLINE(bool) rtUtf16PurgeIsInSet(RTUNICP Cp, PCRTUNICP puszValidPairs, uint32_t cValidPairs)
321{
322 while (cValidPairs-- > 0)
323 {
324 if ( Cp >= puszValidPairs[0]
325 && Cp <= puszValidPairs[1])
326 return true;
327 puszValidPairs += 2;
328 }
329 return false;
330}
331
332
333RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement)
334{
335 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
336
337 /*
338 * Calc valid pairs and check that we've got an even number.
339 */
340 uint32_t cValidPairs = 0;
341 while (puszValidPairs[cValidPairs * 2])
342 {
343 AssertReturn(puszValidPairs[cValidPairs * 2 + 1], -1);
344 AssertMsg(puszValidPairs[cValidPairs * 2] <= puszValidPairs[cValidPairs * 2 + 1],
345 ("%#x vs %#x\n", puszValidPairs[cValidPairs * 2], puszValidPairs[cValidPairs * 2 + 1]));
346 cValidPairs++;
347 }
348
349 /*
350 * Do the replacing.
351 */
352 ssize_t cReplacements = 0;
353 for (;;)
354 {
355 PRTUTF16 pwszCur = pwsz;
356 RTUNICP Cp;
357 int rc = RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp);
358 if (RT_SUCCESS(rc))
359 {
360 if (Cp)
361 {
362 if (!rtUtf16PurgeIsInSet(Cp, puszValidPairs, cValidPairs))
363 {
364 for (; pwszCur != pwsz; ++pwszCur)
365 *pwszCur = chReplacement;
366 ++cReplacements;
367 }
368 }
369 else
370 break;
371 }
372 else
373 return -1;
374 }
375 return cReplacements;
376}
377RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
378
379
380/**
381 * Validate the UTF-16BE encoding and calculates the length of an UTF-8
382 * encoding.
383 *
384 * @returns iprt status code.
385 * @param pwsz The UTF-16BE string.
386 * @param cwc The max length of the UTF-16BE string to consider.
387 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
388 *
389 * @note rtUtf16LittleCalcUtf8Length | s/RT_LE2H_U16/RT_BE2H_U16/g
390 */
391static int rtUtf16BigCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
392{
393 int rc = VINF_SUCCESS;
394 size_t cch = 0;
395 while (cwc > 0)
396 {
397 RTUTF16 wc = *pwsz++; cwc--;
398 if (!wc)
399 break;
400 wc = RT_BE2H_U16(wc);
401 if (wc < 0xd800 || wc > 0xdfff)
402 {
403 if (wc < 0x80)
404 cch++;
405 else if (wc < 0x800)
406 cch += 2;
407 else if (wc < 0xfffe)
408 cch += 3;
409 else
410 {
411 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
412 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
413 break;
414 }
415 }
416 else
417 {
418 if (wc >= 0xdc00)
419 {
420 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
421 rc = VERR_INVALID_UTF16_ENCODING;
422 break;
423 }
424 if (cwc <= 0)
425 {
426 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
427 rc = VERR_INVALID_UTF16_ENCODING;
428 break;
429 }
430 wc = *pwsz++; cwc--;
431 wc = RT_BE2H_U16(wc);
432 if (wc < 0xdc00 || wc > 0xdfff)
433 {
434 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
435 rc = VERR_INVALID_UTF16_ENCODING;
436 break;
437 }
438 cch += 4;
439 }
440 }
441
442
443 /* done */
444 *pcch = cch;
445 return rc;
446}
447
448
449/**
450 * Validate the UTF-16LE encoding and calculates the length of an UTF-8
451 * encoding.
452 *
453 * @returns iprt status code.
454 * @param pwsz The UTF-16LE string.
455 * @param cwc The max length of the UTF-16LE string to consider.
456 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
457 *
458 * @note rtUtf16BigCalcUtf8Length | s/RT_BE2H_U16/RT_LE2H_U16/g
459 */
460static int rtUtf16LittleCalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
461{
462 int rc = VINF_SUCCESS;
463 size_t cch = 0;
464 while (cwc > 0)
465 {
466 RTUTF16 wc = *pwsz++; cwc--;
467 if (!wc)
468 break;
469 wc = RT_LE2H_U16(wc);
470 if (wc < 0xd800 || wc > 0xdfff)
471 {
472 if (wc < 0x80)
473 cch++;
474 else if (wc < 0x800)
475 cch += 2;
476 else if (wc < 0xfffe)
477 cch += 3;
478 else
479 {
480 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
481 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
482 break;
483 }
484 }
485 else
486 {
487 if (wc >= 0xdc00)
488 {
489 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
490 rc = VERR_INVALID_UTF16_ENCODING;
491 break;
492 }
493 if (cwc <= 0)
494 {
495 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
496 rc = VERR_INVALID_UTF16_ENCODING;
497 break;
498 }
499 wc = *pwsz++; cwc--;
500 wc = RT_LE2H_U16(wc);
501 if (wc < 0xdc00 || wc > 0xdfff)
502 {
503 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
504 rc = VERR_INVALID_UTF16_ENCODING;
505 break;
506 }
507 cch += 4;
508 }
509 }
510
511
512 /* done */
513 *pcch = cch;
514 return rc;
515}
516
517
518/**
519 * Recodes an valid UTF-16BE string as UTF-8.
520 *
521 * @returns iprt status code.
522 * @param pwsz The UTF-16BE string.
523 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
524 * will stop when cwc or '\\0' is reached.
525 * @param psz Where to store the UTF-8 string.
526 * @param cch The size of the UTF-8 buffer, excluding the terminator.
527 * @param pcch Where to store the number of octets actually encoded.
528 *
529 * @note rtUtf16LittleRecodeAsUtf8 == s/RT_BE2H_U16/RT_LE2H_U16/g
530 */
531static int rtUtf16BigRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
532{
533 unsigned char *pwch = (unsigned char *)psz;
534 int rc = VINF_SUCCESS;
535 while (cwc > 0)
536 {
537 RTUTF16 wc = *pwsz++; cwc--;
538 if (!wc)
539 break;
540 wc = RT_BE2H_U16(wc);
541 if (wc < 0xd800 || wc > 0xdfff)
542 {
543 if (wc < 0x80)
544 {
545 if (RT_UNLIKELY(cch < 1))
546 {
547 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
548 rc = VERR_BUFFER_OVERFLOW;
549 break;
550 }
551 cch--;
552 *pwch++ = (unsigned char)wc;
553 }
554 else if (wc < 0x800)
555 {
556 if (RT_UNLIKELY(cch < 2))
557 {
558 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
559 rc = VERR_BUFFER_OVERFLOW;
560 break;
561 }
562 cch -= 2;
563 *pwch++ = 0xc0 | (wc >> 6);
564 *pwch++ = 0x80 | (wc & 0x3f);
565 }
566 else if (wc < 0xfffe)
567 {
568 if (RT_UNLIKELY(cch < 3))
569 {
570 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
571 rc = VERR_BUFFER_OVERFLOW;
572 break;
573 }
574 cch -= 3;
575 *pwch++ = 0xe0 | (wc >> 12);
576 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
577 *pwch++ = 0x80 | (wc & 0x3f);
578 }
579 else
580 {
581 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
582 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
583 break;
584 }
585 }
586 else
587 {
588 if (wc >= 0xdc00)
589 {
590 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
591 rc = VERR_INVALID_UTF16_ENCODING;
592 break;
593 }
594 if (cwc <= 0)
595 {
596 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
597 rc = VERR_INVALID_UTF16_ENCODING;
598 break;
599 }
600 RTUTF16 wc2 = *pwsz++; cwc--;
601 wc2 = RT_BE2H_U16(wc2);
602 if (wc2 < 0xdc00 || wc2 > 0xdfff)
603 {
604 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
605 rc = VERR_INVALID_UTF16_ENCODING;
606 break;
607 }
608 uint32_t CodePoint = 0x10000
609 + ( ((wc & 0x3ff) << 10)
610 | (wc2 & 0x3ff));
611 if (RT_UNLIKELY(cch < 4))
612 {
613 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
614 rc = VERR_BUFFER_OVERFLOW;
615 break;
616 }
617 cch -= 4;
618 *pwch++ = 0xf0 | (CodePoint >> 18);
619 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
620 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
621 *pwch++ = 0x80 | (CodePoint & 0x3f);
622 }
623 }
624
625 /* done */
626 *pwch = '\0';
627 *pcch = (char *)pwch - psz;
628 return rc;
629}
630
631
632/**
633 * Recodes an valid UTF-16LE string as UTF-8.
634 *
635 * @returns iprt status code.
636 * @param pwsz The UTF-16LE string.
637 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
638 * will stop when cwc or '\\0' is reached.
639 * @param psz Where to store the UTF-8 string.
640 * @param cch The size of the UTF-8 buffer, excluding the terminator.
641 * @param pcch Where to store the number of octets actually encoded.
642 *
643 * @note rtUtf16LittleRecodeAsUtf8 == s/RT_LE2H_U16/RT_GE2H_U16/g
644 */
645static int rtUtf16LittleRecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
646{
647 unsigned char *pwch = (unsigned char *)psz;
648 int rc = VINF_SUCCESS;
649 while (cwc > 0)
650 {
651 RTUTF16 wc = *pwsz++; cwc--;
652 if (!wc)
653 break;
654 wc = RT_LE2H_U16(wc);
655 if (wc < 0xd800 || wc > 0xdfff)
656 {
657 if (wc < 0x80)
658 {
659 if (RT_UNLIKELY(cch < 1))
660 {
661 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
662 rc = VERR_BUFFER_OVERFLOW;
663 break;
664 }
665 cch--;
666 *pwch++ = (unsigned char)wc;
667 }
668 else if (wc < 0x800)
669 {
670 if (RT_UNLIKELY(cch < 2))
671 {
672 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
673 rc = VERR_BUFFER_OVERFLOW;
674 break;
675 }
676 cch -= 2;
677 *pwch++ = 0xc0 | (wc >> 6);
678 *pwch++ = 0x80 | (wc & 0x3f);
679 }
680 else if (wc < 0xfffe)
681 {
682 if (RT_UNLIKELY(cch < 3))
683 {
684 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
685 rc = VERR_BUFFER_OVERFLOW;
686 break;
687 }
688 cch -= 3;
689 *pwch++ = 0xe0 | (wc >> 12);
690 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
691 *pwch++ = 0x80 | (wc & 0x3f);
692 }
693 else
694 {
695 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
696 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
697 break;
698 }
699 }
700 else
701 {
702 if (wc >= 0xdc00)
703 {
704 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
705 rc = VERR_INVALID_UTF16_ENCODING;
706 break;
707 }
708 if (cwc <= 0)
709 {
710 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
711 rc = VERR_INVALID_UTF16_ENCODING;
712 break;
713 }
714 RTUTF16 wc2 = *pwsz++; cwc--;
715 wc2 = RT_LE2H_U16(wc2);
716 if (wc2 < 0xdc00 || wc2 > 0xdfff)
717 {
718 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
719 rc = VERR_INVALID_UTF16_ENCODING;
720 break;
721 }
722 uint32_t CodePoint = 0x10000
723 + ( ((wc & 0x3ff) << 10)
724 | (wc2 & 0x3ff));
725 if (RT_UNLIKELY(cch < 4))
726 {
727 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
728 rc = VERR_BUFFER_OVERFLOW;
729 break;
730 }
731 cch -= 4;
732 *pwch++ = 0xf0 | (CodePoint >> 18);
733 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
734 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
735 *pwch++ = 0x80 | (CodePoint & 0x3f);
736 }
737 }
738
739 /* done */
740 *pwch = '\0';
741 *pcch = (char *)pwch - psz;
742 return rc;
743}
744
745
746
747RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
748{
749 /*
750 * Validate input.
751 */
752 AssertPtr(ppszString);
753 AssertPtr(pwszString);
754 *ppszString = NULL;
755
756 /*
757 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
758 */
759 size_t cch;
760#ifdef RT_BIG_ENDIAN
761 int rc = rtUtf16BigCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
762#else
763 int rc = rtUtf16LittleCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
764#endif
765 if (RT_SUCCESS(rc))
766 {
767 /*
768 * Allocate buffer and recode it.
769 */
770 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
771 if (pszResult)
772 {
773#ifdef RT_BIG_ENDIAN
774 rc = rtUtf16BigRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
775#else
776 rc = rtUtf16LittleRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
777#endif
778 if (RT_SUCCESS(rc))
779 {
780 *ppszString = pszResult;
781 return rc;
782 }
783
784 RTMemFree(pszResult);
785 }
786 else
787 rc = VERR_NO_STR_MEMORY;
788 }
789 return rc;
790}
791RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
792
793
794RTDECL(int) RTUtf16BigToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
795{
796 /*
797 * Validate input.
798 */
799 AssertPtr(ppszString);
800 AssertPtr(pwszString);
801 *ppszString = NULL;
802
803 /*
804 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
805 */
806 size_t cch;
807 int rc = rtUtf16BigCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
808 if (RT_SUCCESS(rc))
809 {
810 /*
811 * Allocate buffer and recode it.
812 */
813 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
814 if (pszResult)
815 {
816 rc = rtUtf16BigRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
817 if (RT_SUCCESS(rc))
818 {
819 *ppszString = pszResult;
820 return rc;
821 }
822
823 RTMemFree(pszResult);
824 }
825 else
826 rc = VERR_NO_STR_MEMORY;
827 }
828 return rc;
829}
830RT_EXPORT_SYMBOL(RTUtf16BigToUtf8Tag);
831
832
833RTDECL(int) RTUtf16LittleToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
834{
835 /*
836 * Validate input.
837 */
838 AssertPtr(ppszString);
839 AssertPtr(pwszString);
840 *ppszString = NULL;
841
842 /*
843 * Validate the UTF-16LE string and calculate the length of the UTF-8 encoding of it.
844 */
845 size_t cch;
846 int rc = rtUtf16LittleCalcUtf8Length(pwszString, RTSTR_MAX, &cch);
847 if (RT_SUCCESS(rc))
848 {
849 /*
850 * Allocate buffer and recode it.
851 */
852 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
853 if (pszResult)
854 {
855 rc = rtUtf16LittleRecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
856 if (RT_SUCCESS(rc))
857 {
858 *ppszString = pszResult;
859 return rc;
860 }
861
862 RTMemFree(pszResult);
863 }
864 else
865 rc = VERR_NO_STR_MEMORY;
866 }
867 return rc;
868}
869RT_EXPORT_SYMBOL(RTUtf16LittleToUtf8Tag);
870
871
872RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
873{
874 /*
875 * Validate input.
876 */
877 AssertPtr(pwszString);
878 AssertPtr(ppsz);
879 AssertPtrNull(pcch);
880
881 /*
882 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
883 */
884 size_t cchResult;
885#ifdef RT_BIG_ENDIAN
886 int rc = rtUtf16BigCalcUtf8Length(pwszString, cwcString, &cchResult);
887#else
888 int rc = rtUtf16LittleCalcUtf8Length(pwszString, cwcString, &cchResult);
889#endif
890 if (RT_SUCCESS(rc))
891 {
892 if (pcch)
893 *pcch = cchResult;
894
895 /*
896 * Check buffer size / Allocate buffer and recode it.
897 */
898 bool fShouldFree;
899 char *pszResult;
900 if (cch > 0 && *ppsz)
901 {
902 fShouldFree = false;
903 if (RT_UNLIKELY(cch <= cchResult))
904 return VERR_BUFFER_OVERFLOW;
905 pszResult = *ppsz;
906 }
907 else
908 {
909 *ppsz = NULL;
910 fShouldFree = true;
911 cch = RT_MAX(cch, cchResult + 1);
912 pszResult = (char *)RTStrAllocTag(cch, pszTag);
913 }
914 if (pszResult)
915 {
916#ifdef RT_BIG_ENDIAN
917 rc = rtUtf16BigRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
918#else
919 rc = rtUtf16LittleRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
920#endif
921 if (RT_SUCCESS(rc))
922 {
923 *ppsz = pszResult;
924 return rc;
925 }
926
927 if (fShouldFree)
928 RTStrFree(pszResult);
929 }
930 else
931 rc = VERR_NO_STR_MEMORY;
932 }
933 return rc;
934}
935RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
936
937
938RTDECL(int) RTUtf16BigToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
939{
940 /*
941 * Validate input.
942 */
943 AssertPtr(pwszString);
944 AssertPtr(ppsz);
945 AssertPtrNull(pcch);
946
947 /*
948 * Validate the UTF-16BE string and calculate the length of the UTF-8 encoding of it.
949 */
950 size_t cchResult;
951 int rc = rtUtf16BigCalcUtf8Length(pwszString, cwcString, &cchResult);
952 if (RT_SUCCESS(rc))
953 {
954 if (pcch)
955 *pcch = cchResult;
956
957 /*
958 * Check buffer size / Allocate buffer and recode it.
959 */
960 bool fShouldFree;
961 char *pszResult;
962 if (cch > 0 && *ppsz)
963 {
964 fShouldFree = false;
965 if (RT_UNLIKELY(cch <= cchResult))
966 return VERR_BUFFER_OVERFLOW;
967 pszResult = *ppsz;
968 }
969 else
970 {
971 *ppsz = NULL;
972 fShouldFree = true;
973 cch = RT_MAX(cch, cchResult + 1);
974 pszResult = (char *)RTStrAllocTag(cch, pszTag);
975 }
976 if (pszResult)
977 {
978 rc = rtUtf16BigRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
979 if (RT_SUCCESS(rc))
980 {
981 *ppsz = pszResult;
982 return rc;
983 }
984
985 if (fShouldFree)
986 RTStrFree(pszResult);
987 }
988 else
989 rc = VERR_NO_STR_MEMORY;
990 }
991 return rc;
992}
993RT_EXPORT_SYMBOL(RTUtf16BigToUtf8ExTag);
994
995
996RTDECL(int) RTUtf16LittleToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch,
997 const char *pszTag)
998{
999 /*
1000 * Validate input.
1001 */
1002 AssertPtr(pwszString);
1003 AssertPtr(ppsz);
1004 AssertPtrNull(pcch);
1005
1006 /*
1007 * Validate the UTF-16LE string and calculate the length of the UTF-8 encoding of it.
1008 */
1009 size_t cchResult;
1010 int rc = rtUtf16LittleCalcUtf8Length(pwszString, cwcString, &cchResult);
1011 if (RT_SUCCESS(rc))
1012 {
1013 if (pcch)
1014 *pcch = cchResult;
1015
1016 /*
1017 * Check buffer size / Allocate buffer and recode it.
1018 */
1019 bool fShouldFree;
1020 char *pszResult;
1021 if (cch > 0 && *ppsz)
1022 {
1023 fShouldFree = false;
1024 if (RT_UNLIKELY(cch <= cchResult))
1025 return VERR_BUFFER_OVERFLOW;
1026 pszResult = *ppsz;
1027 }
1028 else
1029 {
1030 *ppsz = NULL;
1031 fShouldFree = true;
1032 cch = RT_MAX(cch, cchResult + 1);
1033 pszResult = (char *)RTStrAllocTag(cch, pszTag);
1034 }
1035 if (pszResult)
1036 {
1037 rc = rtUtf16LittleRecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
1038 if (RT_SUCCESS(rc))
1039 {
1040 *ppsz = pszResult;
1041 return rc;
1042 }
1043
1044 if (fShouldFree)
1045 RTStrFree(pszResult);
1046 }
1047 else
1048 rc = VERR_NO_STR_MEMORY;
1049 }
1050 return rc;
1051}
1052RT_EXPORT_SYMBOL(RTUtf16BigToUtf8ExTag);
1053
1054
1055RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
1056{
1057 size_t cch;
1058#ifdef RT_BIG_ENDIAN
1059 int rc = rtUtf16BigCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1060#else
1061 int rc = rtUtf16LittleCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1062#endif
1063 return RT_SUCCESS(rc) ? cch : 0;
1064}
1065RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
1066
1067
1068RTDECL(size_t) RTUtf16BigCalcUtf8Len(PCRTUTF16 pwsz)
1069{
1070 size_t cch;
1071 int rc = rtUtf16BigCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1072 return RT_SUCCESS(rc) ? cch : 0;
1073}
1074RT_EXPORT_SYMBOL(RTUtf16BigCalcUtf8Len);
1075
1076
1077RTDECL(size_t) RTUtf16LittleCalcUtf8Len(PCRTUTF16 pwsz)
1078{
1079 size_t cch;
1080 int rc = rtUtf16LittleCalcUtf8Length(pwsz, RTSTR_MAX, &cch);
1081 return RT_SUCCESS(rc) ? cch : 0;
1082}
1083RT_EXPORT_SYMBOL(RTUtf16LittleCalcUtf8Len);
1084
1085
1086RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1087{
1088 size_t cch;
1089#ifdef RT_BIG_ENDIAN
1090 int rc = rtUtf16BigCalcUtf8Length(pwsz, cwc, &cch);
1091#else
1092 int rc = rtUtf16LittleCalcUtf8Length(pwsz, cwc, &cch);
1093#endif
1094 if (pcch)
1095 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1096 return rc;
1097}
1098RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
1099
1100
1101RTDECL(int) RTUtf16BigCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1102{
1103 size_t cch;
1104 int rc = rtUtf16BigCalcUtf8Length(pwsz, cwc, &cch);
1105 if (pcch)
1106 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1107 return rc;
1108}
1109RT_EXPORT_SYMBOL(RTUtf16BigCalcUtf8LenEx);
1110
1111
1112RTDECL(int) RTUtf16LittleCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
1113{
1114 size_t cch;
1115 int rc = rtUtf16LittleCalcUtf8Length(pwsz, cwc, &cch);
1116 if (pcch)
1117 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1118 return rc;
1119}
1120RT_EXPORT_SYMBOL(RTUtf16LittleCalcUtf8LenEx);
1121
1122
1123RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
1124{
1125 const RTUTF16 wc = *pwsz;
1126
1127 /* simple */
1128 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1129 return wc;
1130 if (wc < 0xfffe)
1131 {
1132 /* surrogate pair */
1133 if (wc < 0xdc00)
1134 {
1135 const RTUTF16 wc2 = pwsz[1];
1136 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1137 {
1138 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1139 return uc;
1140 }
1141
1142 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1143 }
1144 else
1145 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1146 }
1147 else
1148 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1149 return RTUNICP_INVALID;
1150}
1151RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
1152
1153
1154RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1155{
1156 const RTUTF16 wc = **ppwsz;
1157
1158 /* simple */
1159 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1160 {
1161 (*ppwsz)++;
1162 *pCp = wc;
1163 return VINF_SUCCESS;
1164 }
1165
1166 int rc;
1167 if (wc < 0xfffe)
1168 {
1169 /* surrogate pair */
1170 if (wc < 0xdc00)
1171 {
1172 const RTUTF16 wc2 = (*ppwsz)[1];
1173 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1174 {
1175 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1176 *pCp = uc;
1177 (*ppwsz) += 2;
1178 return VINF_SUCCESS;
1179 }
1180
1181 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1182 }
1183 else
1184 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1185 rc = VERR_INVALID_UTF16_ENCODING;
1186 }
1187 else
1188 {
1189 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1190 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1191 }
1192 *pCp = RTUNICP_INVALID;
1193 (*ppwsz)++;
1194 return rc;
1195}
1196RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
1197
1198
1199RTDECL(int) RTUtf16GetCpNExInternal(PCRTUTF16 *ppwsz, size_t *pcwc, PRTUNICP pCp)
1200{
1201 int rc;
1202 const size_t cwc = *pcwc;
1203 if (cwc > 0)
1204 {
1205 PCRTUTF16 pwsz = *ppwsz;
1206 const RTUTF16 wc = **ppwsz;
1207
1208 /* simple */
1209 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1210 {
1211 *pCp = wc;
1212 *pcwc = cwc - 1;
1213 *ppwsz = pwsz + 1;
1214 return VINF_SUCCESS;
1215 }
1216
1217 if (wc < 0xfffe)
1218 {
1219 /* surrogate pair */
1220 if (wc < 0xdc00)
1221 {
1222 if (cwc >= 2)
1223 {
1224 const RTUTF16 wc2 = pwsz[1];
1225 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1226 {
1227 *pCp = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1228 *pcwc = cwc - 2;
1229 *ppwsz = pwsz + 2;
1230 return VINF_SUCCESS;
1231 }
1232
1233 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1234 }
1235 else
1236 RTStrAssertMsgFailed(("wc=%#08x - incomplete surrogate pair\n", wc));
1237 }
1238 else
1239 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1240 rc = VERR_INVALID_UTF16_ENCODING;
1241 }
1242 else
1243 {
1244 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1245 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1246 }
1247 *pcwc = cwc - 1;
1248 *ppwsz = pwsz + 1;
1249 }
1250 else
1251 rc = VERR_END_OF_STRING;
1252 *pCp = RTUNICP_INVALID;
1253 return rc;
1254}
1255RT_EXPORT_SYMBOL(RTUtf16GetCpNExInternal);
1256
1257
1258RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1259{
1260 const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
1261
1262 /* simple */
1263 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
1264 {
1265 (*ppwsz)++;
1266 *pCp = wc;
1267 return VINF_SUCCESS;
1268 }
1269
1270 int rc;
1271 if (wc < 0xfffe)
1272 {
1273 /* surrogate pair */
1274 if (wc < 0xdc00)
1275 {
1276 const RTUTF16 wc2 = RT_BE2H_U16((*ppwsz)[1]);
1277 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
1278 {
1279 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
1280 *pCp = uc;
1281 (*ppwsz) += 2;
1282 return VINF_SUCCESS;
1283 }
1284
1285 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
1286 }
1287 else
1288 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
1289 rc = VERR_INVALID_UTF16_ENCODING;
1290 }
1291 else
1292 {
1293 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
1294 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
1295 }
1296 *pCp = RTUNICP_INVALID;
1297 (*ppwsz)++;
1298 return rc;
1299}
1300RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
1301
1302
1303RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
1304{
1305 /* simple */
1306 if ( CodePoint < 0xd800
1307 || ( CodePoint > 0xdfff
1308 && CodePoint < 0xfffe))
1309 {
1310 *pwsz++ = (RTUTF16)CodePoint;
1311 return pwsz;
1312 }
1313
1314 /* surrogate pair */
1315 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
1316 {
1317 CodePoint -= 0x10000;
1318 *pwsz++ = 0xd800 | (CodePoint >> 10);
1319 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
1320 return pwsz;
1321 }
1322
1323 /* invalid code point. */
1324 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
1325 *pwsz++ = 0x7f;
1326 return pwsz;
1327}
1328RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
1329
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
ContactPrivacy policyTerms of Use