[1] | 1 | /* $Id: utf8-posix.cpp 98103 2023-01-17 14:15:46Z vboxsync $ */
|
---|
| 2 | /** @file
|
---|
[8245] | 3 | * IPRT - UTF-8 helpers, POSIX.
|
---|
[1] | 4 | */
|
---|
| 5 |
|
---|
| 6 | /*
|
---|
[98103] | 7 | * Copyright (C) 2006-2023 Oracle and/or its affiliates.
|
---|
[1] | 8 | *
|
---|
[96407] | 9 | * This file is part of VirtualBox base platform packages, as
|
---|
| 10 | * available from https://www.virtualbox.org.
|
---|
[5999] | 11 | *
|
---|
[96407] | 12 | * This program is free software; you can redistribute it and/or
|
---|
| 13 | * modify it under the terms of the GNU General Public License
|
---|
| 14 | * as published by the Free Software Foundation, in version 3 of the
|
---|
| 15 | * License.
|
---|
| 16 | *
|
---|
| 17 | * This program is distributed in the hope that it will be useful, but
|
---|
| 18 | * WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
| 20 | * General Public License for more details.
|
---|
| 21 | *
|
---|
| 22 | * You should have received a copy of the GNU General Public License
|
---|
| 23 | * along with this program; if not, see <https://www.gnu.org/licenses>.
|
---|
| 24 | *
|
---|
[5999] | 25 | * The contents of this file may alternatively be used under the terms
|
---|
| 26 | * of the Common Development and Distribution License Version 1.0
|
---|
[96407] | 27 | * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
|
---|
| 28 | * in the VirtualBox distribution, in which case the provisions of the
|
---|
[5999] | 29 | * CDDL are applicable instead of those of the GPL.
|
---|
| 30 | *
|
---|
| 31 | * You may elect to license modified versions of this file under the
|
---|
| 32 | * terms and conditions of either the GPL or the CDDL or both.
|
---|
[96407] | 33 | *
|
---|
| 34 | * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
|
---|
[1] | 35 | */
|
---|
| 36 |
|
---|
| 37 |
|
---|
[57358] | 38 | /*********************************************************************************************************************************
|
---|
| 39 | * Header Files *
|
---|
| 40 | *********************************************************************************************************************************/
|
---|
[1] | 41 | #include <iprt/string.h>
|
---|
[28903] | 42 | #include "internal/iprt.h"
|
---|
| 43 |
|
---|
[1] | 44 | #include <iprt/alloc.h>
|
---|
| 45 | #include <iprt/assert.h>
|
---|
[92671] | 46 | #include <iprt/ctype.h>
|
---|
[1] | 47 | #include <iprt/err.h>
|
---|
| 48 | #include <iprt/string.h>
|
---|
| 49 |
|
---|
| 50 | #include <errno.h>
|
---|
| 51 | #include <locale.h>
|
---|
[93543] | 52 | #ifdef RT_OS_DARWIN
|
---|
| 53 | # include <stdlib.h>
|
---|
| 54 | #endif
|
---|
[40651] | 55 |
|
---|
[40654] | 56 | /* iconv prototype changed with 165+ (thanks to PSARC/2010/160 Bugster 7037400) */
|
---|
[40651] | 57 | #if defined(RT_OS_SOLARIS)
|
---|
| 58 | # if !defined(_XPG6)
|
---|
[72624] | 59 | # define IPRT_XPG6_TMP_DEF
|
---|
[40651] | 60 | # define _XPG6
|
---|
| 61 | # endif
|
---|
| 62 | # if defined(__USE_LEGACY_PROTOTYPES__)
|
---|
[72624] | 63 | # define IPRT_LEGACY_PROTO_TMP_DEF
|
---|
[40651] | 64 | # undef __USE_LEGACY_PROTOTYPES__
|
---|
| 65 | # endif
|
---|
| 66 | #endif /* RT_OS_SOLARIS */
|
---|
| 67 |
|
---|
| 68 | # include <iconv.h>
|
---|
| 69 |
|
---|
| 70 | #if defined(RT_OS_SOLARIS)
|
---|
[72624] | 71 | # if defined(IPRT_XPG6_TMP_DEF)
|
---|
[40651] | 72 | # undef _XPG6
|
---|
[72624] | 73 | # undef IPRT_XPG6_TMP_DEF
|
---|
[40651] | 74 | # endif
|
---|
[72624] | 75 | # if defined(IPRT_LEGACY_PROTO_TMP_DEF)
|
---|
[40651] | 76 | # define __USE_LEGACY_PROTOTYPES__
|
---|
[72624] | 77 | # undef IPRT_LEGACY_PROTO_TMP_DEF
|
---|
[40651] | 78 | # endif
|
---|
| 79 | #endif /* RT_OS_SOLARIS */
|
---|
| 80 |
|
---|
[1] | 81 | #include <wctype.h>
|
---|
| 82 |
|
---|
[28903] | 83 | #include <langinfo.h>
|
---|
[1] | 84 |
|
---|
[20822] | 85 | #include "internal/alignmentchecks.h"
|
---|
[30294] | 86 | #include "internal/string.h"
|
---|
[28903] | 87 | #ifdef RT_WITH_ICONV_CACHE
|
---|
| 88 | # include "internal/thread.h"
|
---|
| 89 | AssertCompile(sizeof(iconv_t) <= sizeof(void *));
|
---|
| 90 | #endif
|
---|
[20822] | 91 |
|
---|
| 92 |
|
---|
[68578] | 93 | /* There are different opinions about the constness of the input buffer. */
|
---|
| 94 | #if defined(RT_OS_LINUX) || defined(RT_OS_HAIKU) || defined(RT_OS_SOLARIS) \
|
---|
| 95 | || (defined(RT_OS_DARWIN) && defined(_DARWIN_FEATURE_UNIX_CONFORMANCE))
|
---|
| 96 | # define NON_CONST_ICONV_INPUT
|
---|
| 97 | #endif
|
---|
| 98 | #ifdef RT_OS_FREEBSD
|
---|
| 99 | # include <sys/param.h>
|
---|
| 100 | # if __FreeBSD_version >= 1002000 /* Changed around 10.2.2 (https://svnweb.freebsd.org/base?view=revision&revision=281550) */
|
---|
| 101 | # define NON_CONST_ICONV_INPUT
|
---|
| 102 | # else
|
---|
| 103 | # error __FreeBSD_version__
|
---|
| 104 | # endif
|
---|
| 105 | #endif
|
---|
[82631] | 106 | #ifdef RT_OS_NETBSD
|
---|
| 107 | /* iconv constness was changed on 2019-10-24, shortly after 9.99.17 */
|
---|
| 108 | # include <sys/param.h>
|
---|
| 109 | # if __NetBSD_Prereq__(9,99,18)
|
---|
| 110 | # define NON_CONST_ICONV_INPUT
|
---|
| 111 | # endif
|
---|
| 112 | #endif
|
---|
[68578] | 113 |
|
---|
| 114 |
|
---|
[28903] | 115 | /**
|
---|
| 116 | * Gets the codeset of the current locale (LC_CTYPE).
|
---|
| 117 | *
|
---|
| 118 | * @returns Pointer to read-only string with the codeset name.
|
---|
| 119 | */
|
---|
[36555] | 120 | DECLHIDDEN(const char *) rtStrGetLocaleCodeset(void)
|
---|
[28903] | 121 | {
|
---|
[93543] | 122 | #ifdef RT_OS_DARWIN
|
---|
| 123 | /*
|
---|
| 124 | * @bugref{10153}: If no locale specified in the environment (typically the
|
---|
| 125 | * case when launched via Finder, LaunchPad or similar) default to UTF-8.
|
---|
| 126 | */
|
---|
| 127 | static int8_t volatile s_fIsUtf8 = -1;
|
---|
| 128 | int8_t fIsUtf8 = s_fIsUtf8;
|
---|
| 129 | if (fIsUtf8)
|
---|
| 130 | {
|
---|
| 131 | if (fIsUtf8 == true)
|
---|
| 132 | return "UTF-8";
|
---|
| 133 |
|
---|
| 134 | /* Initialize: */
|
---|
| 135 | fIsUtf8 = true;
|
---|
| 136 | static const char * const s_papszVariables[] = { "LC_ALL", "LC_CTYPE", "LANG" };
|
---|
| 137 | for (size_t i = 0; i < RT_ELEMENTS(s_papszVariables); i++)
|
---|
| 138 | {
|
---|
| 139 | const char *pszValue = getenv(s_papszVariables[i]);
|
---|
| 140 | if (pszValue && *pszValue)
|
---|
| 141 | {
|
---|
| 142 | fIsUtf8 = false;
|
---|
| 143 | break;
|
---|
| 144 | }
|
---|
| 145 | }
|
---|
| 146 | s_fIsUtf8 = fIsUtf8;
|
---|
| 147 | if (fIsUtf8 == true)
|
---|
| 148 | return "UTF-8";
|
---|
| 149 | }
|
---|
| 150 | #endif
|
---|
[28903] | 151 | return nl_langinfo(CODESET);
|
---|
| 152 | }
|
---|
[1] | 153 |
|
---|
| 154 |
|
---|
[92671] | 155 | /**
|
---|
| 156 | * Checks if the codeset specified by current locale (LC_CTYPE) is UTF-8.
|
---|
| 157 | *
|
---|
| 158 | * @returns true if UTF-8, false if not.
|
---|
| 159 | */
|
---|
| 160 | DECLHIDDEN(bool) rtStrIsLocaleCodesetUtf8(void)
|
---|
| 161 | {
|
---|
| 162 | return rtStrIsCodesetUtf8(rtStrGetLocaleCodeset());
|
---|
| 163 | }
|
---|
| 164 |
|
---|
| 165 |
|
---|
| 166 | /**
|
---|
| 167 | * Checks if @a pszCodeset specified UTF-8.
|
---|
| 168 | *
|
---|
| 169 | * @returns true if UTF-8, false if not.
|
---|
| 170 | * @param pszCodeset Codeset to test.
|
---|
| 171 | */
|
---|
| 172 | DECLHIDDEN(bool) rtStrIsCodesetUtf8(const char *pszCodeset)
|
---|
| 173 | {
|
---|
| 174 | if (pszCodeset)
|
---|
| 175 | {
|
---|
| 176 | /* Skip leading spaces just in case: */
|
---|
| 177 | while (RT_C_IS_SPACE(*pszCodeset))
|
---|
| 178 | pszCodeset++;
|
---|
| 179 |
|
---|
| 180 | /* If prefixed by 'ISO-10646/' skip that (iconv access this, dunno about
|
---|
| 181 | LC_CTYPE et al., but play it safe): */
|
---|
| 182 | if ( strncmp(pszCodeset, RT_STR_TUPLE("ISO-10646/")) == 0
|
---|
| 183 | || strncmp(pszCodeset, RT_STR_TUPLE("iso-10646/")) == 0)
|
---|
| 184 | pszCodeset += sizeof("ISO-10646/") - 1;
|
---|
| 185 |
|
---|
| 186 | /* Match 'utf': */
|
---|
| 187 | if ( (pszCodeset[0] == 'u' || pszCodeset[0] == 'U')
|
---|
| 188 | && (pszCodeset[1] == 't' || pszCodeset[1] == 'T')
|
---|
| 189 | && (pszCodeset[2] == 'f' || pszCodeset[2] == 'F'))
|
---|
| 190 | {
|
---|
| 191 | pszCodeset += 3;
|
---|
| 192 |
|
---|
| 193 | /* Treat the dash as optional: */
|
---|
| 194 | if (*pszCodeset == '-')
|
---|
| 195 | pszCodeset++;
|
---|
| 196 |
|
---|
| 197 | /* Match '8': */
|
---|
| 198 | if (*pszCodeset == '8')
|
---|
| 199 | {
|
---|
| 200 | do
|
---|
| 201 | pszCodeset++;
|
---|
| 202 | while (RT_C_IS_SPACE(*pszCodeset));
|
---|
| 203 |
|
---|
| 204 | /* We ignore modifiers here (e.g. "[be_BY.]utf8@latin"). */
|
---|
| 205 | if (!*pszCodeset || *pszCodeset == '@')
|
---|
| 206 | return true;
|
---|
| 207 | }
|
---|
| 208 | }
|
---|
| 209 | }
|
---|
| 210 | return false;
|
---|
| 211 | }
|
---|
| 212 |
|
---|
| 213 |
|
---|
| 214 |
|
---|
[28903] | 215 | #ifdef RT_WITH_ICONV_CACHE
|
---|
| 216 |
|
---|
[1] | 217 | /**
|
---|
[28903] | 218 | * Initializes the iconv handle cache associated with a thread.
|
---|
| 219 | *
|
---|
| 220 | * @param pThread The thread in question.
|
---|
| 221 | */
|
---|
[36555] | 222 | DECLHIDDEN(void) rtStrIconvCacheInit(PRTTHREADINT pThread)
|
---|
[28903] | 223 | {
|
---|
| 224 | for (size_t i = 0; i < RT_ELEMENTS(pThread->ahIconvs); i++)
|
---|
| 225 | pThread->ahIconvs[i] = (iconv_t)-1;
|
---|
| 226 | }
|
---|
| 227 |
|
---|
| 228 | /**
|
---|
| 229 | * Destroys the iconv handle cache associated with a thread.
|
---|
| 230 | *
|
---|
| 231 | * @param pThread The thread in question.
|
---|
| 232 | */
|
---|
[36555] | 233 | DECLHIDDEN(void) rtStrIconvCacheDestroy(PRTTHREADINT pThread)
|
---|
[28903] | 234 | {
|
---|
| 235 | for (size_t i = 0; i < RT_ELEMENTS(pThread->ahIconvs); i++)
|
---|
| 236 | {
|
---|
[28904] | 237 | iconv_t hIconv = (iconv_t)pThread->ahIconvs[i];
|
---|
| 238 | pThread->ahIconvs[i] = (iconv_t)-1;
|
---|
[28903] | 239 | if (hIconv != (iconv_t)-1)
|
---|
| 240 | iconv_close(hIconv);
|
---|
| 241 | }
|
---|
| 242 | }
|
---|
| 243 |
|
---|
| 244 |
|
---|
| 245 | /**
|
---|
[1] | 246 | * Converts a string from one charset to another.
|
---|
| 247 | *
|
---|
| 248 | * @returns iprt status code.
|
---|
| 249 | * @param pvInput Pointer to intput string.
|
---|
| 250 | * @param cbInput Size (in bytes) of input string. Excludes any terminators.
|
---|
| 251 | * @param pszInputCS Codeset of the input string.
|
---|
| 252 | * @param ppvOutput Pointer to pointer to output buffer if cbOutput > 0.
|
---|
| 253 | * If cbOutput is 0 this is where the pointer to the allocated
|
---|
| 254 | * buffer is stored.
|
---|
| 255 | * @param cbOutput Size of the passed in buffer.
|
---|
| 256 | * @param pszOutputCS Codeset of the input string.
|
---|
| 257 | * @param cFactor Input vs. output size factor.
|
---|
[28903] | 258 | * @param phIconv Pointer to the cache entry.
|
---|
[1] | 259 | */
|
---|
[28903] | 260 | static int rtstrConvertCached(const void *pvInput, size_t cbInput, const char *pszInputCS,
|
---|
| 261 | void **ppvOutput, size_t cbOutput, const char *pszOutputCS,
|
---|
| 262 | unsigned cFactor, iconv_t *phIconv)
|
---|
[1] | 263 | {
|
---|
| 264 | /*
|
---|
| 265 | * Allocate buffer
|
---|
| 266 | */
|
---|
[20822] | 267 | bool fUcs2Term;
|
---|
[1] | 268 | void *pvOutput;
|
---|
| 269 | size_t cbOutput2;
|
---|
| 270 | if (!cbOutput)
|
---|
| 271 | {
|
---|
| 272 | cbOutput2 = cbInput * cFactor;
|
---|
[7426] | 273 | pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
|
---|
[1] | 274 | if (!pvOutput)
|
---|
| 275 | return VERR_NO_TMP_MEMORY;
|
---|
[20822] | 276 | fUcs2Term = true;
|
---|
[1] | 277 | }
|
---|
| 278 | else
|
---|
| 279 | {
|
---|
| 280 | pvOutput = *ppvOutput;
|
---|
[28903] | 281 | fUcs2Term = !strcmp(pszOutputCS, "UCS-2")
|
---|
| 282 | || !strcmp(pszOutputCS, "UTF-16")
|
---|
| 283 | || !strcmp(pszOutputCS, "ucs-2")
|
---|
| 284 | || !strcmp(pszOutputCS, "utf-16");
|
---|
| 285 | cbOutput2 = cbOutput - (fUcs2Term ? sizeof(RTUTF16) : 1);
|
---|
| 286 | if (cbOutput2 > cbOutput)
|
---|
| 287 | return VERR_BUFFER_OVERFLOW;
|
---|
| 288 | }
|
---|
| 289 |
|
---|
| 290 | /*
|
---|
| 291 | * Use a loop here to retry with bigger buffers.
|
---|
| 292 | */
|
---|
| 293 | for (unsigned cTries = 10; cTries > 0; cTries--)
|
---|
| 294 | {
|
---|
| 295 | /*
|
---|
| 296 | * Create conversion object if necessary.
|
---|
| 297 | */
|
---|
| 298 | iconv_t hIconv = (iconv_t)*phIconv;
|
---|
| 299 | if (hIconv == (iconv_t)-1)
|
---|
| 300 | {
|
---|
[93544] | 301 | #if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD) || /* @bugref{10153}: Default to UTF-8: */ defined(RT_OS_DARWIN)
|
---|
[62946] | 302 | /* Some systems don't grok empty codeset strings, so help them find the current codeset. */
|
---|
[28928] | 303 | if (!*pszInputCS)
|
---|
| 304 | pszInputCS = rtStrGetLocaleCodeset();
|
---|
| 305 | if (!*pszOutputCS)
|
---|
| 306 | pszOutputCS = rtStrGetLocaleCodeset();
|
---|
| 307 | #endif
|
---|
[28903] | 308 | IPRT_ALIGNMENT_CHECKS_DISABLE(); /* glibc causes trouble */
|
---|
| 309 | *phIconv = hIconv = iconv_open(pszOutputCS, pszInputCS);
|
---|
| 310 | IPRT_ALIGNMENT_CHECKS_ENABLE();
|
---|
| 311 | }
|
---|
| 312 | if (hIconv != (iconv_t)-1)
|
---|
| 313 | {
|
---|
| 314 | /*
|
---|
| 315 | * Do the conversion.
|
---|
| 316 | */
|
---|
| 317 | size_t cbInLeft = cbInput;
|
---|
| 318 | size_t cbOutLeft = cbOutput2;
|
---|
| 319 | const void *pvInputLeft = pvInput;
|
---|
| 320 | void *pvOutputLeft = pvOutput;
|
---|
[45260] | 321 | size_t cchNonRev;
|
---|
[68578] | 322 | #ifdef NON_CONST_ICONV_INPUT
|
---|
[45260] | 323 | cchNonRev = iconv(hIconv, (char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
|
---|
[28903] | 324 | #else
|
---|
[45260] | 325 | cchNonRev = iconv(hIconv, (const char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
|
---|
[28903] | 326 | #endif
|
---|
[45260] | 327 | if (cchNonRev != (size_t)-1)
|
---|
[28903] | 328 | {
|
---|
| 329 | if (!cbInLeft)
|
---|
| 330 | {
|
---|
| 331 | /*
|
---|
| 332 | * We're done, just add the terminator and return.
|
---|
| 333 | * (Two terminators to support UCS-2 output, too.)
|
---|
| 334 | */
|
---|
| 335 | ((char *)pvOutputLeft)[0] = '\0';
|
---|
| 336 | if (fUcs2Term)
|
---|
| 337 | ((char *)pvOutputLeft)[1] = '\0';
|
---|
| 338 | *ppvOutput = pvOutput;
|
---|
[45260] | 339 | if (cchNonRev == 0)
|
---|
| 340 | return VINF_SUCCESS;
|
---|
| 341 | return VWRN_NO_TRANSLATION;
|
---|
[28903] | 342 | }
|
---|
| 343 | errno = E2BIG;
|
---|
| 344 | }
|
---|
| 345 |
|
---|
| 346 | /*
|
---|
| 347 | * If we failed because of output buffer space we'll
|
---|
| 348 | * increase the output buffer size and retry.
|
---|
| 349 | */
|
---|
| 350 | if (errno == E2BIG)
|
---|
| 351 | {
|
---|
| 352 | if (!cbOutput)
|
---|
| 353 | {
|
---|
| 354 | RTMemTmpFree(pvOutput);
|
---|
| 355 | cbOutput2 *= 2;
|
---|
| 356 | pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
|
---|
| 357 | if (!pvOutput)
|
---|
| 358 | return VERR_NO_TMP_MEMORY;
|
---|
| 359 | continue;
|
---|
| 360 | }
|
---|
| 361 | return VERR_BUFFER_OVERFLOW;
|
---|
| 362 | }
|
---|
| 363 |
|
---|
| 364 | /*
|
---|
| 365 | * Close the handle on all other errors to make sure we won't carry
|
---|
| 366 | * any bad state with us.
|
---|
| 367 | */
|
---|
| 368 | *phIconv = (iconv_t)-1;
|
---|
| 369 | iconv_close(hIconv);
|
---|
| 370 | }
|
---|
| 371 | break;
|
---|
| 372 | }
|
---|
| 373 |
|
---|
| 374 | /* failure */
|
---|
| 375 | if (!cbOutput)
|
---|
| 376 | RTMemTmpFree(pvOutput);
|
---|
| 377 | return VERR_NO_TRANSLATION;
|
---|
| 378 | }
|
---|
| 379 |
|
---|
| 380 | #endif /* RT_WITH_ICONV_CACHE */
|
---|
| 381 |
|
---|
| 382 | /**
|
---|
| 383 | * Converts a string from one charset to another without using the handle cache.
|
---|
| 384 | *
|
---|
| 385 | * @returns IPRT status code.
|
---|
| 386 | *
|
---|
| 387 | * @param pvInput Pointer to intput string.
|
---|
| 388 | * @param cbInput Size (in bytes) of input string. Excludes any terminators.
|
---|
| 389 | * @param pszInputCS Codeset of the input string.
|
---|
| 390 | * @param ppvOutput Pointer to pointer to output buffer if cbOutput > 0.
|
---|
| 391 | * If cbOutput is 0 this is where the pointer to the allocated
|
---|
| 392 | * buffer is stored.
|
---|
| 393 | * @param cbOutput Size of the passed in buffer.
|
---|
| 394 | * @param pszOutputCS Codeset of the input string.
|
---|
| 395 | * @param cFactor Input vs. output size factor.
|
---|
| 396 | */
|
---|
| 397 | static int rtStrConvertUncached(const void *pvInput, size_t cbInput, const char *pszInputCS,
|
---|
| 398 | void **ppvOutput, size_t cbOutput, const char *pszOutputCS,
|
---|
| 399 | unsigned cFactor)
|
---|
| 400 | {
|
---|
| 401 | /*
|
---|
| 402 | * Allocate buffer
|
---|
| 403 | */
|
---|
| 404 | bool fUcs2Term;
|
---|
| 405 | void *pvOutput;
|
---|
| 406 | size_t cbOutput2;
|
---|
| 407 | if (!cbOutput)
|
---|
| 408 | {
|
---|
| 409 | cbOutput2 = cbInput * cFactor;
|
---|
| 410 | pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
|
---|
| 411 | if (!pvOutput)
|
---|
| 412 | return VERR_NO_TMP_MEMORY;
|
---|
| 413 | fUcs2Term = true;
|
---|
| 414 | }
|
---|
| 415 | else
|
---|
| 416 | {
|
---|
| 417 | pvOutput = *ppvOutput;
|
---|
[20822] | 418 | fUcs2Term = !strcmp(pszOutputCS, "UCS-2");
|
---|
| 419 | cbOutput2 = cbOutput - (fUcs2Term ? sizeof(RTUTF16) : 1);
|
---|
[1] | 420 | if (cbOutput2 > cbOutput)
|
---|
| 421 | return VERR_BUFFER_OVERFLOW;
|
---|
| 422 | }
|
---|
| 423 |
|
---|
| 424 | /*
|
---|
| 425 | * Use a loop here to retry with bigger buffers.
|
---|
| 426 | */
|
---|
| 427 | for (unsigned cTries = 10; cTries > 0; cTries--)
|
---|
| 428 | {
|
---|
| 429 | /*
|
---|
| 430 | * Create conversion object.
|
---|
| 431 | */
|
---|
[93544] | 432 | #if defined(RT_OS_SOLARIS) || defined(RT_OS_NETBSD) || /* @bugref{10153}: Default to UTF-8: */ defined(RT_OS_DARWIN)
|
---|
[62946] | 433 | /* Some systems don't grok empty codeset strings, so help them find the current codeset. */
|
---|
[3980] | 434 | if (!*pszInputCS)
|
---|
[28903] | 435 | pszInputCS = rtStrGetLocaleCodeset();
|
---|
[3980] | 436 | if (!*pszOutputCS)
|
---|
[28903] | 437 | pszOutputCS = rtStrGetLocaleCodeset();
|
---|
[3980] | 438 | #endif
|
---|
[20822] | 439 | IPRT_ALIGNMENT_CHECKS_DISABLE(); /* glibc causes trouble */
|
---|
[7426] | 440 | iconv_t icHandle = iconv_open(pszOutputCS, pszInputCS);
|
---|
[20822] | 441 | IPRT_ALIGNMENT_CHECKS_ENABLE();
|
---|
[1] | 442 | if (icHandle != (iconv_t)-1)
|
---|
| 443 | {
|
---|
| 444 | /*
|
---|
| 445 | * Do the conversion.
|
---|
| 446 | */
|
---|
| 447 | size_t cbInLeft = cbInput;
|
---|
| 448 | size_t cbOutLeft = cbOutput2;
|
---|
| 449 | const void *pvInputLeft = pvInput;
|
---|
| 450 | void *pvOutputLeft = pvOutput;
|
---|
[45260] | 451 | size_t cchNonRev;
|
---|
[68578] | 452 | #ifdef NON_CONST_ICONV_INPUT
|
---|
[45260] | 453 | cchNonRev = iconv(icHandle, (char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
|
---|
[1] | 454 | #else
|
---|
[45260] | 455 | cchNonRev = iconv(icHandle, (const char **)&pvInputLeft, &cbInLeft, (char **)&pvOutputLeft, &cbOutLeft);
|
---|
[1] | 456 | #endif
|
---|
[45260] | 457 | if (cchNonRev != (size_t)-1)
|
---|
[1] | 458 | {
|
---|
| 459 | if (!cbInLeft)
|
---|
| 460 | {
|
---|
| 461 | /*
|
---|
| 462 | * We're done, just add the terminator and return.
|
---|
| 463 | * (Two terminators to support UCS-2 output, too.)
|
---|
| 464 | */
|
---|
| 465 | iconv_close(icHandle);
|
---|
[20822] | 466 | ((char *)pvOutputLeft)[0] = '\0';
|
---|
| 467 | if (fUcs2Term)
|
---|
| 468 | ((char *)pvOutputLeft)[1] = '\0';
|
---|
[1] | 469 | *ppvOutput = pvOutput;
|
---|
[45260] | 470 | if (cchNonRev == 0)
|
---|
| 471 | return VINF_SUCCESS;
|
---|
| 472 | return VWRN_NO_TRANSLATION;
|
---|
[1] | 473 | }
|
---|
[20822] | 474 | errno = E2BIG;
|
---|
[1] | 475 | }
|
---|
| 476 | iconv_close(icHandle);
|
---|
| 477 |
|
---|
| 478 | /*
|
---|
| 479 | * If we failed because of output buffer space we'll
|
---|
| 480 | * increase the output buffer size and retry.
|
---|
| 481 | */
|
---|
| 482 | if (errno == E2BIG)
|
---|
| 483 | {
|
---|
| 484 | if (!cbOutput)
|
---|
| 485 | {
|
---|
| 486 | RTMemTmpFree(pvOutput);
|
---|
| 487 | cbOutput2 *= 2;
|
---|
[20822] | 488 | pvOutput = RTMemTmpAlloc(cbOutput2 + sizeof(RTUTF16));
|
---|
[1] | 489 | if (!pvOutput)
|
---|
| 490 | return VERR_NO_TMP_MEMORY;
|
---|
| 491 | continue;
|
---|
| 492 | }
|
---|
| 493 | return VERR_BUFFER_OVERFLOW;
|
---|
| 494 | }
|
---|
| 495 | }
|
---|
| 496 | break;
|
---|
| 497 | }
|
---|
| 498 |
|
---|
| 499 | /* failure */
|
---|
| 500 | if (!cbOutput)
|
---|
| 501 | RTMemTmpFree(pvOutput);
|
---|
| 502 | return VERR_NO_TRANSLATION;
|
---|
| 503 | }
|
---|
| 504 |
|
---|
| 505 |
|
---|
| 506 | /**
|
---|
[28903] | 507 | * Wrapper that selects rtStrConvertCached or rtStrConvertUncached.
|
---|
| 508 | *
|
---|
| 509 | * @returns IPRT status code.
|
---|
| 510 | *
|
---|
| 511 | * @param pszInput Pointer to intput string.
|
---|
| 512 | * @param cchInput Size (in bytes) of input string. Excludes any
|
---|
| 513 | * terminators.
|
---|
| 514 | * @param pszInputCS Codeset of the input string.
|
---|
| 515 | * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
|
---|
| 516 | * If cbOutput is 0 this is where the pointer to the
|
---|
| 517 | * allocated buffer is stored.
|
---|
| 518 | * @param cbOutput Size of the passed in buffer.
|
---|
| 519 | * @param pszOutputCS Codeset of the input string.
|
---|
| 520 | * @param cFactor Input vs. output size factor.
|
---|
| 521 | * @param enmCacheIdx The iconv cache index.
|
---|
| 522 | */
|
---|
| 523 | DECLINLINE(int) rtStrConvertWrapper(const char *pchInput, size_t cchInput, const char *pszInputCS,
|
---|
| 524 | char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
|
---|
| 525 | unsigned cFactor, RTSTRICONV enmCacheIdx)
|
---|
| 526 | {
|
---|
| 527 | #ifdef RT_WITH_ICONV_CACHE
|
---|
| 528 | RTTHREAD hSelf = RTThreadSelf();
|
---|
| 529 | if (hSelf != NIL_RTTHREAD)
|
---|
| 530 | {
|
---|
| 531 | PRTTHREADINT pThread = rtThreadGet(hSelf);
|
---|
[31961] | 532 | if (pThread)
|
---|
| 533 | {
|
---|
| 534 | if ((pThread->fIntFlags & (RTTHREADINT_FLAGS_ALIEN | RTTHREADINT_FLAGS_MAIN)) != RTTHREADINT_FLAGS_ALIEN)
|
---|
| 535 | {
|
---|
| 536 | int rc = rtstrConvertCached(pchInput, cchInput, pszInputCS,
|
---|
| 537 | (void **)ppszOutput, cbOutput, pszOutputCS,
|
---|
| 538 | cFactor, (iconv_t *)&pThread->ahIconvs[enmCacheIdx]);
|
---|
| 539 | rtThreadRelease(pThread);
|
---|
| 540 | return rc;
|
---|
| 541 | }
|
---|
| 542 | rtThreadRelease(pThread);
|
---|
| 543 | }
|
---|
[28903] | 544 | }
|
---|
| 545 | #endif
|
---|
| 546 | return rtStrConvertUncached(pchInput, cchInput, pszInputCS,
|
---|
| 547 | (void **)ppszOutput, cbOutput, pszOutputCS,
|
---|
| 548 | cFactor);
|
---|
| 549 | }
|
---|
| 550 |
|
---|
| 551 |
|
---|
| 552 | /**
|
---|
| 553 | * Internal API for use by the path conversion code.
|
---|
| 554 | *
|
---|
| 555 | * @returns IPRT status code.
|
---|
| 556 | *
|
---|
| 557 | * @param pszInput Pointer to intput string.
|
---|
| 558 | * @param cchInput Size (in bytes) of input string. Excludes any
|
---|
| 559 | * terminators.
|
---|
| 560 | * @param pszInputCS Codeset of the input string.
|
---|
| 561 | * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
|
---|
| 562 | * If cbOutput is 0 this is where the pointer to the
|
---|
| 563 | * allocated buffer is stored.
|
---|
| 564 | * @param cbOutput Size of the passed in buffer.
|
---|
| 565 | * @param pszOutputCS Codeset of the input string.
|
---|
| 566 | * @param cFactor Input vs. output size factor.
|
---|
| 567 | * @param enmCacheIdx The iconv cache index.
|
---|
| 568 | */
|
---|
[36555] | 569 | DECLHIDDEN(int) rtStrConvert(const char *pchInput, size_t cchInput, const char *pszInputCS,
|
---|
| 570 | char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
|
---|
| 571 | unsigned cFactor, RTSTRICONV enmCacheIdx)
|
---|
[28903] | 572 | {
|
---|
| 573 | Assert(enmCacheIdx >= 0 && enmCacheIdx < RTSTRICONV_END);
|
---|
| 574 | return rtStrConvertWrapper(pchInput, cchInput, pszInputCS,
|
---|
| 575 | ppszOutput, cbOutput, pszOutputCS,
|
---|
| 576 | cFactor, enmCacheIdx);
|
---|
| 577 | }
|
---|
| 578 |
|
---|
| 579 |
|
---|
[92671] | 580 | /**
|
---|
| 581 | * Initializes a local conversion cache for use with rtStrLocalCacheConvert.
|
---|
| 582 | *
|
---|
| 583 | * Call rtStrLocalCacheDelete when done.
|
---|
| 584 | */
|
---|
| 585 | DECLHIDDEN(void) rtStrLocalCacheInit(void **ppvTmpCache)
|
---|
| 586 | {
|
---|
| 587 | *ppvTmpCache = (iconv_t)-1;
|
---|
| 588 | }
|
---|
| 589 |
|
---|
| 590 |
|
---|
| 591 | /**
|
---|
| 592 | * Cleans up a local conversion cache.
|
---|
| 593 | */
|
---|
| 594 | DECLHIDDEN(void) rtStrLocalCacheDelete(void **ppvTmpCache)
|
---|
| 595 | {
|
---|
| 596 | #ifdef RT_WITH_ICONV_CACHE
|
---|
| 597 | iconv_t icHandle = (iconv_t)*ppvTmpCache;
|
---|
| 598 | if (icHandle != (iconv_t)-1)
|
---|
| 599 | iconv_close(icHandle);
|
---|
| 600 | #endif
|
---|
| 601 | *ppvTmpCache = (iconv_t)-1;
|
---|
| 602 | }
|
---|
| 603 |
|
---|
| 604 |
|
---|
| 605 | /**
|
---|
| 606 | * Internal API for use by the process creation conversion code.
|
---|
| 607 | *
|
---|
| 608 | * @returns IPRT status code.
|
---|
| 609 | *
|
---|
| 610 | * @param pszInput Pointer to intput string.
|
---|
| 611 | * @param cchInput Size (in bytes) of input string. Excludes any
|
---|
| 612 | * terminators.
|
---|
| 613 | * @param pszInputCS Codeset of the input string.
|
---|
| 614 | * @param ppszOutput Pointer to pointer to output buffer if cbOutput > 0.
|
---|
| 615 | * If cbOutput is 0 this is where the pointer to the
|
---|
| 616 | * allocated buffer is stored.
|
---|
| 617 | * @param cbOutput Size of the passed in buffer.
|
---|
| 618 | * @param pszOutputCS Codeset of the input string.
|
---|
| 619 | * @param ppvTmpCache Pointer to local temporary cache. Must be
|
---|
| 620 | * initialized by calling rtStrLocalCacheInit and
|
---|
| 621 | * cleaned up afterwards by rtStrLocalCacheDelete.
|
---|
| 622 | * Optional.
|
---|
| 623 | */
|
---|
| 624 | DECLHIDDEN(int) rtStrLocalCacheConvert(const char *pchInput, size_t cchInput, const char *pszInputCS,
|
---|
| 625 | char **ppszOutput, size_t cbOutput, const char *pszOutputCS,
|
---|
| 626 | void **ppvTmpCache)
|
---|
| 627 | {
|
---|
| 628 | #ifdef RT_WITH_ICONV_CACHE
|
---|
| 629 | if (ppvTmpCache)
|
---|
| 630 | return rtstrConvertCached(pchInput, cchInput, pszInputCS, (void **)ppszOutput, cbOutput, pszOutputCS,
|
---|
| 631 | 1 /*cFactor*/, (iconv_t *)ppvTmpCache);
|
---|
| 632 | #else
|
---|
| 633 | RT_NOREF(ppvTmpCache);
|
---|
| 634 | #endif
|
---|
| 635 |
|
---|
| 636 | return rtStrConvertUncached(pchInput, cchInput, pszInputCS, (void **)ppszOutput, cbOutput, pszOutputCS, 1 /*cFactor*/);
|
---|
| 637 | }
|
---|
| 638 |
|
---|
| 639 |
|
---|
[31157] | 640 | RTR3DECL(int) RTStrUtf8ToCurrentCPTag(char **ppszString, const char *pszString, const char *pszTag)
|
---|
[1] | 641 | {
|
---|
| 642 | Assert(ppszString);
|
---|
| 643 | Assert(pszString);
|
---|
| 644 | *ppszString = NULL;
|
---|
| 645 |
|
---|
| 646 | /*
|
---|
| 647 | * Assume result string length is not longer than UTF-8 string.
|
---|
| 648 | */
|
---|
| 649 | size_t cch = strlen(pszString);
|
---|
| 650 | if (cch <= 0)
|
---|
| 651 | {
|
---|
| 652 | /* zero length string passed. */
|
---|
[31157] | 653 | *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
|
---|
[1] | 654 | if (*ppszString)
|
---|
| 655 | return VINF_SUCCESS;
|
---|
| 656 | return VERR_NO_TMP_MEMORY;
|
---|
| 657 | }
|
---|
[28903] | 658 | return rtStrConvertWrapper(pszString, cch, "UTF-8", ppszString, 0, "", 1, RTSTRICONV_UTF8_TO_LOCALE);
|
---|
[1] | 659 | }
|
---|
| 660 |
|
---|
| 661 |
|
---|
[80764] | 662 | RTR3DECL(int) RTStrUtf8ToCurrentCPExTag(char **ppszString, const char *pszString, size_t cchString, const char *pszTag)
|
---|
| 663 | {
|
---|
| 664 | Assert(ppszString);
|
---|
| 665 | Assert(pszString);
|
---|
| 666 | *ppszString = NULL;
|
---|
| 667 |
|
---|
| 668 | /*
|
---|
| 669 | * Assume result string length is not longer than UTF-8 string.
|
---|
| 670 | */
|
---|
| 671 | cchString = RTStrNLen(pszString, cchString);
|
---|
| 672 | if (cchString < 1)
|
---|
| 673 | {
|
---|
| 674 | /* zero length string passed. */
|
---|
| 675 | *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
|
---|
| 676 | if (*ppszString)
|
---|
| 677 | return VINF_SUCCESS;
|
---|
| 678 | return VERR_NO_TMP_MEMORY;
|
---|
| 679 | }
|
---|
| 680 | return rtStrConvertWrapper(pszString, cchString, "UTF-8", ppszString, 0, "", 1, RTSTRICONV_UTF8_TO_LOCALE);
|
---|
| 681 | }
|
---|
| 682 |
|
---|
| 683 |
|
---|
[31157] | 684 | RTR3DECL(int) RTStrCurrentCPToUtf8Tag(char **ppszString, const char *pszString, const char *pszTag)
|
---|
[1] | 685 | {
|
---|
| 686 | Assert(ppszString);
|
---|
| 687 | Assert(pszString);
|
---|
| 688 | *ppszString = NULL;
|
---|
| 689 |
|
---|
| 690 | /*
|
---|
[28903] | 691 | * Attempt with UTF-8 length of 2x the native length.
|
---|
[1] | 692 | */
|
---|
| 693 | size_t cch = strlen(pszString);
|
---|
| 694 | if (cch <= 0)
|
---|
| 695 | {
|
---|
| 696 | /* zero length string passed. */
|
---|
[31157] | 697 | *ppszString = (char *)RTMemTmpAllocZTag(sizeof(char), pszTag);
|
---|
[1] | 698 | if (*ppszString)
|
---|
| 699 | return VINF_SUCCESS;
|
---|
| 700 | return VERR_NO_TMP_MEMORY;
|
---|
| 701 | }
|
---|
[28903] | 702 | return rtStrConvertWrapper(pszString, cch, "", ppszString, 0, "UTF-8", 2, RTSTRICONV_LOCALE_TO_UTF8);
|
---|
[1] | 703 | }
|
---|
| 704 |
|
---|
[93640] | 705 |
|
---|
| 706 | RTR3DECL(int) RTStrConsoleCPToUtf8Tag(char **ppszString, const char *pszString, const char *pszTag)
|
---|
| 707 | {
|
---|
| 708 | return RTStrCurrentCPToUtf8Tag(ppszString, pszString, pszTag);
|
---|
| 709 | }
|
---|