VirtualBox

source: vbox/trunk/src/libs/xpcom18a4/xpcom/io/nsNativeCharsetUtils.cpp@ 4837

Last change on this file since 4837 was 1, checked in by vboxsync, 54 years ago

import

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.5 KB
Line 
1/* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
8 *
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
13 *
14 * The Original Code is Mozilla.
15 *
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 2002
19 * the Initial Developer. All Rights Reserved.
20 *
21 * Contributor(s):
22 * Darin Fisher <darin@netscape.com>
23 * Brian Stell <bstell@ix.netcom.com>
24 * Frank Tang <ftang@netscape.com>
25 * Brendan Eich <brendan@mozilla.org>
26 * Sergei Dolgov <sergei_d@fi.fi.tartu.ee>
27 *
28 * Alternatively, the contents of this file may be used under the terms of
29 * either the GNU General Public License Version 2 or later (the "GPL"), or
30 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31 * in which case the provisions of the GPL or the LGPL are applicable instead
32 * of those above. If you wish to allow use of your version of this file only
33 * under the terms of either the GPL or the LGPL, and not to allow others to
34 * use your version of this file under the terms of the MPL, indicate your
35 * decision by deleting the provisions above and replace them with the notice
36 * and other provisions required by the GPL or the LGPL. If you do not delete
37 * the provisions above, a recipient may use your version of this file under
38 * the terms of any one of the MPL, the GPL or the LGPL.
39 *
40 * ***** END LICENSE BLOCK ***** */
41
42#include "xpcom-private.h"
43
44//-----------------------------------------------------------------------------
45// XP_UNIX
46//-----------------------------------------------------------------------------
47#if defined(XP_UNIX)
48
49#include <stdlib.h> // mbtowc, wctomb
50#include <locale.h> // setlocale
51#include "nscore.h"
52#include "prlock.h"
53#include "nsAString.h"
54#include "nsReadableUtils.h"
55
56//
57// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
58// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
59// or not (see bug 206811 and
60// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
61// iconv for all platforms where nl_types.h and langinfo.h are present
62// along with iconv.
63//
64#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
65#define USE_ICONV 1
66#else
67#define USE_STDCONV 1
68#endif
69
70static void
71isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
72{
73 while (*inputLeft && *outputLeft) {
74 **output = (unsigned char) **input;
75 (*input)++;
76 (*inputLeft)--;
77 (*output)++;
78 (*outputLeft)--;
79 }
80}
81
82static void
83utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
84{
85 while (*inputLeft && *outputLeft) {
86 **output = (unsigned char) **input;
87 (*input)++;
88 (*inputLeft)--;
89 (*output)++;
90 (*outputLeft)--;
91 }
92}
93
94//-----------------------------------------------------------------------------
95// conversion using iconv
96//-----------------------------------------------------------------------------
97#if defined(USE_ICONV)
98#include <nl_types.h> // CODESET
99#include <langinfo.h> // nl_langinfo
100#include <iconv.h> // iconv_open, iconv, iconv_close
101#include <errno.h>
102
103#if defined(HAVE_ICONV_WITH_CONST_INPUT)
104#define ICONV_INPUT(x) (x)
105#else
106#define ICONV_INPUT(x) ((char **)x)
107#endif
108
109// solaris definitely needs this, but we'll enable it by default
110// just in case... but we know for sure that iconv(3) in glibc
111// doesn't need this.
112#if !defined(__GLIBC__)
113#define ENABLE_UTF8_FALLBACK_SUPPORT
114#endif
115
116#define INVALID_ICONV_T ((iconv_t) -1)
117
118static inline size_t
119xp_iconv(iconv_t converter,
120 const char **input,
121 size_t *inputLeft,
122 char **output,
123 size_t *outputLeft)
124{
125 size_t res, outputAvail = outputLeft ? *outputLeft : 0;
126 res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
127 if (res == (size_t) -1) {
128 // on some platforms (e.g., linux) iconv will fail with
129 // E2BIG if it cannot convert _all_ of its input. it'll
130 // still adjust all of the in/out params correctly, so we
131 // can ignore this error. the assumption is that we will
132 // be called again to complete the conversion.
133 if ((errno == E2BIG) && (*outputLeft < outputAvail))
134 res = 0;
135 }
136 return res;
137}
138
139static inline void
140xp_iconv_reset(iconv_t converter)
141{
142 // NOTE: the man pages on Solaris claim that you can pass NULL
143 // for all parameter to reset the converter, but beware the
144 // evil Solaris crash if you go down this route >:-)
145
146 const char *zero_char_in_ptr = NULL;
147 char *zero_char_out_ptr = NULL;
148 size_t zero_size_in = 0,
149 zero_size_out = 0;
150
151 xp_iconv(converter, &zero_char_in_ptr,
152 &zero_size_in,
153 &zero_char_out_ptr,
154 &zero_size_out);
155}
156
157static inline iconv_t
158xp_iconv_open(const char **to_list, const char **from_list)
159{
160 iconv_t res;
161 const char **from_name;
162 const char **to_name;
163
164 // try all possible combinations to locate a converter.
165 to_name = to_list;
166 while (*to_name) {
167 if (**to_name) {
168 from_name = from_list;
169 while (*from_name) {
170 if (**from_name) {
171 res = iconv_open(*to_name, *from_name);
172 if (res != INVALID_ICONV_T)
173 return res;
174 }
175 from_name++;
176 }
177 }
178 to_name++;
179 }
180
181 return INVALID_ICONV_T;
182}
183
184/*
185 * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
186 * have to use UTF-16 with iconv(3) on platforms where it's supported.
187 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
188 * and implementations of iconv(3). On Tru64, it also depends on the environment
189 * variable. To avoid the trouble arising from byte-swapping
190 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
191 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
192 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
193 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
194 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
195 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
196 * can be done other than adding a note in the release notes. (bug 206811)
197 */
// NULL-terminated list of iconv(3) encoding names to try for PRUnichar
// data, in order of preference: the names that spell out the byte order
// selected by IS_LITTLE_ENDIAN come first, followed by the generic
// UTF-16 / UCS-2 spellings.
198static const char *UTF_16_NAMES[] = {
199#if defined(IS_LITTLE_ENDIAN)
200 "UTF-16LE",
201#if defined(__GLIBC__)
202 "UNICODELITTLE",
203#endif
204 "UCS-2LE",
205#else
206 "UTF-16BE",
207#if defined(__GLIBC__)
208 "UNICODEBIG",
209#endif
210 "UCS-2BE",
211#endif
212 "UTF-16",
213 "UCS-2",
214 "UCS2",
215 "UCS_2",
216 "ucs-2",
217 "ucs2",
218 "ucs_2",
219 NULL
220};
221
222#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// NULL-terminated list of spellings under which an iconv(3) implementation
// may know UTF-8; used for the two-step native <-> UTF-8 <-> UTF-16
// fallback converters.
223static const char *UTF_8_NAMES[] = {
224 "UTF-8",
225 "UTF8",
226 "UTF_8",
227 "utf-8",
228 "utf8",
229 "utf_8",
230 NULL
231};
232#endif
233
// NULL-terminated list of spellings for ISO-8859-1, used as the native
// charset when nl_langinfo(CODESET) yields nothing.  With glibc only the
// canonical spelling is listed.
234static const char *ISO_8859_1_NAMES[] = {
235 "ISO-8859-1",
236#if !defined(__GLIBC__)
237 "ISO8859-1",
238 "ISO88591",
239 "ISO_8859_1",
240 "ISO8859_1",
241 "iso-8859-1",
242 "iso8859-1",
243 "iso88591",
244 "iso_8859_1",
245 "iso8859_1",
246#endif
247 NULL
248};
249
// iconv-based converter between the native charset and UTF-16.
//
// All state is static and shared.  Constructing an instance takes the
// global lock (and performs the lazy initialization on first use); the
// destructor resets the converters and releases the lock, so an instance
// acts as a scoped guard around a conversion.
250class nsNativeCharsetConverter
251{
252public:
253 nsNativeCharsetConverter();
254 ~nsNativeCharsetConverter();
255
 // Both conversion routines advance |*input| / |*output| and decrement
 // |*inputLeft| / |*outputLeft| (counted in elements of the respective
 // buffer) to reflect how much was processed.
256 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
257 PRUnichar **output, PRUint32 *outputLeft);
258 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
259 char **output, PRUint32 *outputLeft);
260
 // GlobalInit creates the lock; GlobalShutdown destroys it and closes
 // every open converter.
261 static void GlobalInit();
262 static void GlobalShutdown();
263
264private:
 // direct converters; INVALID_ICONV_T when unavailable
265 static iconv_t gNativeToUnicode;
266 static iconv_t gUnicodeToNative;
267#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 // two-step fallback converters going through UTF-8
268 static iconv_t gNativeToUTF8;
269 static iconv_t gUTF8ToNative;
270 static iconv_t gUnicodeToUTF8;
271 static iconv_t gUTF8ToUnicode;
272#endif
273 static PRLock *gLock;
274 static PRBool gInitialized;
275
276 static void LazyInit();
277
 // no-ops when the lock was never created
278 static void Lock() { if (gLock) PR_Lock(gLock); }
279 static void Unlock() { if (gLock) PR_Unlock(gLock); }
280};
281
// Storage for the shared converter handles.  Everything starts out as
// "no converter"; LazyInit fills these in on first use.
282iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
283iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
284#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
285iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
286iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
287iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
288iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
289#endif
// created by GlobalInit, destroyed by GlobalShutdown
290PRLock *nsNativeCharsetConverter::gLock = nsnull;
// set by LazyInit, cleared by GlobalShutdown
291PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
292
293void
294nsNativeCharsetConverter::LazyInit()
295{
296 const char *blank_list[] = { "", NULL };
297 const char **native_charset_list = blank_list;
298 const char *native_charset = nl_langinfo(CODESET);
299 if (native_charset == nsnull) {
300 NS_ERROR("native charset is unknown");
301 // fallback to ISO-8859-1
302 native_charset_list = ISO_8859_1_NAMES;
303 }
304 else
305 native_charset_list[0] = native_charset;
306
307 gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
308 gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
309
310#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
311 if (gNativeToUnicode == INVALID_ICONV_T) {
312 gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
313 gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
314 NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
315 NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
316 }
317 if (gUnicodeToNative == INVALID_ICONV_T) {
318 gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
319 gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
320 NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
321 NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
322 }
323#else
324 NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
325 NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
326#endif
327
328 /*
329 * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
330 * prepend a byte order mark unicode character (BOM, u+FEFF) during
331 * the first use of the iconv converter. The same is the case of
332 * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
333 * However, we use 'UTF-16LE/BE' in both cases, instead so that we
334 * should be safe. But just in case...
335 *
336 * This dummy conversion gets rid of the BOMs and fixes bug 153562.
337 */
338 char dummy_input[1] = { ' ' };
339 char dummy_output[4];
340
341 if (gNativeToUnicode != INVALID_ICONV_T) {
342 const char *input = dummy_input;
343 size_t input_left = sizeof(dummy_input);
344 char *output = dummy_output;
345 size_t output_left = sizeof(dummy_output);
346
347 xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
348 }
349#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
350 if (gUTF8ToUnicode != INVALID_ICONV_T) {
351 const char *input = dummy_input;
352 size_t input_left = sizeof(dummy_input);
353 char *output = dummy_output;
354 size_t output_left = sizeof(dummy_output);
355
356 xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
357 }
358#endif
359
360 gInitialized = PR_TRUE;
361}
362
363void
364nsNativeCharsetConverter::GlobalInit()
365{
366 gLock = PR_NewLock();
367 NS_ASSERTION(gLock, "lock creation failed");
368}
369
370void
371nsNativeCharsetConverter::GlobalShutdown()
372{
373 if (gLock) {
374 PR_DestroyLock(gLock);
375 gLock = nsnull;
376 }
377
378 if (gNativeToUnicode != INVALID_ICONV_T) {
379 iconv_close(gNativeToUnicode);
380 gNativeToUnicode = INVALID_ICONV_T;
381 }
382
383 if (gUnicodeToNative != INVALID_ICONV_T) {
384 iconv_close(gUnicodeToNative);
385 gUnicodeToNative = INVALID_ICONV_T;
386 }
387
388#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
389 if (gNativeToUTF8 != INVALID_ICONV_T) {
390 iconv_close(gNativeToUTF8);
391 gNativeToUTF8 = INVALID_ICONV_T;
392 }
393 if (gUTF8ToNative != INVALID_ICONV_T) {
394 iconv_close(gUTF8ToNative);
395 gUTF8ToNative = INVALID_ICONV_T;
396 }
397 if (gUnicodeToUTF8 != INVALID_ICONV_T) {
398 iconv_close(gUnicodeToUTF8);
399 gUnicodeToUTF8 = INVALID_ICONV_T;
400 }
401 if (gUTF8ToUnicode != INVALID_ICONV_T) {
402 iconv_close(gUTF8ToUnicode);
403 gUTF8ToUnicode = INVALID_ICONV_T;
404 }
405#endif
406
407 gInitialized = PR_FALSE;
408}
409
410nsNativeCharsetConverter::nsNativeCharsetConverter()
411{
412 Lock();
413 if (!gInitialized)
414 LazyInit();
415}
416
417nsNativeCharsetConverter::~nsNativeCharsetConverter()
418{
419 // reset converters for next time
420 if (gNativeToUnicode != INVALID_ICONV_T)
421 xp_iconv_reset(gNativeToUnicode);
422 if (gUnicodeToNative != INVALID_ICONV_T)
423 xp_iconv_reset(gUnicodeToNative);
424#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
425 if (gNativeToUTF8 != INVALID_ICONV_T)
426 xp_iconv_reset(gNativeToUTF8);
427 if (gUTF8ToNative != INVALID_ICONV_T)
428 xp_iconv_reset(gUTF8ToNative);
429 if (gUnicodeToUTF8 != INVALID_ICONV_T)
430 xp_iconv_reset(gUnicodeToUTF8);
431 if (gUTF8ToUnicode != INVALID_ICONV_T)
432 xp_iconv_reset(gUTF8ToUnicode);
433#endif
434 Unlock();
435}
436
// Convert native-charset bytes to UTF-16.
//
// |*input| / |*inputLeft| describe the remaining native bytes and
// |*output| / |*outputLeft| the remaining room in PRUnichar units; all four
// are updated to reflect what was consumed and produced.  The direct
// converter is used when available, otherwise (if enabled) the two-step
// route through UTF-8.  If no converter is available, or a conversion
// fails, whatever input is left is widened byte-for-byte as ISO-8859-1.
// Always returns NS_OK.
437nsresult
438nsNativeCharsetConverter::NativeToUnicode(const char **input,
439 PRUint32 *inputLeft,
440 PRUnichar **output,
441 PRUint32 *outputLeft)
442{
443 size_t res = 0;
444 size_t inLeft = (size_t) *inputLeft;
 // iconv counts bytes, the caller counts PRUnichar units
445 size_t outLeft = (size_t) *outputLeft * 2;
446
447 if (gNativeToUnicode != INVALID_ICONV_T) {
448
449 res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
450
 // publish the counts before looking at |res|: the pointers were passed
 // to xp_iconv directly, so the fallback below has to see counts that
 // match them
451 *inputLeft = inLeft;
452 *outputLeft = outLeft / 2;
453 if (res != (size_t) -1)
454 return NS_OK;
455
456 NS_WARNING("conversion from native to utf-16 failed");
457
458 // reset converter
459 xp_iconv_reset(gNativeToUnicode);
460 }
461#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
462 else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
463 (gUTF8ToUnicode != INVALID_ICONV_T)) {
464 // convert first to UTF8, then from UTF8 to UCS2
 // work on a private cursor; |*input| is advanced once after the loop
465 const char *in = *input;
466
467 char ubuf[1024];
468
469 // we assume we're always called with enough space in |output|,
470 // so convert many chars at a time...
471 while (inLeft) {
472 char *p = ubuf;
473 size_t n = sizeof(ubuf);
474 res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
475 if (res == (size_t) -1) {
476 NS_ERROR("conversion from native to utf-8 failed");
477 break;
478 }
479 NS_ASSERTION(outLeft > 0, "bad assumption");
 // rewind to the start of the scratch buffer; |n| becomes the number
 // of UTF-8 bytes just produced
480 p = ubuf;
481 n = sizeof(ubuf) - n;
482 res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
483 if (res == (size_t) -1) {
484 NS_ERROR("conversion from utf-8 to utf-16 failed");
485 break;
486 }
487 }
488
 // advance the caller's input pointer by the number of bytes consumed
489 (*input) += (*inputLeft - inLeft);
490 *inputLeft = inLeft;
491 *outputLeft = outLeft / 2;
492
493 if (res != (size_t) -1)
494 return NS_OK;
495
496 // reset converters
497 xp_iconv_reset(gNativeToUTF8);
498 xp_iconv_reset(gUTF8ToUnicode);
499 }
500#endif
501
502 // fallback: zero-pad and hope for the best
503 // XXX This is lame and we have to do better.
504 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
505
506 return NS_OK;
507}
508
509nsresult
510nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
511 PRUint32 *inputLeft,
512 char **output,
513 PRUint32 *outputLeft)
514{
515 size_t res = 0;
516 size_t inLeft = (size_t) *inputLeft * 2;
517 size_t outLeft = (size_t) *outputLeft;
518
519 if (gUnicodeToNative != INVALID_ICONV_T) {
520 res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
521
522 if (res != (size_t) -1) {
523 *inputLeft = inLeft / 2;
524 *outputLeft = outLeft;
525 return NS_OK;
526 }
527
528 NS_ERROR("iconv failed");
529
530 // reset converter
531 xp_iconv_reset(gUnicodeToNative);
532 }
533#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
534 else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
535 (gUTF8ToNative != INVALID_ICONV_T)) {
536 const char *in = (const char *) *input;
537
538 char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
539
540 // convert one uchar at a time...
541 while (inLeft && outLeft) {
542 char *p = ubuf;
543 size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
544 res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
545 if (res == (size_t) -1) {
546 NS_ERROR("conversion from utf-16 to utf-8 failed");
547 break;
548 }
549 p = ubuf;
550 n = sizeof(ubuf) - n;
551 res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
552 if (res == (size_t) -1) {
553 if (errno == E2BIG) {
554 // not enough room for last uchar... back up and return.
555 in -= sizeof(PRUnichar);
556 res = 0;
557 }
558 else
559 NS_ERROR("conversion from utf-8 to native failed");
560 break;
561 }
562 inLeft -= sizeof(PRUnichar);
563 }
564
565 if (res != (size_t) -1) {
566 (*input) += (*inputLeft - inLeft/2);
567 *inputLeft = inLeft/2;
568 *outputLeft = outLeft;
569 return NS_OK;
570 }
571
572 // reset converters
573 xp_iconv_reset(gUnicodeToUTF8);
574 xp_iconv_reset(gUTF8ToNative);
575 }
576#endif
577
578 // fallback: truncate and hope for the best
579 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
580
581 return NS_OK;
582}
583
584#endif // USE_ICONV
585
586//-----------------------------------------------------------------------------
587// conversion using mb[r]towc/wc[r]tomb
588//-----------------------------------------------------------------------------
589#if defined(USE_STDCONV)
590#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
591#include <wchar.h> // mbrtowc, wcrtomb
592#endif
593
// Converter between the native charset and UTF-16 built on the C library's
// mb[r]towc / wc[r]tomb.  Used when the iconv-based implementation is not
// selected (USE_STDCONV).
594class nsNativeCharsetConverter
595{
596public:
597 nsNativeCharsetConverter();
598
 // Both conversion routines advance |*input| / |*output| and decrement
 // |*inputLeft| / |*outputLeft| to reflect how much was processed.
599 nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
600 PRUnichar **output, PRUint32 *outputLeft);
601 nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
602 char **output, PRUint32 *outputLeft);
603
 // GlobalInit probes whether wchar_t holds unicode values; there is
 // nothing to release at shutdown.
604 static void GlobalInit();
605 static void GlobalShutdown() { }
606
607private:
 // result of the GlobalInit probe; when false both conversions fall
 // back to zero-pad / truncate
608 static PRBool gWCharIsUnicode;
609
610#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
 // per-instance shift state for the restartable conversion functions
611 mbstate_t ps;
612#endif
613};
614
// Pessimistic default; GlobalInit sets it from a runtime probe.
615PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
616
617nsNativeCharsetConverter::nsNativeCharsetConverter()
618{
619#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
620 memset(&ps, 0, sizeof(ps));
621#endif
622}
623
624void
625nsNativeCharsetConverter::GlobalInit()
626{
627 // verify that wchar_t for the current locale is actually unicode.
628 // if it is not, then we should avoid calling mbtowc/wctomb and
629 // just fallback on zero-pad/truncation conversion.
630 //
631 // this test cannot be done at build time because the encoding of
632 // wchar_t may depend on the runtime locale. sad, but true!!
633 //
634 // so, if wchar_t is unicode then converting an ASCII character
635 // to wchar_t should not change its numeric value. we'll just
636 // check what happens with the ASCII 'a' character.
637 //
638 // this test is not perfect... obviously, it could yield false
639 // positives, but then at least ASCII text would be converted
640 // properly (or maybe just the 'a' character) -- oh well :(
641
642 char a = 'a';
643 unsigned int w = 0;
644
645#ifndef L4ENV
646 int res = mbtowc((wchar_t *) &w, &a, 1);
647
648 gWCharIsUnicode = (res != -1 && w == 'a');
649#else
650 gWCharIsUnicode = 0;
651#endif
652
653#ifdef DEBUG
654 if (!gWCharIsUnicode)
655 NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
656#endif
657}
658
659nsresult
660nsNativeCharsetConverter::NativeToUnicode(const char **input,
661 PRUint32 *inputLeft,
662 PRUnichar **output,
663 PRUint32 *outputLeft)
664{
665 if (gWCharIsUnicode) {
666#ifndef L4ENV
667 /* We don't have any wchar support built into uclibc just now */
668 int incr;
669
670 // cannot use wchar_t here since it may have been redefined (e.g.,
671 // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
672 unsigned int tmp = 0;
673 while (*inputLeft && *outputLeft) {
674#ifdef HAVE_MBRTOWC
675 incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
676#else
677 // XXX is this thread-safe?
678 incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
679#endif
680 if (incr < 0) {
681 NS_WARNING("mbtowc failed: possible charset mismatch");
682 // zero-pad and hope for the best
683 tmp = (unsigned char) **input;
684 incr = 1;
685 }
686 **output = (PRUnichar) tmp;
687 (*input) += incr;
688 (*inputLeft) -= incr;
689 (*output)++;
690 (*outputLeft)--;
691 }
692#endif /* not defined L4ENV */
693 }
694 else {
695 // wchar_t isn't unicode, so the best we can do is treat the
696 // input as if it is isolatin1 :(
697 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
698 }
699
700 return NS_OK;
701}
702
703nsresult
704nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
705 PRUint32 *inputLeft,
706 char **output,
707 PRUint32 *outputLeft)
708{
709 if (gWCharIsUnicode) {
710#ifndef L4ENV
711 /* We don't have any wchar support built into uclibc just now */
712 int incr;
713
714 while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
715#ifdef HAVE_WCRTOMB
716 incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
717#else
718 // XXX is this thread-safe?
719 incr = (int) wctomb(*output, (wchar_t) **input);
720#endif
721 if (incr < 0) {
722 NS_WARNING("mbtowc failed: possible charset mismatch");
723 **output = (unsigned char) **input; // truncate
724 incr = 1;
725 }
726 // most likely we're dead anyways if this assertion should fire
727 NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
728 (*output) += incr;
729 (*outputLeft) -= incr;
730 (*input)++;
731 (*inputLeft)--;
732 }
733#endif /* not defined L4ENV */
734 }
735 else {
736 // wchar_t isn't unicode, so the best we can do is treat the
737 // input as if it is isolatin1 :(
738 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
739 }
740
741 return NS_OK;
742}
743
744#endif // USE_STDCONV
745
746//-----------------------------------------------------------------------------
747// API implementation
748//-----------------------------------------------------------------------------
749
750NS_COM nsresult
751NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
752{
753 output.Truncate();
754
755 PRUint32 inputLen = input.Length();
756
757 nsACString::const_iterator iter;
758 input.BeginReading(iter);
759
760 //
761 // OPTIMIZATION: preallocate space for largest possible result; convert
762 // directly into the result buffer to avoid intermediate buffer copy.
763 //
764 // this will generally result in a larger allocation, but that seems
765 // better than an extra buffer copy.
766 //
767 output.SetLength(inputLen);
768 nsAString::iterator out_iter;
769 output.BeginWriting(out_iter);
770
771 PRUnichar *result = out_iter.get();
772 PRUint32 resultLeft = inputLen;
773
774 const char *buf = iter.get();
775 PRUint32 bufLeft = inputLen;
776
777 nsNativeCharsetConverter conv;
778 nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
779 if (NS_SUCCEEDED(rv)) {
780 NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
781 output.SetLength(inputLen - resultLeft);
782 }
783 return rv;
784}
785
786NS_COM nsresult
787NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
788{
789 output.Truncate();
790
791 nsAString::const_iterator iter, end;
792 input.BeginReading(iter);
793 input.EndReading(end);
794
795 // cannot easily avoid intermediate buffer copy.
796 char temp[4096];
797
798 nsNativeCharsetConverter conv;
799
800 const PRUnichar *buf = iter.get();
801 PRUint32 bufLeft = Distance(iter, end);
802 while (bufLeft) {
803 char *p = temp;
804 PRUint32 tempLeft = sizeof(temp);
805
806 nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
807 if (NS_FAILED(rv)) return rv;
808
809 if (tempLeft < sizeof(temp))
810 output.Append(temp, sizeof(temp) - tempLeft);
811 }
812 return NS_OK;
813}
814
815void
816NS_StartupNativeCharsetUtils()
817{
818 //
819 // need to initialize the locale or else charset conversion will fail.
820 // better not delay this in case some other component alters the locale
821 // settings.
822 //
823 // XXX we assume that we are called early enough that we should
824 // always be the first to care about the locale's charset.
825 //
826 setlocale(LC_CTYPE, "");
827
828 nsNativeCharsetConverter::GlobalInit();
829}
830
831void
832NS_ShutdownNativeCharsetUtils()
833{
834 nsNativeCharsetConverter::GlobalShutdown();
835}
836
837//-----------------------------------------------------------------------------
838// XP_BEOS
839//-----------------------------------------------------------------------------
840#elif defined(XP_BEOS)
841
842#include "nsAString.h"
843#include "nsReadableUtils.h"
844#include "nsString.h"
845
846NS_COM nsresult
847NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
848{
849 CopyUTF8toUTF16(input, output);
850 return NS_OK;
851}
852
853NS_COM nsresult
854NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
855{
856 CopyUTF16toUTF8(input, output);
857 return NS_OK;
858}
859
// BeOS: the conversion routines keep no global state.
void
NS_StartupNativeCharsetUtils()
{
    // nothing to set up
}
864
// BeOS: the conversion routines keep no global state.
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to tear down
}
869
870//-----------------------------------------------------------------------------
871// XP_WIN
872//-----------------------------------------------------------------------------
873#elif defined(XP_WIN)
874
875#include <windows.h>
876#include "nsAString.h"
877
878NS_COM nsresult
879NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
880{
881 PRUint32 inputLen = input.Length();
882
883 nsACString::const_iterator iter;
884 input.BeginReading(iter);
885
886 const char *buf = iter.get();
887
888 // determine length of result
889 PRUint32 resultLen = 0;
890 int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, NULL, 0);
891 if (n > 0)
892 resultLen += n;
893
894 // allocate sufficient space
895 output.SetLength(resultLen);
896 if (resultLen > 0) {
897 nsAString::iterator out_iter;
898 output.BeginWriting(out_iter);
899
900 PRUnichar *result = out_iter.get();
901
902 ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
903 }
904 return NS_OK;
905}
906
907NS_COM nsresult
908NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
909{
910 PRUint32 inputLen = input.Length();
911
912 nsAString::const_iterator iter;
913 input.BeginReading(iter);
914
915 const PRUnichar *buf = iter.get();
916
917 // determine length of result
918 PRUint32 resultLen = 0;
919
920 int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, NULL, 0, NULL, NULL);
921 if (n > 0)
922 resultLen += n;
923
924 // allocate sufficient space
925 output.SetLength(resultLen);
926 if (resultLen > 0) {
927 nsACString::iterator out_iter;
928 output.BeginWriting(out_iter);
929
930 // default "defaultChar" is '?', which is an illegal character on windows
931 // file system. That will cause file uncreatable. Change it to '_'
932 const char defaultChar = '_';
933
934 char *result = out_iter.get();
935
936 ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
937 &defaultChar, NULL);
938 }
939 return NS_OK;
940}
941
// Windows: the conversion routines keep no global state.
void
NS_StartupNativeCharsetUtils()
{
    // nothing to set up
}
946
// Windows: the conversion routines keep no global state.
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to tear down
}
951
952//-----------------------------------------------------------------------------
953// XP_OS2
954//-----------------------------------------------------------------------------
955#elif defined(XP_OS2)
956
957#define INCL_DOS
958#include <os2.h>
959#include <uconv.h>
960#include "nsAString.h"
961#include <ulserrno.h>
962#include "nsNativeCharsetUtils.h"
963
// Conversion object shared by the OS/2 routines below; created by
// NS_StartupNativeCharsetUtils, which the copy routines call themselves
// while this is still NULL.
964static UconvObject UnicodeConverter = NULL;
965
966NS_COM nsresult
967NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
968{
969 PRUint32 inputLen = input.Length();
970
971 nsACString::const_iterator iter;
972 input.BeginReading(iter);
973 const char *inputStr = iter.get();
974
975 // determine length of result
976 PRUint32 resultLen = inputLen;
977 output.SetLength(resultLen);
978
979 nsAString::iterator out_iter;
980 output.BeginWriting(out_iter);
981 UniChar *result = (UniChar*)out_iter.get();
982
983 size_t cSubs = 0;
984 size_t resultLeft = resultLen;
985
986 if (!UnicodeConverter)
987 NS_StartupNativeCharsetUtils();
988
989 int unirc = ::UniUconvToUcs(UnicodeConverter, (void**)&inputStr, &inputLen,
990 &result, &resultLeft, &cSubs);
991
992 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
993
994 if (unirc != ULS_SUCCESS) {
995 output.Truncate();
996 return NS_ERROR_FAILURE;
997 }
998
999 // Need to update string length to reflect how many bytes were actually
1000 // written.
1001 output.Truncate(resultLen - resultLeft);
1002 return NS_OK;
1003}
1004
1005NS_COM nsresult
1006NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1007{
1008 size_t inputLen = input.Length();
1009
1010 nsAString::const_iterator iter;
1011 input.BeginReading(iter);
1012 UniChar* inputStr = (UniChar*) NS_CONST_CAST(PRUnichar*, iter.get());
1013
1014 // maximum length of unicode string of length x converted to native
1015 // codepage is x*2
1016 size_t resultLen = inputLen * 2;
1017 output.SetLength(resultLen);
1018
1019 nsACString::iterator out_iter;
1020 output.BeginWriting(out_iter);
1021 char *result = out_iter.get();
1022
1023 size_t cSubs = 0;
1024 size_t resultLeft = resultLen;
1025
1026 if (!UnicodeConverter)
1027 NS_StartupNativeCharsetUtils();
1028
1029 int unirc = ::UniUconvFromUcs(UnicodeConverter, &inputStr, &inputLen,
1030 (void**)&result, &resultLeft, &cSubs);
1031
1032 NS_ASSERTION(unirc != UCONV_E2BIG, "Path too big");
1033
1034 if (unirc != ULS_SUCCESS) {
1035 output.Truncate();
1036 return NS_ERROR_FAILURE;
1037 }
1038
1039 // Need to update string length to reflect how many bytes were actually
1040 // written.
1041 output.Truncate(resultLen - resultLeft);
1042 return NS_OK;
1043}
1044
1045void
1046NS_StartupNativeCharsetUtils()
1047{
1048 ULONG ulLength;
1049 ULONG ulCodePage;
1050 DosQueryCp(sizeof(ULONG), &ulCodePage, &ulLength);
1051
1052 UniChar codepage[20];
1053 int unirc = ::UniMapCpToUcsCp(ulCodePage, codepage, 20);
1054 if (unirc == ULS_SUCCESS) {
1055 unirc = ::UniCreateUconvObject(codepage, &UnicodeConverter);
1056 if (unirc == ULS_SUCCESS) {
1057 uconv_attribute_t attr;
1058 ::UniQueryUconvObject(UnicodeConverter, &attr, sizeof(uconv_attribute_t),
1059 NULL, NULL, NULL);
1060 attr.options = UCONV_OPTION_SUBSTITUTE_BOTH;
1061 attr.subchar_len=1;
1062 attr.subchar[0]='_';
1063 ::UniSetUconvObject(UnicodeConverter, &attr);
1064 }
1065 }
1066}
1067
1068void
1069NS_ShutdownNativeCharsetUtils()
1070{
1071 ::UniFreeUconvObject(UnicodeConverter);
1072}
1073
1074//-----------------------------------------------------------------------------
1075// XP_MAC
1076//-----------------------------------------------------------------------------
1077#elif defined(XP_MAC)
1078
1079#include <UnicodeConverter.h>
1080#include <TextCommon.h>
1081#include <Script.h>
1082#include <MacErrors.h>
1083#include "nsAString.h"
1084
1085class nsFSStringConversionMac {
1086public:
1087 static nsresult UCSToFS(const nsAString& aIn, nsACString& aOut);
1088 static nsresult FSToUCS(const nsACString& ain, nsAString& aOut);
1089
1090 static void CleanUp();
1091
1092private:
1093 static TextEncoding GetSystemEncoding();
1094 static nsresult PrepareEncoder();
1095 static nsresult PrepareDecoder();
1096
1097 static UnicodeToTextInfo sEncoderInfo;
1098 static TextToUnicodeInfo sDecoderInfo;
1099};
1100
1101UnicodeToTextInfo nsFSStringConversionMac::sEncoderInfo = nsnull;
1102TextToUnicodeInfo nsFSStringConversionMac::sDecoderInfo = nsnull;
1103
1104nsresult nsFSStringConversionMac::UCSToFS(const nsAString& aIn, nsACString& aOut)
1105{
1106 nsresult rv = PrepareEncoder();
1107 if (NS_FAILED(rv)) return rv;
1108
1109 OSStatus err = noErr;
1110 char stackBuffer[512];
1111
1112 aOut.Truncate();
1113
1114 // for each chunk of |aIn|...
1115 nsReadingIterator<PRUnichar> iter;
1116 aIn.BeginReading(iter);
1117
1118 PRUint32 fragmentLength = PRUint32(iter.size_forward());
1119 UInt32 bytesLeft = fragmentLength * sizeof(UniChar);
1120
1121 do {
1122 UInt32 bytesRead = 0, bytesWritten = 0;
1123 err = ::ConvertFromUnicodeToText(sEncoderInfo,
1124 bytesLeft,
1125 (const UniChar*)iter.get(),
1126 kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask,
1127 0, nsnull, nsnull, nsnull,
1128 sizeof(stackBuffer),
1129 &bytesRead,
1130 &bytesWritten,
1131 stackBuffer);
1132 if (err == kTECUsedFallbacksStatus)
1133 err = noErr;
1134 else if (err == kTECOutputBufferFullStatus) {
1135 bytesLeft -= bytesRead;
1136 iter.advance(bytesRead / sizeof(UniChar));
1137 }
1138 aOut.Append(stackBuffer, bytesWritten);
1139 }
1140 while (err == kTECOutputBufferFullStatus);
1141
1142 return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1143}
1144
1145nsresult nsFSStringConversionMac::FSToUCS(const nsACString& aIn, nsAString& aOut)
1146{
1147 nsresult rv = PrepareDecoder();
1148 if (NS_FAILED(rv)) return rv;
1149
1150 OSStatus err = noErr;
1151 UniChar stackBuffer[512];
1152
1153 aOut.Truncate(0);
1154
1155 // for each chunk of |aIn|...
1156 nsReadingIterator<char> iter;
1157 aIn.BeginReading(iter);
1158
1159 PRUint32 fragmentLength = PRUint32(iter.size_forward());
1160 UInt32 bytesLeft = fragmentLength;
1161
1162 do {
1163 UInt32 bytesRead = 0, bytesWritten = 0;
1164 err = ::ConvertFromTextToUnicode(sDecoderInfo,
1165 bytesLeft,
1166 iter.get(),
1167 kUnicodeUseFallbacksMask | kUnicodeLooseMappingsMask,
1168 0, nsnull, nsnull, nsnull,
1169 sizeof(stackBuffer),
1170 &bytesRead,
1171 &bytesWritten,
1172 stackBuffer);
1173 if (err == kTECUsedFallbacksStatus)
1174 err = noErr;
1175 else if (err == kTECOutputBufferFullStatus) {
1176 bytesLeft -= bytesRead;
1177 iter.advance(bytesRead);
1178 }
1179 aOut.Append((PRUnichar *)stackBuffer, bytesWritten / sizeof(PRUnichar));
1180 }
1181 while (err == kTECOutputBufferFullStatus);
1182
1183 return (err == noErr) ? NS_OK : NS_ERROR_FAILURE;
1184}
1185
1186void nsFSStringConversionMac::CleanUp()
1187{
1188 if (sDecoderInfo) {
1189 ::DisposeTextToUnicodeInfo(&sDecoderInfo);
1190 sDecoderInfo = nsnull;
1191 }
1192 if (sEncoderInfo) {
1193 ::DisposeUnicodeToTextInfo(&sEncoderInfo);
1194 sEncoderInfo = nsnull;
1195 }
1196}
1197
1198TextEncoding nsFSStringConversionMac::GetSystemEncoding()
1199{
1200 OSStatus err;
1201 TextEncoding theEncoding;
1202
1203 err = ::UpgradeScriptInfoToTextEncoding(smSystemScript, kTextLanguageDontCare,
1204 kTextRegionDontCare, NULL, &theEncoding);
1205
1206 if (err != noErr)
1207 theEncoding = kTextEncodingMacRoman;
1208
1209 return theEncoding;
1210}
1211
1212nsresult nsFSStringConversionMac::PrepareEncoder()
1213{
1214 nsresult rv = NS_OK;
1215 if (!sEncoderInfo) {
1216 OSStatus err;
1217 err = ::CreateUnicodeToTextInfoByEncoding(GetSystemEncoding(), &sEncoderInfo);
1218 if (err)
1219 rv = NS_ERROR_FAILURE;
1220 }
1221 return rv;
1222}
1223
1224nsresult nsFSStringConversionMac::PrepareDecoder()
1225{
1226 nsresult rv = NS_OK;
1227 if (!sDecoderInfo) {
1228 OSStatus err;
1229 err = ::CreateTextToUnicodeInfoByEncoding(GetSystemEncoding(), &sDecoderInfo);
1230 if (err)
1231 rv = NS_ERROR_FAILURE;
1232 }
1233 return rv;
1234}
1235
1236NS_COM nsresult
1237NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1238{
1239 return nsFSStringConversionMac::FSToUCS(input, output);
1240}
1241
1242NS_COM nsresult
1243NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1244{
1245 return nsFSStringConversionMac::UCSToFS(input, output);
1246}
1247
/**
 * XP_MAC: intentionally empty; this function performs no start-up work.
 */
void
NS_StartupNativeCharsetUtils()
{
    // Nothing to do.
}
1252
1253void
1254NS_ShutdownNativeCharsetUtils()
1255{
1256 nsFSStringConversionMac::CleanUp();
1257}
1258
1259//-----------------------------------------------------------------------------
1260// default : truncate/zeropad
1261//-----------------------------------------------------------------------------
1262#else
1263
1264#include "nsReadableUtils.h"
1265
1266NS_COM nsresult
1267NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
1268{
1269 CopyASCIItoUCS2(input, output);
1270 return NS_OK;
1271}
1272
1273NS_COM nsresult
1274NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
1275{
1276 CopyUCS2toASCII(input, output);
1277 return NS_OK;
1278}
1279
/**
 * Default platform: intentionally empty; there is no converter to set up.
 */
void
NS_StartupNativeCharsetUtils()
{
    // Nothing to do.
}
1284
/**
 * Default platform: intentionally empty; there is no converter to release.
 */
void
NS_ShutdownNativeCharsetUtils()
{
    // Nothing to do.
}
1289
1290#endif
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
Contact · Privacy policy · Terms of Use