utf16.h

Last change on this file was 106061, checked in by vboxsync, 8 months ago
Copyright year updates by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 64.9 KB

Line
1	/** @file
2	* IPRT - String Manipulation, UTF-16 encoding.
3	*/
4
5	/*
6	* Copyright (C) 2006-2024 Oracle and/or its affiliates.
7	*
8	* This file is part of VirtualBox base platform packages, as
9	* available from https://www.virtualbox.org.
10	*
11	* This program is free software; you can redistribute it and/or
12	* modify it under the terms of the GNU General Public License
13	* as published by the Free Software Foundation, in version 3 of the
14	* License.
15	*
16	* This program is distributed in the hope that it will be useful, but
17	* WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19	* General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, see <https://www.gnu.org/licenses>.
23	*
24	* The contents of this file may alternatively be used under the terms
25	* of the Common Development and Distribution License Version 1.0
26	* (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27	* in the VirtualBox distribution, in which case the provisions of the
28	* CDDL are applicable instead of those of the GPL.
29	*
30	* You may elect to license modified versions of this file under the
31	* terms and conditions of either the GPL or the CDDL or both.
32	*
33	* SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34	*/
35
36	#ifndef IPRT_INCLUDED_utf16_h
37	#define IPRT_INCLUDED_utf16_h
38	#ifndef RT_WITHOUT_PRAGMA_ONCE
39	# pragma once
40	#endif
41
42	#include <iprt/string.h>
43
44	RT_C_DECLS_BEGIN
45
46
47	/** @defgroup rt_str_utf16 UTF-16 String Manipulation
48	* @ingroup grp_rt_str
49	* @{
50	*/
51
52	/**
53	* Allocates memory for UTF-16 string storage (default tag).
54	*
55	* You should normally not use this function, except if there is some very
56	* custom string handling you need doing that isn't covered by any of the other
57	* APIs.
58	*
59	* @returns Pointer to the allocated UTF-16 string. The first wide char is
60	* always set to the string terminator char, the contents of the
61	* remainder of the memory is undefined. The string must be freed by
62	* calling RTUtf16Free.
63	*
64	* NULL is returned if the allocation failed. Please translate this to
65	* VERR_NO_UTF16_MEMORY and not VERR_NO_MEMORY. Also consider
66	* RTUtf16AllocEx if an IPRT status code is required.
67	*
68	* @param cb How many bytes to allocate, will be rounded up
69	* to a multiple of two. If this is zero, we will
70	* allocate a terminator wide char anyway.
71	*/
72	#define RTUtf16Alloc(cb) RTUtf16AllocTag((cb), RTSTR_TAG)
73
74	/**
75	* Allocates memory for UTF-16 string storage (custom tag).
76	*
77	* You should normally not use this function, except if there is some very
78	* custom string handling you need doing that isn't covered by any of the other
79	* APIs.
80	*
81	* @returns Pointer to the allocated UTF-16 string. The first wide char is
82	* always set to the string terminator char, the contents of the
83	* remainder of the memory is undefined. The string must be freed by
84	* calling RTUtf16Free.
85	*
86	* NULL is returned if the allocation failed. Please translate this to
87	* VERR_NO_UTF16_MEMORY and not VERR_NO_MEMORY. Also consider
88	* RTUtf16AllocExTag if an IPRT status code is required.
89	*
90	* @param cb How many bytes to allocate, will be rounded up
91	* to a multiple of two. If this is zero, we will
92	* allocate a terminator wide char anyway.
93	* @param pszTag Allocation tag used for statistics and such.
94	*/
95	RTDECL(PRTUTF16) RTUtf16AllocTag(size_t cb, const char *pszTag);
96
97	/**
98	* Reallocates the specified UTF-16 string (default tag).
99	*
100	* You should normally not use this function, except if there is some very
101	* custom string handling you need doing that isn't covered by any of the other
102	* APIs.
103	*
104	* @returns VINF_SUCCESS.
105	* @retval VERR_NO_UTF16_MEMORY if we failed to reallocate the string, @a
106	* *ppwsz remains unchanged.
107	*
108	* @param ppwsz Pointer to the string variable containing the
109	* input and output string.
110	*
111	* When not freeing the string, the result will
112	* always have the last RTUTF16 set to the
113	* terminator character so that when used for
114	* string truncation the result will be a valid
115	* C-style string (your job to keep it a valid
116	* UTF-16 string).
117	*
118	* When the input string is NULL and we're supposed
119	* to reallocate, the returned string will also
120	* have the first RTUTF16 set to the terminator
121	* char so it will be a valid C-style string.
122	*
123	* @param cbNew When @a cbNew is zero, we'll behave like
124	* RTUtf16Free and @a *ppwsz will be set to NULL.
125	*
126	* When not zero, this will be rounded up to a
127	* multiple of two, and used as the new size of the
128	* memory backing the string, i.e. it includes the
129	* terminator (RTUTF16) char.
130	*/
131	#define RTUtf16Realloc(ppwsz, cbNew) RTUtf16ReallocTag((ppwsz), (cbNew), RTSTR_TAG)
132
133	/**
134	* Reallocates the specified UTF-16 string (custom tag).
135	*
136	* You should normally not use this function, except if there is some very
137	* custom string handling you need doing that isn't covered by any of the other
138	* APIs.
139	*
140	* @retval VINF_SUCCESS on success.
141	* @retval VERR_NO_UTF16_MEMORY if we failed to reallocate the string, @a
142	* *ppwsz remains unchanged.
143	*
144	* @param ppwsz Pointer to the string variable containing the
145	* input and output string.
146	*
147	* When not freeing the string, the result will
148	* always have the last RTUTF16 set to the
149	* terminator character so that when used for
150	* string truncation the result will be a valid
151	* C-style string (your job to keep it a valid
152	* UTF-16 string).
153	*
154	* When the input string is NULL and we're supposed
155	* to reallocate, the returned string will also
156	* have the first RTUTF16 set to the terminator
157	* char so it will be a valid C-style string.
158	*
159	* @param cbNew When @a cbNew is zero, we'll behave like
160	* RTUtf16Free and @a *ppwsz will be set to NULL.
161	*
162	* When not zero, this will be rounded up to a
163	* multiple of two, and used as the new size of the
164	* memory backing the string, i.e. it includes the
165	* terminator (RTUTF16) char.
166	* @param pszTag Allocation tag used for statistics and such.
167	*/
168	RTDECL(int) RTUtf16ReallocTag(PRTUTF16 *ppwsz, size_t cbNew, const char *pszTag);
169
170	/**
171	* Free a UTF-16 string allocated by RTStrToUtf16(), RTStrToUtf16Ex(),
172	* RTLatin1ToUtf16(), RTLatin1ToUtf16Ex(), RTUtf16Dup() or RTUtf16DupEx().
173	*
174	* @param pwszString The UTF-16 string to free. NULL is accepted.
175	*/
176	RTDECL(void) RTUtf16Free(PRTUTF16 pwszString);
177
178	/**
179	* Allocates a new copy of the specified UTF-16 string (default tag).
180	*
181	* @returns Pointer to the allocated string copy. Use RTUtf16Free() to free it.
182	* @returns NULL when out of memory.
183	* @param pwszString UTF-16 string to duplicate.
184	* @remark This function will not make any attempt to validate the encoding.
185	*/
186	#define RTUtf16Dup(pwszString) RTUtf16DupTag((pwszString), RTSTR_TAG)
187
188	/**
189	* Allocates a new copy of the specified UTF-16 string (custom tag).
190	*
191	* @returns Pointer to the allocated string copy. Use RTUtf16Free() to free it.
192	* @returns NULL when out of memory.
193	* @param pwszString UTF-16 string to duplicate.
194	* @param pszTag Allocation tag used for statistics and such.
195	* @remark This function will not make any attempt to validate the encoding.
196	*/
197	RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag);
198
199	/**
200	* Allocates a new copy of the specified UTF-16 string (default tag).
201	*
202	* @returns iprt status code.
203	* @param ppwszString Receives pointer of the allocated UTF-16 string.
204	* The returned pointer must be freed using RTUtf16Free().
205	* @param pwszString UTF-16 string to duplicate.
206	* @param cwcExtra Number of extra RTUTF16 items to allocate.
207	* @remark This function will not make any attempt to validate the encoding.
208	*/
209	#define RTUtf16DupEx(ppwszString, pwszString, cwcExtra) \
210	RTUtf16DupExTag((ppwszString), (pwszString), (cwcExtra), RTSTR_TAG)
211
212	/**
213	* Allocates a new copy of the specified UTF-16 string (custom tag).
214	*
215	* @returns iprt status code.
216	* @param ppwszString Receives the pointer to the allocated UTF-16 string.
217	* The returned pointer must be freed using RTUtf16Free().
218	* @param pwszString UTF-16 string to duplicate.
219	* @param cwcExtra Number of extra RTUTF16 items to allocate.
220	* @param pszTag Allocation tag used for statistics and such.
221	* @remark This function will not make any attempt to validate the encoding.
222	*/
223	RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag);
224
225	/**
226	* Returns the length of a UTF-16 string in UTF-16 characters
227	* without trailing '\\0'.
228	*
229	* Surrogate pairs count as two UTF-16 characters here. Use RTUtf16CpCnt()
230	* to get the exact number of code points in the string.
231	*
232	* @returns The number of RTUTF16 items in the string.
233	* @param pwszString Pointer to the UTF-16 string.
234	* @remark This function will not make any attempt to validate the encoding.
235	*/
236	RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString);
237
238	/**
239	* Find the length of a zero-terminated UTF-16 string, given a max string length.
240	*
241	* @returns The string length or cwcMax. The returned length does not include
242	* the zero terminator if it was found.
243	*
244	* @param pwszString The string.
245	* @param cwcMax The max string length in RTUTF16s.
246	* @sa RTUtf16NLenEx, RTStrNLen.
247	*/
248	RTDECL(size_t) RTUtf16NLen(PCRTUTF16 pwszString, size_t cwcMax);
249
250	/**
251	* Find the length of a zero-terminated UTF-16 string, given
252	* a max string length.
253	*
254	* @returns IPRT status code.
255	* @retval VINF_SUCCESS if the string has a length less than cwcMax.
256	* @retval VERR_BUFFER_OVERFLOW if the end of the string wasn't found
257	* before cwcMax was reached.
258	*
259	* @param pwszString The string.
260	* @param cwcMax The max string length in RTUTF16s.
261	* @param pcwc Where to store the string length excluding the
262	* terminator. This is set to cwcMax if the terminator
263	* isn't found.
264	* @sa RTUtf16NLen, RTStrNLenEx.
265	*/
266	RTDECL(int) RTUtf16NLenEx(PCRTUTF16 pwszString, size_t cwcMax, size_t *pcwc);
267
268	/**
269	* Find the zero terminator in a string with a limited length.
270	*
271	* @returns Pointer to the zero terminator.
272	* @returns NULL if the zero terminator was not found.
273	*
274	* @param pwszString The string.
275	* @param cwcMax The max string length. RTSTR_MAX is fine.
276	*/
277	RTDECL(PCRTUTF16) RTUtf16End(PCRTUTF16 pwszString, size_t cwcMax);
278
279	/**
280	* Finds a given UTF-16 character in a UTF-16 string.
281	*
282	* @returns Pointer to the first occurrence of @a wc.
283	* @returns NULL if @a wc was not found.
284	*
285	* @param pwszString The string to search.
286	* @param wc The UTF-16 character to search for.
287	*/
288	RTDECL(PRTUTF16) RTUtf16Chr(PCRTUTF16 pwszString, RTUTF16 wc);
289
290	/**
291	* Strips blank spaces from both ends of the string.
292	*
293	* @returns Pointer to first non-blank char in the string.
294	* @param pwsz The string to strip.
295	*/
296	RTDECL(PRTUTF16) RTUtf16Strip(PRTUTF16 pwsz);
297
298	/**
299	* Strips blank spaces from the start of the string.
300	*
301	* @returns Pointer to first non-blank char in the string.
302	* @param pwsz The string to strip.
303	*/
304	RTDECL(PRTUTF16) RTUtf16StripL(PCRTUTF16 pwsz);
305
306	/**
307	* Strips blank spaces from the end of the string.
308	*
309	* @returns pwsz.
310	* @param pwsz The string to strip.
311	*/
312	RTDECL(PRTUTF16) RTUtf16StripR(PRTUTF16 pwsz);
313
314	/**
315	* String copy with overflow handling.
316	*
317	* @retval VINF_SUCCESS on success.
318	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
319	* buffer will contain as much of the string as it can hold, fully
320	* terminated.
321	*
322	* @param pwszDst The destination buffer.
323	* @param cwcDst The size of the destination buffer in RTUTF16s.
324	* @param pwszSrc The source string. NULL is not OK.
325	*/
326	RTDECL(int) RTUtf16Copy(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc);
327
328	/**
329	* String copy with overflow handling, ASCII source.
330	*
331	* @retval VINF_SUCCESS on success.
332	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
333	* buffer will contain as much of the string as it can hold, fully
334	* terminated.
335	*
336	* @param pwszDst The destination buffer.
337	* @param cwcDst The size of the destination buffer in RTUTF16s.
338	* @param pszSrc The source string, pure ASCII. NULL is not OK.
339	*/
340	RTDECL(int) RTUtf16CopyAscii(PRTUTF16 pwszDst, size_t cwcDst, const char *pszSrc);
341
342	/**
343	* String copy with overflow handling.
344	*
345	* @retval VINF_SUCCESS on success.
346	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
347	* buffer will contain as much of the string as it can hold, fully
348	* terminated.
349	*
350	* @param pwszDst The destination buffer.
351	* @param cwcDst The size of the destination buffer in RTUTF16s.
352	* @param pwszSrc The source string. NULL is not OK.
353	* @param cwcSrcMax The maximum number of chars (not code points) to
354	* copy from the source string, not counting the
355	* terminator as usual.
356	*/
357	RTDECL(int) RTUtf16CopyEx(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc, size_t cwcSrcMax);
358
359	/**
360	* String concatenation with overflow handling.
361	*
362	* @retval VINF_SUCCESS on success.
363	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
364	* buffer will contain as much of the string as it can hold, fully
365	* terminated.
366	*
367	* @param pwszDst The destination buffer.
368	* @param cwcDst The size of the destination buffer in RTUTF16s.
369	* @param pwszSrc The source string. NULL is not OK.
370	*/
371	RTDECL(int) RTUtf16Cat(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc);
372
373	/**
374	* String concatenation with overflow handling, ASCII source.
375	*
376	* @retval VINF_SUCCESS on success.
377	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
378	* buffer will contain as much of the string as it can hold, fully
379	* terminated.
380	*
381	* @param pwszDst The destination buffer.
382	* @param cwcDst The size of the destination buffer in RTUTF16s.
383	* @param pszSrc The source string, pure ASCII. NULL is not OK.
384	*/
385	RTDECL(int) RTUtf16CatAscii(PRTUTF16 pwszDst, size_t cwcDst, const char *pszSrc);
386
387	/**
388	* String concatenation with overflow handling.
389	*
390	* @retval VINF_SUCCESS on success.
391	* @retval VERR_BUFFER_OVERFLOW if the destination buffer is too small. The
392	* buffer will contain as much of the string as it can hold, fully
393	* terminated.
394	*
395	* @param pwszDst The destination buffer.
396	* @param cwcDst The size of the destination buffer in RTUTF16s.
397	* @param pwszSrc The source string. NULL is not OK.
398	* @param cwcSrcMax The maximum number of UTF-16 chars (not code
399	* points) to copy from the source string, not
400	* counting the terminator as usual.
401	*/
402	RTDECL(int) RTUtf16CatEx(PRTUTF16 pwszDst, size_t cwcDst, PCRTUTF16 pwszSrc, size_t cwcSrcMax);
403
404	/**
405	* Performs a case sensitive string compare between two UTF-16 strings.
406	*
407	* @returns < 0 if the first string less than the second string.
408	* @returns 0 if the first string identical to the second string.
409	* @returns > 0 if the first string greater than the second string.
410	* @param pwsz1 First UTF-16 string. Null is allowed.
411	* @param pwsz2 Second UTF-16 string. Null is allowed.
412	* @remark This function will not make any attempt to validate the encoding.
413	*/
414	RTDECL(int) RTUtf16Cmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
415
416	/**
417	* Performs a case sensitive string compare between an UTF-16 string and a pure
418	* ASCII string.
419	*
420	* @returns < 0 if the first string less than the second string.
421	* @returns 0 if the first string identical to the second string.
422	* @returns > 0 if the first string greater than the second string.
423	* @param pwsz1 First UTF-16 string. Null is allowed.
424	* @param psz2 Second string, pure ASCII. Null is allowed.
425	* @remark This function will not make any attempt to validate the encoding.
426	*/
427	RTDECL(int) RTUtf16CmpAscii(PCRTUTF16 pwsz1, const char *psz2);
428
429	/**
430	* Performs a case sensitive string compare between an UTF-16 string and a UTF-8
431	* string.
432	*
433	* @returns < 0 if the first string less than the second string.
434	* @returns 0 if the first string identical to the second string.
435	* @returns > 0 if the first string greater than the second string.
436	* @param pwsz1 First UTF-16 string. Null is allowed.
437	* @param psz2 Second string, UTF-8. Null is allowed.
438	* @remarks NULL and empty strings are treated equally.
439	*/
440	RTDECL(int) RTUtf16CmpUtf8(PCRTUTF16 pwsz1, const char *psz2);
441
442
443	/**
444	* Performs a case sensitive and length limited string compare between two UTF-16 strings.
445	*
446	* @returns < 0 if the first string less than the second string.
447	* @returns 0 if the first string identical to the second string.
448	* @returns > 0 if the first string greater than the second string.
449	* @param pwsz1 First UTF-16 string. Null is allowed.
450	* @param pwsz2 Second UTF-16 string. Null is allowed.
451	* @param cwcMax Maximum number of characters (RTUTF16) to compare.
452	* @remark This function will not make any attempt to validate the encoding.
453	*/
454	RTDECL(int) RTUtf16NCmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
455
456	/**
457	* Performs a case sensitive and length limited string compare between an UTF-16
458	* string and a pure ASCII string.
459	*
460	* @returns < 0 if the first string less than the second string.
461	* @returns 0 if the first string identical to the second string.
462	* @returns > 0 if the first string greater than the second string.
463	* @param pwsz1 First UTF-16 string. Null is allowed.
464	* @param psz2 Second string, pure ASCII. Null is allowed.
465	* @param cwcMax Maximum number of characters (RTUTF16) to compare.
466	* @remark This function will not make any attempt to validate the encoding.
467	*/
468	RTDECL(int) RTUtf16NCmpAscii(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax);
469
470	/**
471	* Performs a case sensitive and length limited string compare between an UTF-16
472	* string and a UTF-8 string.
473	*
474	* @returns < 0 if the first string less than the second string.
475	* @returns 0 if the first string identical to the second string.
476	* @returns > 0 if the first string greater than the second string.
477	* @param pwsz1 First UTF-16 string. Null is allowed.
478	* @param psz2 Second string, UTF-8. Null is allowed.
479	* @param cwcMax1 Maximum number of UTF-16 characters (RTUTF16) from the
480	* first string to compare.
481	* @param cchMax2 Maximum number of UTF-8 characters (char) from the
482	* second string to compare.
483	* @remarks NULL and empty strings are treated equally.
484	*/
485	RTDECL(int) RTUtf16NCmpUtf8(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax1, size_t cchMax2);
486
487
488	/**
489	* Performs a case insensitive string compare between two UTF-16 strings.
490	*
491	* This is a simplified compare, as only the simplified lower/upper case folding
492	* specified by the unicode specs are used. It does not consider character pairs
493	* as they are used in some languages, just simple upper & lower case compares.
494	*
495	* @returns < 0 if the first string less than the second string.
496	* @returns 0 if the first string identical to the second string.
497	* @returns > 0 if the first string greater than the second string.
498	* @param pwsz1 First UTF-16 string. Null is allowed.
499	* @param pwsz2 Second UTF-16 string. Null is allowed.
500	*/
501	RTDECL(int) RTUtf16ICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
502
503	/**
504	* Performs a case insensitive string compare between two big endian UTF-16
505	* strings.
506	*
507	* This is a simplified compare, as only the simplified lower/upper case folding
508	* specified by the unicode specs are used. It does not consider character pairs
509	* as they are used in some languages, just simple upper & lower case compares.
510	*
511	* @returns < 0 if the first string less than the second string.
512	* @returns 0 if the first string identical to the second string.
513	* @returns > 0 if the first string greater than the second string.
514	* @param pwsz1 First big endian UTF-16 string. Null is allowed.
515	* @param pwsz2 Second big endian UTF-16 string. Null is allowed.
516	*/
517	RTDECL(int) RTUtf16BigICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
518
519	/**
520	* Performs a case insensitive string compare between an UTF-16 string and a
521	* UTF-8 string.
522	*
523	* @returns < 0 if the first string less than the second string.
524	* @returns 0 if the first string identical to the second string.
525	* @returns > 0 if the first string greater than the second string.
526	* @param pwsz1 First UTF-16 string. Null is allowed.
527	* @param psz2 Second string, UTF-8. Null is allowed.
528	* @remarks NULL and empty strings are treated equally.
529	*/
530	RTDECL(int) RTUtf16ICmpUtf8(PCRTUTF16 pwsz1, const char *psz2);
531
532	/**
533	* Performs a case insensitive string compare between an UTF-16 string and a
534	* pure ASCII string.
535	*
536	* Since this compare only cares about the first 128 codepoints in
537	* unicode, no tables are needed and there aren't any real complications.
538	*
539	* @returns < 0 if the first string less than the second string.
540	* @returns 0 if the first string identical to the second string.
541	* @returns > 0 if the first string greater than the second string.
542	* @param pwsz1 First UTF-16 string. Null is allowed.
543	* @param psz2 Second string, pure ASCII. Null is allowed.
544	*/
545	RTDECL(int) RTUtf16ICmpAscii(PCRTUTF16 pwsz1, const char *psz2);
546
547	/**
548	* Performs a case insensitive string compare between two UTF-16 strings
549	* using the current locale of the process (if applicable).
550	*
551	* This differs from RTUtf16ICmp() in that it will try, if a locale with the
552	* required data is available, to do a correct case-insensitive compare. It
553	* follows that it is more complex and thereby likely to be more expensive.
554	*
555	* @returns < 0 if the first string less than the second string.
556	* @returns 0 if the first string identical to the second string.
557	* @returns > 0 if the first string greater than the second string.
558	* @param pwsz1 First UTF-16 string. Null is allowed.
559	* @param pwsz2 Second UTF-16 string. Null is allowed.
560	*/
561	RTDECL(int) RTUtf16LocaleICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2);
562
563	/**
564	* Performs a case insensitive string compare between two UTF-16 strings,
565	* stopping after N characters.
566	*
567	* This is a simplified compare, as only the simplified lower/upper case folding
568	* specified by the unicode specs are used. It does not consider character pairs
569	* as they are used in some languages, just simple upper & lower case compares.
570	*
571	* @returns < 0 if the first string less than the second string.
572	* @returns 0 if the first string identical to the second string.
573	* @returns > 0 if the first string greater than the second string.
574	* @param pwsz1 First UTF-16 string. Null is allowed.
575	* @param pwsz2 Second UTF-16 string. Null is allowed.
576	* @param cwcMax Maximum number of characters to compare.
577	*/
578	RTDECL(int) RTUtf16NICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
579
580	/**
581	* Performs a case insensitive string compare between two big endian UTF-16
582	* strings, stopping after N characters.
583	*
584	* This is a simplified compare, as only the simplified lower/upper case folding
585	* specified by the unicode specs are used. It does not consider character pairs
586	* as they are used in some languages, just simple upper & lower case compares.
587	*
588	* @returns < 0 if the first string less than the second string.
589	* @returns 0 if the first string identical to the second string.
590	* @returns > 0 if the first string greater than the second string.
591	* @param pwsz1 First big endian UTF-16 string. Null is allowed.
592	* @param pwsz2 Second big endian UTF-16 string. Null is allowed.
593	* @param cwcMax Maximum number of characters to compare.
594	*/
595	RTDECL(int) RTUtf16BigNICmp(PCRTUTF16 pwsz1, PCRTUTF16 pwsz2, size_t cwcMax);
596
597	/**
598	* Performs a case insensitive string compare between a UTF-16 string and a pure
599	* ASCII string, stopping after N characters.
600	*
601	* Since this compare only cares about the first 128 codepoints in
602	* unicode, no tables are needed and there aren't any real complications.
603	*
604	* @returns < 0 if the first string less than the second string.
605	* @returns 0 if the first string identical to the second string.
606	* @returns > 0 if the first string greater than the second string.
607	* @param pwsz1 The UTF-16 first string. Null is allowed.
608	* @param psz2 The pure ASCII second string. Null is allowed.
609	* @param cwcMax Maximum number of UTF-16 characters to compare.
610	*/
611	RTDECL(int) RTUtf16NICmpAscii(PCRTUTF16 pwsz1, const char *psz2, size_t cwcMax);
612
613
614	/**
615	* Locates a substring, ascii version.
616	*
617	* @returns Offset into @a pwszString of the substring if found, -1 if not.
618	* @param pwszString The UTF-16 to search. NULL is allowed (no match).
619	* @param pszSubStr The pure ASCII substring to locate. NULL is allowed (not
620	* matching anything, just like an empty string).
621	*/
622	RTDECL(ssize_t) RTUtf16FindAscii(PCRTUTF16 pwszString, const char *pszSubStr);
623
624
625	/**
626	* Folds a UTF-16 string to lowercase.
627	*
628	* This is a very simple folding; it uses the simple lowercase
629	* code point, it is not related to any locale just the most common
630	* lowercase codepoint setup by the unicode specs, and it will not
631	* create new surrogate pairs or remove existing ones.
632	*
633	* @returns Pointer to the passed in string.
634	* @param pwsz The string to fold.
635	*/
636	RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz);
637
638	/**
639	* Folds a UTF-16 string to uppercase.
640	*
641	* This is a very simple folding; it uses the simple uppercase
642	* code point, it is not related to any locale just the most common
643	* uppercase codepoint setup by the unicode specs, and it will not
644	* create new surrogate pairs or remove existing ones.
645	*
646	* @returns Pointer to the passed in string.
647	* @param pwsz The string to fold.
648	*/
649	RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz);
650
651	/**
652	* Validates the UTF-16 encoding of the string.
653	*
654	* @returns iprt status code.
655	* @param pwsz The string.
656	*/
657	RTDECL(int) RTUtf16ValidateEncoding(PCRTUTF16 pwsz);
658
659	/**
660	* Validates the UTF-16 encoding of the string.
661	*
662	* @returns iprt status code.
663	* @param pwsz The string.
664	* @param cwc The max string length (/ size) in UTF-16 units. Use
665	* RTSTR_MAX to process the entire string.
666	* @param fFlags Combination of RTSTR_VALIDATE_ENCODING_XXX flags.
667	*/
668	RTDECL(int) RTUtf16ValidateEncodingEx(PCRTUTF16 pwsz, size_t cwc, uint32_t fFlags);
669
670	/**
671	* Checks if the UTF-16 encoding is valid.
672	*
673	* @returns true / false.
674	* @param pwsz The string.
675	*/
676	RTDECL(bool) RTUtf16IsValidEncoding(PCRTUTF16 pwsz);
677
678	/**
679	* Sanitise a (valid) UTF-16 string by replacing all characters outside a white
680	* list in-place by an ASCII replacement character.
681	*
682	* Surrogate pairs will be replaced by two chars.
683	*
684	* @returns The number of code points replaced. In the case of an incorrectly
685	* encoded string -1 will be returned, and the string is not completely
686	* processed. In the case of puszValidPairs having an odd number of
687	* code points, -1 will also be returned but without any modification to
688	* the string.
689	* @param pwsz The string to sanitise.
690	* @param puszValidPairs A zero-terminated array of pairs of Unicode points.
691	* Each pair is the start and end point of a range,
692	* and the union of these ranges forms the white list.
693	* @param chReplacement The ASCII replacement character.
694	* @sa RTStrPurgeComplementSet
695	*/
696	RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidPairs, char chReplacement);
697
698
699	/**
700	* Translate a UTF-16 string into a UTF-8 allocating the result buffer (default
701	* tag).
702	*
703	* @returns iprt status code.
704	* @param pwszString UTF-16 string to convert.
705	* @param ppszString Receives pointer of allocated UTF-8 string on
706	* success, and is always set to NULL on failure.
707	* The returned pointer must be freed using RTStrFree().
708	*/
709	#define RTUtf16ToUtf8(pwszString, ppszString) RTUtf16ToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
710
711	/**
712	* Translate a UTF-16 string into a UTF-8 string, allocating the result buffer (custom tag).
713	*
714	* @returns iprt status code.
715	* @param pwszString UTF-16 string to convert.
716	* @param ppszString Receives the pointer to the allocated UTF-8 string on
717	* success, and is always set to NULL on failure.
718	* The returned pointer must be freed using RTStrFree().
719	* @param pszTag Allocation tag used for statistics and such.
720	*/
721	RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
722
723	/**
724	* Translate a UTF-16BE string into a UTF-8 allocating the result buffer
725	* (default tag).
726	*
727	* This differs from RTUtf16ToUtf8 in that the input is always a
728	* big-endian string.
729	*
730	* @returns iprt status code.
731	* @param pwszString UTF-16BE string to convert.
732	* @param ppszString Receives pointer of allocated UTF-8 string on
733	* success, and is always set to NULL on failure.
734	* The returned pointer must be freed using RTStrFree().
735	*/
736	#define RTUtf16BigToUtf8(pwszString, ppszString) RTUtf16BigToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
737
738	/**
739	* Translate a UTF-16BE string into a UTF-8 string, allocating the result buffer (custom tag).
740	*
741	* This differs from RTUtf16ToUtf8Tag in that the input is always a
742	* big-endian string.
743	*
744	* @returns iprt status code.
745	* @param pwszString UTF-16BE string to convert.
746	* @param ppszString Receives the pointer to the allocated UTF-8 string on
747	* success, and is always set to NULL on failure.
748	* The returned pointer must be freed using RTStrFree().
749	* @param pszTag Allocation tag used for statistics and such.
750	*/
751	RTDECL(int) RTUtf16BigToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
752
753	/**
754	* Translate a UTF-16LE string into a UTF-8 allocating the result buffer
755	* (default tag).
756	*
757	* This differs from RTUtf16ToUtf8 in that the input is always a
758	* little-endian string.
759	*
760	* @returns iprt status code.
761	* @param pwszString UTF-16LE string to convert.
762	* @param ppszString Receives pointer of allocated UTF-8 string on
763	* success, and is always set to NULL on failure.
764	* The returned pointer must be freed using RTStrFree().
765	*/
766	#define RTUtf16LittleToUtf8(pwszString, ppszString) RTUtf16LittleToUtf8Tag((pwszString), (ppszString), RTSTR_TAG)
767
768	/**
769	* Translate a UTF-16LE string into a UTF-8 string, allocating the result buffer (custom tag).
770	*
771	* This differs from RTUtf16ToUtf8Tag in that the input is always a
772	* little-endian string.
773	*
774	* @returns iprt status code.
775	* @param pwszString UTF-16LE string to convert.
776	* @param ppszString Receives the pointer to the allocated UTF-8 string on
777	* success, and is always set to NULL on failure.
778	* The returned pointer must be freed using RTStrFree().
779	* @param pszTag Allocation tag used for statistics and such.
780	*/
781	RTDECL(int) RTUtf16LittleToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag);
782
783
784	/**
785	* Translates UTF-16 to UTF-8 using buffer provided by the caller or a fittingly
786	* sized buffer allocated by the function (default tag).
787	*
788	* @returns iprt status code.
789	* @param pwszString The UTF-16 string to convert.
790	* @param cwcString The number of RTUTF16 items to translate from pwszString.
791	* The translation will stop when reaching cwcString or the terminator ('\\0').
792	* Use RTSTR_MAX to translate the entire string.
793	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
794	* a buffer of the specified size, or pointer to a NULL pointer.
795	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
796	* will be allocated to hold the translated string.
797	* If a buffer was requested it must be freed using RTStrFree().
798	* @param cch The buffer size in chars (the type). This includes the terminator.
799	* @param pcch Where to store the length of the translated string,
800	* excluding the terminator. (Optional)
801	*
802	* This may be set under some error conditions,
803	* however, only for VERR_BUFFER_OVERFLOW and
804	* VERR_NO_STR_MEMORY will it contain a valid string
805	* length that can be used to resize the buffer.
806	*/
807	#define RTUtf16ToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
808	RTUtf16ToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
809
810	/**
811	* Translates UTF-16 to UTF-8 using buffer provided by the caller or a fittingly
812	* sized buffer allocated by the function (custom tag).
813	*
814	* @returns iprt status code.
815	* @param pwszString The UTF-16 string to convert.
816	* @param cwcString The number of RTUTF16 items to translate from pwszString.
817	* The translation will stop when reaching cwcString or the terminator ('\\0').
818	* Use RTSTR_MAX to translate the entire string.
819	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
820	* a buffer of the specified size, or pointer to a NULL pointer.
821	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
822	* will be allocated to hold the translated string.
823	* If a buffer was requested it must be freed using RTStrFree().
824	* @param cch The buffer size in chars (the type). This includes the terminator.
825	* @param pcch Where to store the length of the translated string,
826	* excluding the terminator. (Optional)
827	*
828	* This may be set under some error conditions,
829	* however, only for VERR_BUFFER_OVERFLOW and
830	* VERR_NO_STR_MEMORY will it contain a valid string
831	* length that can be used to resize the buffer.
832	* @param pszTag Allocation tag used for statistics and such.
833	*/
834	RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch, const char *pszTag);
835
836	/**
837	* Translates UTF-16BE to UTF-8 using buffer provided by the caller or a
838	* fittingly sized buffer allocated by the function (default tag).
839	*
840	* This differs from RTUtf16ToUtf8Ex in that the input is always a
841	* big-endian string.
842	*
843	* @returns iprt status code.
844	* @param pwszString The UTF-16BE string to convert.
845	* @param cwcString The number of RTUTF16 items to translate from pwszString.
846	* The translation will stop when reaching cwcString or the terminator ('\\0').
847	* Use RTSTR_MAX to translate the entire string.
848	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
849	* a buffer of the specified size, or pointer to a NULL pointer.
850	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
851	* will be allocated to hold the translated string.
852	* If a buffer was requested it must be freed using RTStrFree().
853	* @param cch The buffer size in chars (the type). This includes the terminator.
854	* @param pcch Where to store the length of the translated string,
855	* excluding the terminator. (Optional)
856	*
857	* This may be set under some error conditions,
858	* however, only for VERR_BUFFER_OVERFLOW and
859	* VERR_NO_STR_MEMORY will it contain a valid string
860	* length that can be used to resize the buffer.
861	*/
862	#define RTUtf16BigToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
863	RTUtf16BigToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
864
865	/**
866	* Translates UTF-16BE to UTF-8 using buffer provided by the caller or a
867	* fittingly sized buffer allocated by the function (custom tag).
868	*
869	* This differs from RTUtf16ToUtf8ExTag in that the input is always a
870	* big-endian string.
871	*
872	* @returns iprt status code.
873	* @param pwszString The UTF-16BE string to convert.
874	* @param cwcString The number of RTUTF16 items to translate from pwszString.
875	* The translation will stop when reaching cwcString or the terminator ('\\0').
876	* Use RTSTR_MAX to translate the entire string.
877	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
878	* a buffer of the specified size, or pointer to a NULL pointer.
879	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
880	* will be allocated to hold the translated string.
881	* If a buffer was requested it must be freed using RTStrFree().
882	* @param cch The buffer size in chars (the type). This includes the terminator.
883	* @param pcch Where to store the length of the translated string,
884	* excluding the terminator. (Optional)
885	*
886	* This may be set under some error conditions,
887	* however, only for VERR_BUFFER_OVERFLOW and
888	* VERR_NO_STR_MEMORY will it contain a valid string
889	* length that can be used to resize the buffer.
890	* @param pszTag Allocation tag used for statistics and such.
891	*/
892	RTDECL(int) RTUtf16BigToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch, const char *pszTag);
893
894	/**
895	* Translates UTF-16LE to UTF-8 using buffer provided by the caller or a
896	* fittingly sized buffer allocated by the function (default tag).
897	*
898	* This differs from RTUtf16ToUtf8Ex in that the input is always a
899	* little-endian string.
900	*
901	* @returns iprt status code.
902	* @param pwszString The UTF-16LE string to convert.
903	* @param cwcString The number of RTUTF16 items to translate from pwszString.
904	* The translation will stop when reaching cwcString or the terminator ('\\0').
905	* Use RTSTR_MAX to translate the entire string.
906	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
907	* a buffer of the specified size, or pointer to a NULL pointer.
908	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
909	* will be allocated to hold the translated string.
910	* If a buffer was requested it must be freed using RTStrFree().
911	* @param cch The buffer size in chars (the type). This includes the terminator.
912	* @param pcch Where to store the length of the translated string,
913	* excluding the terminator. (Optional)
914	*
915	* This may be set under some error conditions,
916	* however, only for VERR_BUFFER_OVERFLOW and
917	* VERR_NO_STR_MEMORY will it contain a valid string
918	* length that can be used to resize the buffer.
919	*/
920	#define RTUtf16LittleToUtf8Ex(pwszString, cwcString, ppsz, cch, pcch) \
921	RTUtf16LittleToUtf8ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
922
923	/**
924	* Translates UTF-16LE to UTF-8 using buffer provided by the caller or a
925	* fittingly sized buffer allocated by the function (custom tag).
926	*
927	* This differs from RTUtf16ToUtf8ExTag in that the input is always a
928	* little-endian string.
929	*
930	* @returns iprt status code.
931	* @param pwszString The UTF-16LE string to convert.
932	* @param cwcString The number of RTUTF16 items to translate from pwszString.
933	* The translation will stop when reaching cwcString or the terminator ('\\0').
934	* Use RTSTR_MAX to translate the entire string.
935	* @param ppsz If cch is non-zero, this must either be pointing to a pointer to
936	* a buffer of the specified size, or pointer to a NULL pointer.
937	* If *ppsz is NULL or cch is zero a buffer of at least cch chars
938	* will be allocated to hold the translated string.
939	* If a buffer was requested it must be freed using RTStrFree().
940	* @param cch The buffer size in chars (the type). This includes the terminator.
941	* @param pcch Where to store the length of the translated string,
942	* excluding the terminator. (Optional)
943	*
944	* This may be set under some error conditions,
945	* however, only for VERR_BUFFER_OVERFLOW and
946	* VERR_NO_STR_MEMORY will it contain a valid string
947	* length that can be used to resize the buffer.
948	* @param pszTag Allocation tag used for statistics and such.
949	*/
950	RTDECL(int) RTUtf16LittleToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch,
951	const char *pszTag);
952
953	/**
954	* Calculates the length of the UTF-16 string in UTF-8 chars (bytes).
955	*
956	* This function will validate the string, and incorrectly encoded UTF-16
957	* strings will be rejected. The primary purpose of this function is to
958	* help allocate buffers for RTUtf16ToUtf8() of the correct size. For most
959	* other purposes RTUtf16ToUtf8Ex() should be used.
960	*
961	* @returns Number of chars (bytes) the string needs when encoded as UTF-8.
962	* @returns 0 if the string was incorrectly encoded; use RTUtf16CalcUtf8LenEx() when a status code is needed instead.
963	* @param pwsz The UTF-16 string.
964	*/
965	RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz);
966
967	/**
968	* Calculates the length of the UTF-16BE string in UTF-8 chars (bytes).
969	*
970	* This function will validate the string, and incorrectly encoded UTF-16BE
971	* strings will be rejected. The primary purpose of this function is to
972	* help allocate buffers for RTUtf16BigToUtf8() of the correct size. For most
973	* other purposes RTUtf16BigToUtf8Ex() should be used.
974	*
975	* @returns Number of chars (bytes) the string needs when encoded as UTF-8.
976	* @returns 0 if the string was incorrectly encoded; use RTUtf16BigCalcUtf8LenEx() when a status code is needed instead.
977	* @param pwsz The UTF-16BE string.
978	*/
979	RTDECL(size_t) RTUtf16BigCalcUtf8Len(PCRTUTF16 pwsz);
980
981	/**
982	* Calculates the length of the UTF-16LE string in UTF-8 chars (bytes).
983	*
984	* This function will validate the string, and incorrectly encoded UTF-16LE
985	* strings will be rejected. The primary purpose of this function is to
986	* help allocate buffers for RTUtf16LittleToUtf8() of the correct size. For
987	* most other purposes RTUtf16LittleToUtf8Ex() should be used.
988	*
989	* @returns Number of chars (bytes) the string needs when encoded as UTF-8.
990	* @returns 0 if the string was incorrectly encoded; use RTUtf16LittleCalcUtf8LenEx() when a status code is needed instead.
991	* @param pwsz The UTF-16LE string.
992	*/
993	RTDECL(size_t) RTUtf16LittleCalcUtf8Len(PCRTUTF16 pwsz);
994
995	/**
996	* Calculates the length of the UTF-16 string in UTF-8 chars (bytes).
997	*
998	* This function will validate the string, and incorrectly encoded UTF-16
999	* strings will be rejected.
1000	*
1001	* @returns iprt status code.
1002	* @param pwsz The UTF-16 string.
1003	* @param cwc The max string length (presumably in RTUTF16 units - confirm). Use RTSTR_MAX to process the entire string.
1004	* @param pcch Where to store the string length (in bytes). Optional.
1005	* This is undefined on failure.
1006	*/
1007	RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1008
1009	/**
1010	* Calculates the length of the UTF-16BE string in UTF-8 chars (bytes).
1011	*
1012	* This function will validate the string, and incorrectly encoded UTF-16BE
1013	* strings will be rejected.
1014	*
1015	* @returns iprt status code.
1016	* @param pwsz The UTF-16BE string.
1017	* @param cwc The max string length (presumably in RTUTF16 units - confirm). Use RTSTR_MAX to process the entire string.
1018	* @param pcch Where to store the string length (in bytes). Optional.
1019	* This is undefined on failure.
1020	*/
1021	RTDECL(int) RTUtf16BigCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1022
1023	/**
1024	* Calculates the length of the UTF-16LE string in UTF-8 chars (bytes).
1025	*
1026	* This function will validate the string, and incorrectly encoded UTF-16LE
1027	* strings will be rejected.
1028	*
1029	* @returns iprt status code.
1030	* @param pwsz The UTF-16LE string.
1031	* @param cwc The max string length (presumably in RTUTF16 units - confirm). Use RTSTR_MAX to process the entire string.
1032	* @param pcch Where to store the string length (in bytes). Optional.
1033	* This is undefined on failure.
1034	*/
1035	RTDECL(int) RTUtf16LittleCalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1036
1037	/**
1038	* Translate a UTF-16 string into a Latin-1 (ISO-8859-1) allocating the result
1039	* buffer (default tag).
1040	*
1041	* @returns iprt status code.
1042	* @param pwszString UTF-16 string to convert.
1043	* @param ppszString Receives pointer of allocated Latin1 string on
1044	* success, and is always set to NULL on failure.
1045	* The returned pointer must be freed using RTStrFree().
1046	*/
1047	#define RTUtf16ToLatin1(pwszString, ppszString) RTUtf16ToLatin1Tag((pwszString), (ppszString), RTSTR_TAG)
1048
1049	/**
1050	* Translate a UTF-16 string into a Latin-1 (ISO-8859-1) allocating the result
1051	* buffer (custom tag).
1052	*
1053	* @returns iprt status code.
1054	* @param pwszString UTF-16 string to convert.
1055	* @param ppszString Receives pointer of allocated Latin1 string on
1056	* success, and is always set to NULL on failure.
1057	* The returned pointer must be freed using RTStrFree().
1058	* @param pszTag Allocation tag used for statistics and such.
1059	*/
1060	RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char *ppszString, const char pszTag);
1061
1062	/**
1063	* Translates UTF-16 to Latin-1 (ISO-8859-1) using buffer provided by the caller
1064	* or a fittingly sized buffer allocated by the function (default tag).
1065	*
1066	* @returns iprt status code.
1067	* @param pwszString The UTF-16 string to convert.
1068	* @param cwcString The number of RTUTF16 items to translate from
1069	* pwszString. The translation will stop when reaching
1070	* cwcString or the terminator ('\\0'). Use RTSTR_MAX
1071	* to translate the entire string.
1072	* @param ppsz Pointer to the pointer to the Latin-1 string. The
1073	* buffer can optionally be preallocated by the caller.
1074	*
1075	* If cch is zero, *ppsz is undefined.
1076	*
1077	* If cch is non-zero and *ppsz is not NULL, then this
1078	* will be used as the output buffer.
1079	* VERR_BUFFER_OVERFLOW will be returned if this is
1080	* insufficient.
1081	*
1082	* If cch is zero or *ppsz is NULL, then a buffer of
1083	* sufficient size is allocated. cch can be used to
1084	* specify a minimum size of this buffer. Use
1085	* RTUtf16Free() to free the result.
1086	*
1087	* @param cch The buffer size in chars (the type). This includes
1088	* the terminator.
1089	* @param pcch Where to store the length of the translated string,
1090	* excluding the terminator. (Optional)
1091	*
1092	* This may be set under some error conditions,
1093	* however, only for VERR_BUFFER_OVERFLOW and
1094	* VERR_NO_STR_MEMORY will it contain a valid string
1095	* length that can be used to resize the buffer.
1096	*/
1097	#define RTUtf16ToLatin1Ex(pwszString, cwcString, ppsz, cch, pcch) \
1098	RTUtf16ToLatin1ExTag((pwszString), (cwcString), (ppsz), (cch), (pcch), RTSTR_TAG)
1099
1100	/**
1101	* Translates UTF-16 to Latin-1 (ISO-8859-1) using buffer provided by the caller
1102	* or a fittingly sized buffer allocated by the function (custom tag).
1103	*
1104	* @returns iprt status code.
1105	* @param pwszString The UTF-16 string to convert.
1106	* @param cwcString The number of RTUTF16 items to translate from
1107	* pwszString. The translation will stop when reaching
1108	* cwcString or the terminator ('\\0'). Use RTSTR_MAX
1109	* to translate the entire string.
1110	* @param ppsz Pointer to the pointer to the Latin-1 string. The
1111	* buffer can optionally be preallocated by the caller.
1112	*
1113	* If cch is zero, *ppsz is undefined.
1114	*
1115	* If cch is non-zero and *ppsz is not NULL, then this
1116	* will be used as the output buffer.
1117	* VERR_BUFFER_OVERFLOW will be returned if this is
1118	* insufficient.
1119	*
1120	* If cch is zero or *ppsz is NULL, then a buffer of
1121	* sufficient size is allocated. cch can be used to
1122	* specify a minimum size of this buffer. Use
1123	* RTUtf16Free() to free the result.
1124	*
1125	* @param cch The buffer size in chars (the type). This includes
1126	* the terminator.
1127	* @param pcch Where to store the length of the translated string,
1128	* excluding the terminator. (Optional)
1129	*
1130	* This may be set under some error conditions,
1131	* however, only for VERR_BUFFER_OVERFLOW and
1132	* VERR_NO_STR_MEMORY will it contain a valid string
1133	* length that can be used to resize the buffer.
1134	* @param pszTag Allocation tag used for statistics and such.
1135	*/
1136	RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char *ppsz, size_t cch, size_t pcch, const char *pszTag);
1137
1138	/**
1139	* Calculates the length of the UTF-16 string in Latin-1 (ISO-8859-1) chars.
1140	*
1141	* This function will validate the string, and incorrectly encoded UTF-16
1142	* strings will be rejected. The primary purpose of this function is to
1143	* help allocate buffers for RTUtf16ToLatin1() of the correct size. For most
1144	* other purposes RTUtf16ToLatin1Ex() should be used.
1145	*
1146	* @returns Number of chars (bytes) the string needs when encoded as Latin-1.
1147	* @returns 0 if the string was incorrectly encoded; use RTUtf16CalcLatin1LenEx() when a status code is needed instead.
1148	* @param pwsz The UTF-16 string.
1149	*/
1150	RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz);
1151
1152	/**
1153	* Calculates the length of the UTF-16 string in Latin-1 (ISO-8859-1) chars.
1154	*
1155	* This function will validate the string, and incorrectly encoded UTF-16
1156	* strings will be rejected.
1157	*
1158	* @returns iprt status code.
1159	* @param pwsz The UTF-16 string.
1160	* @param cwc The max string length (presumably in RTUTF16 units - confirm). Use RTSTR_MAX to process the
1161	* entire string.
1162	* @param pcch Where to store the string length (in bytes). Optional.
1163	* This is undefined on failure.
1164	*/
1165	RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch);
1166
1167	/**
1168	* Get the unicode code point at the given string position.
1169	*
1170	* @returns unicode code point.
1171	* @returns RTUNICP_INVALID if the encoding is invalid.
1172	* @param pwsz The string.
1173	*
1174	* @remark This is an internal worker for RTUtf16GetCp(), which only calls it for units in 0xd800..0xdfff or at/above 0xfffe.
1175	*/
1176	RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz);
1177
1178	/**
1179	* Get the unicode code point at the given string position.
1180	*
1181	* @returns iprt status code.
1182	* @param ppwsz Pointer to the string pointer. This will be updated to
1183	* point to the char following the current code point.
1184	* @param pCp Where to store the code point.
1185	* RTUNICP_INVALID is stored here on failure.
1186	*
1187	* @remark This is an internal worker for RTUtf16GetCpEx(), which only calls it for units in 0xd800..0xdfff or at/above 0xfffe.
1188	*/
1189	RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp);
1190
1191	/**
1192	* Get the unicode code point at the given string position with length
1193	* restriction.
1194	*
1195	* @returns iprt status code.
1196	* @param ppwsz Pointer to the string pointer. This will be updated to
1197	* point to the char following the current code point.
1198	* @param pcwc Pointer to the max string length. This will be
1199	* decremented corrsponding to the advancement of @a ppwsz.
1200	* @param pCp Where to store the code point.
1201	* RTUNICP_INVALID is stored here on failure.
1202	*
1203	* @remark This is an internal worker for RTUtf16GetCpNEx().
1204	*/
1205	RTDECL(int) RTUtf16GetCpNExInternal(PCRTUTF16 ppwsz, size_t pcwc, PRTUNICP pCp);
1206
1207	/**
1208	* Get the unicode code point at the given string position, big endian.
1209	*
1210	* @returns iprt status code.
1211	* @param ppwsz Pointer to the big-endian string pointer. This will be updated to
1212	* point to the char following the current code point.
1213	* @param pCp Where to store the code point.
1214	* RTUNICP_INVALID is stored here on failure.
1215	*
1216	* @remark This is an internal worker for RTUtf16BigGetCpEx(), used on hosts where RT_BIG_ENDIAN is not defined.
1217	*/
1218	RTDECL(int) RTUtf16BigGetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp);
1219
1220	/**
1221	* Put the unicode code point at the given string position
1222	* and return the pointer to the char following it.
1223	*
1224	* This function will not consider anything at or following the
1225	* buffer area pointed to by pwsz. It is therefore not suitable for
1226	* inserting code points into a string, only appending/overwriting.
1227	*
1228	* @returns pointer to the char following the written code point.
1229	* @param pwsz The string.
1230	* @param CodePoint The code point to write.
1231	* This should not be RTUNICP_INVALID or any other
1232	* character out of the UTF-16 range.
1233	*
1234	* @remark This is an internal worker for RTUtf16PutCp().
1235	*/
1236	RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint);
1237
1238	/**
1239	* Get the unicode code point at the given string position.
1240	*
1241	* @returns unicode code point.
1242	* @returns RTUNICP_INVALID if the encoding is invalid.
1243	* @param pwsz The string.
1244	*
1245	* @remark We optimize this operation by using an inline function for
1246	* everything which isn't a surrogate pair or an endian indicator.
1247	*/
1248	DECLINLINE(RTUNICP) RTUtf16GetCp(PCRTUTF16 pwsz)
1249	{
1250	const RTUTF16 wc = *pwsz;
1251	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1252	return wc;
1253	return RTUtf16GetCpInternal(pwsz);
1254	}
1255
1256	/**
1257	* Get the unicode code point at the given string position.
1258	*
1259	* @returns iprt status code.
1260	* @param ppwsz Pointer to the string pointer. This will be updated to
1261	* point to the char following the current code point.
1262	* @param pCp Where to store the code point.
1263	* RTUNICP_INVALID is stored here on failure.
1264	*
1265	* @remark We optimize this operation by using an inline function for
1266	* everything which isn't a surrogate pair or and endian indicator.
1267	*/
1268	DECLINLINE(int) RTUtf16GetCpEx(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1269	{
1270	const RTUTF16 wc = **ppwsz;
1271	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1272	{
1273	(*ppwsz)++;
1274	*pCp = wc;
1275	return VINF_SUCCESS;
1276	}
1277	return RTUtf16GetCpExInternal(ppwsz, pCp);
1278	}
1279
1280	/**
1281	* Get the unicode code point at the given string position.
1282	*
1283	* @returns iprt status code.
1284	* @param ppwsz Pointer to the string pointer. This will be updated to
1285	* point to the char following the current code point.
1286	* @param pcwc Pointer to the max string length. This will be
1287	* decremented corrsponding to the advancement of @a ppwsz.
1288	* @param pCp Where to store the code point. RTUNICP_INVALID is stored
1289	* here on failure.
1290	*
1291	* @remark We optimize this operation by using an inline function for
1292	* everything which isn't a surrogate pair or and endian indicator.
1293	*/
1294	DECLINLINE(int) RTUtf16GetCpNEx(PCRTUTF16 ppwsz, size_t pcwc, PRTUNICP pCp)
1295	{
1296	const size_t cwc = *pcwc;
1297	if (cwc > 0)
1298	{
1299	const PCRTUTF16 pwsz = *ppwsz;
1300	const RTUTF16 wc = *pwsz;
1301	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1302	{
1303	*pCp = wc;
1304	*pcwc = cwc - 1;
1305	*ppwsz = pwsz + 1;
1306	return VINF_SUCCESS;
1307	}
1308	}
1309	return RTUtf16GetCpNExInternal(ppwsz, pcwc, pCp);
1310	}
1311
1312	/**
1313	* Get the unicode code point at the given string position, big endian version.
1314	*
1315	* @returns iprt status code.
1316	* @param ppwsz Pointer to the string pointer. This will be updated to
1317	* point to the char following the current code point.
1318	* @param pCp Where to store the code point.
1319	* RTUNICP_INVALID is stored here on failure.
1320	*
1321	* @remark We optimize this operation by using an inline function for
1322	* everything which isn't a surrogate pair or and endian indicator.
1323	*/
1324	DECLINLINE(int) RTUtf16BigGetCpEx(PCRTUTF16 *ppwsz, PRTUNICP pCp)
1325	{
1326	#ifdef RT_BIG_ENDIAN
1327	return RTUtf16GetCpEx(ppwsz, pCp);
1328	#else
1329	# ifdef IPRT_INCLUDED_asm_h
1330	const RTUTF16 wc = RT_BE2H_U16(**ppwsz);
1331	if (wc < 0xd800 \|\| (wc > 0xdfff && wc < 0xfffe))
1332	{
1333	(*ppwsz)++;
1334	*pCp = wc;
1335	return VINF_SUCCESS;
1336	}
1337	# endif
1338	return RTUtf16BigGetCpExInternal(ppwsz, pCp);
1339	#endif
1340	}
1341
1342	/**
1343	* Put the unicode code point at the given string position
1344	* and return the pointer to the char following it.
1345	*
1346	* This function will not consider anything at or following the
1347	* buffer area pointed to by pwsz. It is therefore not suitable for
1348	* inserting code points into a string, only appending/overwriting.
1349	*
1350	* @returns pointer to the char following the written code point.
1351	* @param pwsz The string.
1352	* @param CodePoint The code point to write.
1353	* This should not be RTUNICP_INVALID or any other
1354	* character out of the UTF-16 range.
1355	*
1356	* @remark We optimize this operation by using an inline function for
1357	* everything which isn't a surrogate pair or and endian indicator.
1358	*/
1359	DECLINLINE(PRTUTF16) RTUtf16PutCp(PRTUTF16 pwsz, RTUNICP CodePoint)
1360	{
1361	if (CodePoint < 0xd800 \|\| (CodePoint > 0xd800 && CodePoint < 0xfffe))
1362	{
1363	*pwsz++ = (RTUTF16)CodePoint;
1364	return pwsz;
1365	}
1366	return RTUtf16PutCpInternal(pwsz, CodePoint);
1367	}
1368
1369	/**
1370	* Skips ahead, past the current code point.
1371	*
1372	* @returns Pointer to the char after the current code point.
1373	* @param pwsz Pointer to the current code point.
1374	* @remark This will not move the next valid code point, only past the current one.
1375	*/
1376	DECLINLINE(PRTUTF16) RTUtf16NextCp(PCRTUTF16 pwsz)
1377	{
1378	RTUNICP Cp;
1379	RTUtf16GetCpEx(&pwsz, &Cp);
1380	return (PRTUTF16)pwsz;
1381	}
1382
1383	/**
1384	* Skips backwards, to the previous code point.
1385	*
1386	* @returns Pointer to the previous code point (NOTE(review): the old text said "after the current code point", contradicting the summary - confirm against the implementation).
1387	* @param pwszStart Pointer to the start of the string.
1388	* @param pwsz Pointer to the current code point.
1389	*/
1390	RTDECL(PRTUTF16) RTUtf16PrevCp(PCRTUTF16 pwszStart, PCRTUTF16 pwsz);
1391
1392
1393	/**
1394	* Checks if the UTF-16 char is the high surrogate char (i.e.
1395	* the 1st char in the pair).
1396	*
1397	* @returns true if it is.
1398	* @returns false if it isn't.
1399	* @param wc The character to investigate.
1400	*/
1401	DECLINLINE(bool) RTUtf16IsHighSurrogate(RTUTF16 wc)
1402	{
1403	return wc >= 0xd800 && wc <= 0xdbff;
1404	}
1405
1406	/**
1407	* Checks if the UTF-16 char is the low surrogate char (i.e.
1408	* the 2nd char in the pair).
1409	*
1410	* @returns true if it is.
1411	* @returns false if it isn't.
1412	* @param wc The character to investigate.
1413	*/
1414	DECLINLINE(bool) RTUtf16IsLowSurrogate(RTUTF16 wc)
1415	{
1416	return wc >= 0xdc00 && wc <= 0xdfff;
1417	}
1418
1419
1420	/**
1421	* Checks if the two UTF-16 chars form a valid surrogate pair.
1422	*
1423	* @returns true if they do.
1424	* @returns false if they doesn't.
1425	* @param wcHigh The high (1st) character.
1426	* @param wcLow The low (2nd) character.
1427	*/
1428	DECLINLINE(bool) RTUtf16IsSurrogatePair(RTUTF16 wcHigh, RTUTF16 wcLow)
1429	{
1430	return RTUtf16IsHighSurrogate(wcHigh)
1431	&& RTUtf16IsLowSurrogate(wcLow);
1432	}
1433
1434	/**
1435	* Formats a buffer stream as hex bytes.
1436	*
1437	* The default is no separating spaces or line breaks or anything.
1438	*
1439	* @returns IPRT status code.
1440	* @retval VERR_INVALID_POINTER if any of the pointers are wrong.
1441	* @retval VERR_BUFFER_OVERFLOW if the buffer is insufficient to hold the bytes.
1442	*
1443	* @param pwszBuf Output string buffer.
1444	* @param cwcBuf The size of the output buffer in RTUTF16 units.
1445	* @param pv Pointer to the bytes to stringify.
1446	* @param cb The number of bytes to stringify.
1447	* @param fFlags Combination of RTSTRPRINTHEXBYTES_F_XXX values.
1448	* @sa RTStrPrintHexBytes.
1449	*/
1450	RTDECL(int) RTUtf16PrintHexBytes(PRTUTF16 pwszBuf, size_t cwcBuf, void const *pv, size_t cb, uint32_t fFlags);
1451
1452	/**
1453	* String printf producing UTF-16 output.
1454	*
1455	* @returns On success, positive count of formatted RTUTF16 units excluding the
1456	* terminator. On buffer overflow, negative number giving the required
1457	* buffer size (including terminator) in RTUTF16 units.
1458	*
1459	* @param pwszBuffer Output buffer.
1460	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1461	* @param pszFormat Pointer to the format string (a char string, not UTF-16), @see pg_rt_str_format.
1462	* @param args The format arguments.
1463	*
1464	* @note This is similar to RTStrPrintf2V (not RTStrPrintfV)!
1465	*/
1466	RTDECL(ssize_t) RTUtf16PrintfV(PRTUTF16 pwszBuffer, size_t cwcBuffer, const char *pszFormat, va_list args) RT_IPRT_FORMAT_ATTR(3, 0);
1467
1468	/**
1469	* String printf producing UTF-16 output.
1470	*
1471	* @returns On success, positive count of formatted RTUTF16 units excluding the
1472	* terminator. On buffer overflow, negative number giving the required
1473	* buffer size (including terminator) in RTUTF16 units.
1474	*
1475	* @param pwszBuffer Output buffer.
1476	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1477	* @param pszFormat Pointer to the format string (a char string, not UTF-16), @see pg_rt_str_format.
1478	* @param ... The format arguments.
1479	*
1480	* @note This is similar to RTStrPrintf2 (not RTStrPrintf)!
1481	*/
1482	RTDECL(ssize_t) RTUtf16Printf(PRTUTF16 pwszBuffer, size_t cwcBuffer, const char *pszFormat, ...) RT_IPRT_FORMAT_ATTR(3, 4);
1483
1484	/**
1485	* String printf producing UTF-16 output with custom formatting.
1486	*
1487	* @returns On success, positive count of formatted RTUTF16 units excluding the
1488	* terminator. On buffer overflow, negative number giving the required
1489	* buffer size (including terminator) in RTUTF16 units.
1490	*
1491	* @param pfnFormat Pointer to handler function for the custom formats.
1492	* @param pvArg Argument to the pfnFormat function.
1493	* @param pwszBuffer Output buffer.
1494	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1495	* @param pszFormat Pointer to the format string (a char string, not UTF-16), @see pg_rt_str_format.
1496	* @param args The format arguments.
1497	*
1498	* @note This is similar to RTStrPrintf2ExV (not RTStrPrintfExV)!
1499	*/
1500	RTDECL(ssize_t) RTUtf16PrintfExV(PFNSTRFORMAT pfnFormat, void *pvArg, PRTUTF16 pwszBuffer, size_t cwcBuffer,
1501	const char *pszFormat, va_list args) RT_IPRT_FORMAT_ATTR(5, 0);
1502
1503	/**
1504	* String printf producing UTF-16 output with custom formatting.
1505	*
1506	* @returns On success, positive count of formatted RTUTF16 units excluding the
1507	* terminator. On buffer overflow, negative number giving the required
1508	* buffer size (including terminator) in RTUTF16 units.
1509	*
1510	* @param pfnFormat Pointer to handler function for the custom formats.
1511	* @param pvArg Argument to the pfnFormat function.
1512	* @param pwszBuffer Output buffer.
1513	* @param cwcBuffer Size of the output buffer in RTUTF16 units.
1514	* @param pszFormat Pointer to the format string (a char string, not UTF-16), @see pg_rt_str_format.
1515	* @param ... The format arguments.
1516	*
1517	* @note This is similar to RTStrPrintf2Ex (not RTStrPrintfEx)!
1518	*/
1519	RTDECL(ssize_t) RTUtf16PrintfEx(PFNSTRFORMAT pfnFormat, void *pvArg, PRTUTF16 pwszBuffer, size_t cwcBuffer,
1520	const char *pszFormat, ...) RT_IPRT_FORMAT_ATTR(5, 6);
1521
1522	/** @} */
1523	RT_C_DECLS_END
1524
1525	#endif /* !IPRT_INCLUDED_utf16_h */
1526

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/include/iprt/utf16.h

Download in other formats: