Context Navigation

xmlstring.c

Last change on this file was 104106, checked in by vboxsync, 8 weeks ago
libxml2-2.9.14: Applied and adjusted our libxml2 changes to 2.9.14. bugref:10640
Property svn:eol-style set to `native`
File size: 26.0 KB

Line
1	/*
2	* string.c : an XML string utilities module
3	*
4	* This module provides various utility functions for manipulating
5	* the xmlChar* type. All functions named xmlStr* have been moved here
6	* from the parser.c file (their original home).
7	*
8	* See Copyright for the status of this software.
9	*
10	* UTF8 string routines from:
11	* William Brack <wbrack@mmm.com.hk>
12	*
13	* daniel@veillard.com
14	*/
15
16	#define IN_LIBXML
17	#include "libxml.h"
18
19	#include <stdlib.h>
20	#include <string.h>
21	#include <limits.h>
22	#include <libxml/xmlmemory.h>
23	#include <libxml/parserInternals.h>
24	#include <libxml/xmlstring.h>
25
26	#include "private/parser.h"
27	#include "private/string.h"
28
29	/************************************************************************
30	* *
31	* Commodity functions to handle xmlChars *
32	* *
33	************************************************************************/
34
35	/**
36	* xmlStrndup:
37	* @cur: the input xmlChar *
38	* @len: the len of @cur
39	*
40	* a strndup for array of xmlChar's
41	*
42	* Returns a new xmlChar * or NULL
43	*/
44	xmlChar *
45	xmlStrndup(const xmlChar *cur, int len) {
46	xmlChar *ret;
47
48	if ((cur == NULL) \|\| (len < 0)) return(NULL);
49	ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
50	if (ret == NULL) {
51	return(NULL);
52	}
53	memcpy(ret, cur, len);
54	ret[len] = 0;
55	return(ret);
56	}
57
58	/**
59	* xmlStrdup:
60	* @cur: the input xmlChar *
61	*
62	* a strdup for array of xmlChar's. Since they are supposed to be
63	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
64	* a termination mark of '0'.
65	*
66	* Returns a new xmlChar * or NULL
67	*/
68	xmlChar *
69	xmlStrdup(const xmlChar *cur) {
70	const xmlChar *p = cur;
71
72	if (cur == NULL) return(NULL);
73	while (p != 0) p++; / non input consuming */
74	return(xmlStrndup(cur, p - cur));
75	}
76
77	/**
78	* xmlCharStrndup:
79	* @cur: the input char *
80	* @len: the len of @cur
81	*
82	* a strndup for char's to xmlChar's
83	*
84	* Returns a new xmlChar * or NULL
85	*/
86
87	xmlChar *
88	xmlCharStrndup(const char *cur, int len) {
89	int i;
90	xmlChar *ret;
91
92	if ((cur == NULL) \|\| (len < 0)) return(NULL);
93	ret = (xmlChar *) xmlMallocAtomic((size_t) len + 1);
94	if (ret == NULL) {
95	return(NULL);
96	}
97	for (i = 0;i < len;i++) {
98	/* Explicit sign change */
99	ret[i] = (xmlChar) cur[i];
100	if (ret[i] == 0) return(ret);
101	}
102	ret[len] = 0;
103	return(ret);
104	}
105
106	/**
107	* xmlCharStrdup:
108	* @cur: the input char *
109	*
110	* a strdup for char's to xmlChar's
111	*
112	* Returns a new xmlChar * or NULL
113	*/
114
115	xmlChar *
116	xmlCharStrdup(const char *cur) {
117	const char *p = cur;
118
119	if (cur == NULL) return(NULL);
120	while (p != '\0') p++; / non input consuming */
121	return(xmlCharStrndup(cur, p - cur));
122	}
123
124	/**
125	* xmlStrcmp:
126	* @str1: the first xmlChar *
127	* @str2: the second xmlChar *
128	*
129	* a strcmp for xmlChar's
130	*
131	* Returns the integer result of the comparison
132	*/
133
134	int
135	xmlStrcmp(const xmlChar str1, const xmlChar str2) {
136	if (str1 == str2) return(0);
137	if (str1 == NULL) return(-1);
138	if (str2 == NULL) return(1);
139	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
140	return(strcmp((const char )str1, (const char )str2));
141	#else
142	do {
143	int tmp = str1++ - str2;
144	if (tmp != 0) return(tmp);
145	} while (*str2++ != 0);
146	return 0;
147	#endif
148	}
149
150	/**
151	* xmlStrEqual:
152	* @str1: the first xmlChar *
153	* @str2: the second xmlChar *
154	*
155	* Check if both strings are equal of have same content.
156	* Should be a bit more readable and faster than xmlStrcmp()
157	*
158	* Returns 1 if they are equal, 0 if they are different
159	*/
160
161	int
162	xmlStrEqual(const xmlChar str1, const xmlChar str2) {
163	if (str1 == str2) return(1);
164	if (str1 == NULL) return(0);
165	if (str2 == NULL) return(0);
166	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
167	return(strcmp((const char )str1, (const char )str2) == 0);
168	#else
169	do {
170	if (str1++ != str2) return(0);
171	} while (*str2++);
172	return(1);
173	#endif
174	}
175
176	/**
177	* xmlStrQEqual:
178	* @pref: the prefix of the QName
179	* @name: the localname of the QName
180	* @str: the second xmlChar *
181	*
182	* Check if a QName is Equal to a given string
183	*
184	* Returns 1 if they are equal, 0 if they are different
185	*/
186
187	int
188	xmlStrQEqual(const xmlChar pref, const xmlChar name, const xmlChar *str) {
189	if (pref == NULL) return(xmlStrEqual(name, str));
190	if (name == NULL) return(0);
191	if (str == NULL) return(0);
192
193	do {
194	if (pref++ != str) return(0);
195	} while ((str++) && (pref));
196	if (*str++ != ':') return(0);
197	do {
198	if (name++ != str) return(0);
199	} while (*str++);
200	return(1);
201	}
202
203	/**
204	* xmlStrncmp:
205	* @str1: the first xmlChar *
206	* @str2: the second xmlChar *
207	* @len: the max comparison length
208	*
209	* a strncmp for xmlChar's
210	*
211	* Returns the integer result of the comparison
212	*/
213
214	int
215	xmlStrncmp(const xmlChar str1, const xmlChar str2, int len) {
216	if (len <= 0) return(0);
217	if (str1 == str2) return(0);
218	if (str1 == NULL) return(-1);
219	if (str2 == NULL) return(1);
220	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
221	return(strncmp((const char )str1, (const char )str2, len));
222	#else
223	do {
224	int tmp = str1++ - str2;
225	if (tmp != 0 \|\| --len == 0) return(tmp);
226	} while (*str2++ != 0);
227	return 0;
228	#endif
229	}
230
231	static const xmlChar casemap[256] = {
232	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
233	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
234	0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
235	0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
236	0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
237	0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
238	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
239	0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
240	0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
241	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
242	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
243	0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
244	0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
245	0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
246	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
247	0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
248	0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
249	0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
250	0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
251	0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
252	0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
253	0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
254	0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
255	0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
256	0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
257	0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
258	0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
259	0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
260	0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
261	0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
262	0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
263	0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
264	};
265
266	/**
267	* xmlStrcasecmp:
268	* @str1: the first xmlChar *
269	* @str2: the second xmlChar *
270	*
271	* a strcasecmp for xmlChar's
272	*
273	* Returns the integer result of the comparison
274	*/
275
276	int
277	xmlStrcasecmp(const xmlChar str1, const xmlChar str2) {
278	register int tmp;
279
280	if (str1 == str2) return(0);
281	if (str1 == NULL) return(-1);
282	if (str2 == NULL) return(1);
283	do {
284	tmp = casemap[str1++] - casemap[str2];
285	if (tmp != 0) return(tmp);
286	} while (*str2++ != 0);
287	return 0;
288	}
289
290	/**
291	* xmlStrncasecmp:
292	* @str1: the first xmlChar *
293	* @str2: the second xmlChar *
294	* @len: the max comparison length
295	*
296	* a strncasecmp for xmlChar's
297	*
298	* Returns the integer result of the comparison
299	*/
300
301	int
302	xmlStrncasecmp(const xmlChar str1, const xmlChar str2, int len) {
303	register int tmp;
304
305	if (len <= 0) return(0);
306	if (str1 == str2) return(0);
307	if (str1 == NULL) return(-1);
308	if (str2 == NULL) return(1);
309	do {
310	tmp = casemap[str1++] - casemap[str2];
311	if (tmp != 0 \|\| --len == 0) return(tmp);
312	} while (*str2++ != 0);
313	return 0;
314	}
315
316	/**
317	* xmlStrchr:
318	* @str: the xmlChar * array
319	* @val: the xmlChar to search
320	*
321	* a strchr for xmlChar's
322	*
323	* Returns the xmlChar * for the first occurrence or NULL.
324	*/
325
326	const xmlChar *
327	xmlStrchr(const xmlChar *str, xmlChar val) {
328	if (str == NULL) return(NULL);
329	while (str != 0) { / non input consuming */
330	if (str == val) return((xmlChar ) str);
331	str++;
332	}
333	return(NULL);
334	}
335
336	/**
337	* xmlStrstr:
338	* @str: the xmlChar * array (haystack)
339	* @val: the xmlChar to search (needle)
340	*
341	* a strstr for xmlChar's
342	*
343	* Returns the xmlChar * for the first occurrence or NULL.
344	*/
345
346	const xmlChar *
347	xmlStrstr(const xmlChar str, const xmlChar val) {
348	int n;
349
350	if (str == NULL) return(NULL);
351	if (val == NULL) return(NULL);
352	n = xmlStrlen(val);
353
354	if (n == 0) return(str);
355	while (str != 0) { / non input consuming */
356	if (str == val) {
357	if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
358	}
359	str++;
360	}
361	return(NULL);
362	}
363
364	/**
365	* xmlStrcasestr:
366	* @str: the xmlChar * array (haystack)
367	* @val: the xmlChar to search (needle)
368	*
369	* a case-ignoring strstr for xmlChar's
370	*
371	* Returns the xmlChar * for the first occurrence or NULL.
372	*/
373
374	const xmlChar *
375	xmlStrcasestr(const xmlChar str, const xmlChar val) {
376	int n;
377
378	if (str == NULL) return(NULL);
379	if (val == NULL) return(NULL);
380	n = xmlStrlen(val);
381
382	if (n == 0) return(str);
383	while (str != 0) { / non input consuming */
384	if (casemap[str] == casemap[val])
385	if (!xmlStrncasecmp(str, val, n)) return(str);
386	str++;
387	}
388	return(NULL);
389	}
390
391	/**
392	* xmlStrsub:
393	* @str: the xmlChar * array (haystack)
394	* @start: the index of the first char (zero based)
395	* @len: the length of the substring
396	*
397	* Extract a substring of a given string
398	*
399	* Returns the xmlChar * for the first occurrence or NULL.
400	*/
401
402	xmlChar *
403	xmlStrsub(const xmlChar *str, int start, int len) {
404	int i;
405
406	if (str == NULL) return(NULL);
407	if (start < 0) return(NULL);
408	if (len < 0) return(NULL);
409
410	for (i = 0;i < start;i++) {
411	if (*str == 0) return(NULL);
412	str++;
413	}
414	if (*str == 0) return(NULL);
415	return(xmlStrndup(str, len));
416	}
417
418	/**
419	* xmlStrlen:
420	* @str: the xmlChar * array
421	*
422	* length of a xmlChar's string
423	*
424	* Returns the number of xmlChar contained in the ARRAY.
425	*/
426
427	int
428	xmlStrlen(const xmlChar *str) {
429	size_t len = str ? strlen((const char *)str) : 0;
430	return(len > INT_MAX ? 0 : len);
431	}
432
433	/**
434	* xmlStrncat:
435	* @cur: the original xmlChar * array
436	* @add: the xmlChar * array added
437	* @len: the length of @add
438	*
439	* a strncat for array of xmlChar's, it will extend @cur with the len
440	* first bytes of @add. Note that if @len < 0 then this is an API error
441	* and NULL will be returned.
442	*
443	* Returns a new xmlChar *, the original @cur is reallocated and should
444	* not be freed.
445	*/
446
447	xmlChar *
448	xmlStrncat(xmlChar cur, const xmlChar add, int len) {
449	int size;
450	xmlChar *ret;
451
452	if ((add == NULL) \|\| (len == 0))
453	return(cur);
454	if (len < 0)
455	return(NULL);
456	if (cur == NULL)
457	return(xmlStrndup(add, len));
458
459	size = xmlStrlen(cur);
460	if ((size < 0) \|\| (size > INT_MAX - len))
461	return(NULL);
462	ret = (xmlChar *) xmlRealloc(cur, (size_t) size + len + 1);
463	if (ret == NULL) {
464	return(cur);
465	}
466	memcpy(&ret[size], add, len);
467	ret[size + len] = 0;
468	return(ret);
469	}
470
471	/**
472	* xmlStrncatNew:
473	* @str1: first xmlChar string
474	* @str2: second xmlChar string
475	* @len: the len of @str2 or < 0
476	*
477	* same as xmlStrncat, but creates a new string. The original
478	* two strings are not freed. If @len is < 0 then the length
479	* will be calculated automatically.
480	*
481	* Returns a new xmlChar * or NULL
482	*/
483	xmlChar *
484	xmlStrncatNew(const xmlChar str1, const xmlChar str2, int len) {
485	int size;
486	xmlChar *ret;
487
488	if (len < 0) {
489	len = xmlStrlen(str2);
490	if (len < 0)
491	return(NULL);
492	}
493	if ((str2 == NULL) \|\| (len == 0))
494	return(xmlStrdup(str1));
495	if (str1 == NULL)
496	return(xmlStrndup(str2, len));
497
498	size = xmlStrlen(str1);
499	if ((size < 0) \|\| (size > INT_MAX - len))
500	return(NULL);
501	ret = (xmlChar *) xmlMalloc((size_t) size + len + 1);
502	if (ret == NULL) {
503	return(xmlStrndup(str1, size));
504	}
505	memcpy(ret, str1, size);
506	memcpy(&ret[size], str2, len);
507	ret[size + len] = 0;
508	return(ret);
509	}
510
511	/**
512	* xmlStrcat:
513	* @cur: the original xmlChar * array
514	* @add: the xmlChar * array added
515	*
516	* a strcat for array of xmlChar's. Since they are supposed to be
517	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
518	* a termination mark of '0'.
519	*
520	* Returns a new xmlChar * containing the concatenated string. The original
521	* @cur is reallocated and should not be freed.
522	*/
523	xmlChar *
524	xmlStrcat(xmlChar cur, const xmlChar add) {
525	const xmlChar *p = add;
526
527	if (add == NULL) return(cur);
528	if (cur == NULL)
529	return(xmlStrdup(add));
530
531	while (p != 0) p++; / non input consuming */
532	return(xmlStrncat(cur, add, p - add));
533	}
534
535	/**
536	* xmlStrPrintf:
537	* @buf: the result buffer.
538	* @len: the result buffer length.
539	* @msg: the message with printf formatting.
540	* @...: extra parameters for the message.
541	*
542	* Formats @msg and places result into @buf.
543	*
544	* Returns the number of characters written to @buf or -1 if an error occurs.
545	*/
546	int
547	xmlStrPrintf(xmlChar buf, int len, const char msg, ...) {
548	va_list args;
549	int ret;
550
551	if((buf == NULL) \|\| (msg == NULL)) {
552	return(-1);
553	}
554
555	va_start(args, msg);
556	ret = vsnprintf((char ) buf, len, (const char ) msg, args);
557	va_end(args);
558	buf[len - 1] = 0; /* be safe ! */
559
560	return(ret);
561	}
562
563	/**
564	* xmlStrVPrintf:
565	* @buf: the result buffer.
566	* @len: the result buffer length.
567	* @msg: the message with printf formatting.
568	* @ap: extra parameters for the message.
569	*
570	* Formats @msg and places result into @buf.
571	*
572	* Returns the number of characters written to @buf or -1 if an error occurs.
573	*/
574	int
575	xmlStrVPrintf(xmlChar buf, int len, const char msg, va_list ap) {
576	int ret;
577
578	if((buf == NULL) \|\| (msg == NULL)) {
579	return(-1);
580	}
581
582	ret = vsnprintf((char ) buf, len, (const char ) msg, ap);
583	buf[len - 1] = 0; /* be safe ! */
584
585	return(ret);
586	}
587
588	/************************************************************************
589	* *
590	* Generic UTF8 handling routines *
591	* *
592	* From rfc2044: encoding of the Unicode values on UTF-8: *
593	* *
594	* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
595	* 0000 0000-0000 007F 0xxxxxxx *
596	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
597	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
598	* *
599	* I hope we won't use values > 0xFFFF anytime soon ! *
600	* *
601	************************************************************************/
602
603
604	/**
605	* xmlUTF8Size:
606	* @utf: pointer to the UTF8 character
607	*
608	* calculates the internal size of a UTF8 character
609	*
610	* returns the numbers of bytes in the character, -1 on format error
611	*/
612	int
613	xmlUTF8Size(const xmlChar *utf) {
614	xmlChar mask;
615	int len;
616
617	if (utf == NULL)
618	return -1;
619	if (*utf < 0x80)
620	return 1;
621	/* check valid UTF8 character */
622	if (!(*utf & 0x40))
623	return -1;
624	/* determine number of bytes in char */
625	len = 2;
626	for (mask=0x20; mask != 0; mask>>=1) {
627	if (!(*utf & mask))
628	return len;
629	len++;
630	}
631	return -1;
632	}
633
634	/**
635	* xmlUTF8Charcmp:
636	* @utf1: pointer to first UTF8 char
637	* @utf2: pointer to second UTF8 char
638	*
639	* compares the two UCS4 values
640	*
641	* returns result of the compare as with xmlStrncmp
642	*/
643	int
644	xmlUTF8Charcmp(const xmlChar utf1, const xmlChar utf2) {
645
646	if (utf1 == NULL ) {
647	if (utf2 == NULL)
648	return 0;
649	return -1;
650	}
651	return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
652	}
653
654	/**
655	* xmlUTF8Strlen:
656	* @utf: a sequence of UTF-8 encoded bytes
657	*
658	* compute the length of an UTF8 string, it doesn't do a full UTF8
659	* checking of the content of the string.
660	*
661	* Returns the number of characters in the string or -1 in case of error
662	*/
663	int
664	xmlUTF8Strlen(const xmlChar *utf) {
665	size_t ret = 0;
666
667	if (utf == NULL)
668	return(-1);
669
670	while (*utf != 0) {
671	if (utf[0] & 0x80) {
672	if ((utf[1] & 0xc0) != 0x80)
673	return(-1);
674	if ((utf[0] & 0xe0) == 0xe0) {
675	if ((utf[2] & 0xc0) != 0x80)
676	return(-1);
677	if ((utf[0] & 0xf0) == 0xf0) {
678	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
679	return(-1);
680	utf += 4;
681	} else {
682	utf += 3;
683	}
684	} else {
685	utf += 2;
686	}
687	} else {
688	utf++;
689	}
690	ret++;
691	}
692	return(ret > INT_MAX ? 0 : ret);
693	}
694
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Overlong 2-, 3- and 4-byte
 * forms, surrogates (0xD800..0xDFFF) and values of 0x110000 and above
 * are rejected.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;

    c = utf[0];
    if (c < 0x80) {
        if (*len < 1)
            goto error;
        /* 1-byte code */
        *len = 1;
    } else {
        if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
            goto error;
        if (c < 0xe0) {
            /* 0x80..0xbf are trailing bytes, 0xc0/0xc1 are overlong */
            if (c < 0xc2)
                goto error;
            /* 2-byte code */
            *len = 2;
            c = (c & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        } else {
            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;
            if (c < 0xf0) {
                /* 3-byte code */
                *len = 3;
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* overlong form or surrogate */
                if ((c < 0x800) || ((c >= 0xd800) && (c < 0xe000)))
                    goto error;
            } else {
                if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (c & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
                /* overlong form or beyond the Unicode range */
                if ((c < 0x10000) || (c >= 0x110000))
                    goto error;
            }
        }
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
764
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise (including when
 * @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    while ((c = utf[0])) {      /* string is 0-terminated */
        ix = 0;
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            ix = 1;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((utf[1] & 0xc0 ) != 0x80)
                return 0;
            ix = 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            /* || short-circuits, so a 0 byte stops the look-ahead */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80))
                    return 0;
            ix = 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80) ||
                ((utf[3] & 0xc0) != 0x80))
                    return 0;
            ix = 4;
        } else                          /* unknown encoding */
            return 0;
        utf += ix;
    }
    return(1);
}
819
820	/**
821	* xmlUTF8Strsize:
822	* @utf: a sequence of UTF-8 encoded bytes
823	* @len: the number of characters in the array
824	*
825	* storage size of an UTF8 string
826	* the behaviour is not guaranteed if the input string is not UTF-8
827	*
828	* Returns the storage size of
829	* the first 'len' characters of ARRAY
830	*/
831
832	int
833	xmlUTF8Strsize(const xmlChar *utf, int len) {
834	const xmlChar *ptr=utf;
835	int ch;
836	size_t ret;
837
838	if (utf == NULL)
839	return(0);
840
841	if (len <= 0)
842	return(0);
843
844	while ( len-- > 0) {
845	if ( !*ptr )
846	break;
847	if ( (ch = *ptr++) & 0x80)
848	while ((ch<<=1) & 0x80 ) {
849	if (*ptr == 0) break;
850	ptr++;
851	}
852	}
853	ret = ptr - utf;
854	return (ret > INT_MAX ? 0 : ret);
855	}
856
857
858	/**
859	* xmlUTF8Strndup:
860	* @utf: the input UTF8 *
861	* @len: the len of @utf (in chars)
862	*
863	* a strndup for array of UTF8's
864	*
865	* Returns a new UTF8 * or NULL
866	*/
867	xmlChar *
868	xmlUTF8Strndup(const xmlChar *utf, int len) {
869	xmlChar *ret;
870	int i;
871
872	if ((utf == NULL) \|\| (len < 0)) return(NULL);
873	i = xmlUTF8Strsize(utf, len);
874	ret = (xmlChar *) xmlMallocAtomic((size_t) i + 1);
875	if (ret == NULL) {
876	return(NULL);
877	}
878	memcpy(ret, utf, i);
879	ret[i] = 0;
880	return(ret);
881	}
882
883	/**
884	* xmlUTF8Strpos:
885	* @utf: the input UTF8 *
886	* @pos: the position of the desired UTF8 char (in chars)
887	*
888	* a function to provide the equivalent of fetching a
889	* character from a string array
890	*
891	* Returns a pointer to the UTF8 character or NULL
892	*/
893	const xmlChar *
894	xmlUTF8Strpos(const xmlChar *utf, int pos) {
895	int ch;
896
897	if (utf == NULL) return(NULL);
898	if (pos < 0)
899	return(NULL);
900	while (pos--) {
901	if ((ch=*utf++) == 0) return(NULL);
902	if ( ch & 0x80 ) {
903	/* if not simple ascii, verify proper format */
904	if ( (ch & 0xc0) != 0xc0 )
905	return(NULL);
906	/* then skip over remaining bytes for this char */
907	while ( (ch <<= 1) & 0x80 )
908	if ( (*utf++ & 0xc0) != 0x80 )
909	return(NULL);
910	}
911	}
912	return((xmlChar *)utf);
913	}
914
915	/**
916	* xmlUTF8Strloc:
917	* @utf: the input UTF8 *
918	* @utfchar: the UTF8 character to be found
919	*
920	* a function to provide the relative location of a UTF8 char
921	*
922	* Returns the relative character position of the desired char
923	* or -1 if not found
924	*/
925	int
926	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
927	size_t i;
928	int size;
929	int ch;
930
931	if (utf==NULL \|\| utfchar==NULL) return -1;
932	size = xmlUTF8Strsize(utfchar, 1);
933	for(i=0; (ch=*utf) != 0; i++) {
934	if (xmlStrncmp(utf, utfchar, size)==0)
935	return(i > INT_MAX ? 0 : i);
936	utf++;
937	if ( ch & 0x80 ) {
938	/* if not simple ascii, verify proper format */
939	if ( (ch & 0xc0) != 0xc0 )
940	return(-1);
941	/* then skip over remaining bytes for this char */
942	while ( (ch <<= 1) & 0x80 )
943	if ( (*utf++ & 0xc0) != 0x80 )
944	return(-1);
945	}
946	}
947
948	return(-1);
949	}
950	/**
951	* xmlUTF8Strsub:
952	* @utf: a sequence of UTF-8 encoded bytes
953	* @start: relative pos of first char
954	* @len: total number to copy
955	*
956	* Create a substring from a given UTF-8 string
957	* Note: positions are given in units of UTF-8 chars
958	*
959	* Returns a pointer to a newly created string
960	* or NULL if any problem
961	*/
962
963	xmlChar *
964	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
965	int i;
966	int ch;
967
968	if (utf == NULL) return(NULL);
969	if (start < 0) return(NULL);
970	if (len < 0) return(NULL);
971
972	/*
973	* Skip over any leading chars
974	*/
975	for (i = 0;i < start;i++) {
976	if ((ch=*utf++) == 0) return(NULL);
977	if ( ch & 0x80 ) {
978	/* if not simple ascii, verify proper format */
979	if ( (ch & 0xc0) != 0xc0 )
980	return(NULL);
981	/* then skip over remaining bytes for this char */
982	while ( (ch <<= 1) & 0x80 )
983	if ( (*utf++ & 0xc0) != 0x80 )
984	return(NULL);
985	}
986	}
987
988	return(xmlUTF8Strndup(utf, len));
989	}
990
991	/**
992	* xmlEscapeFormatString:
993	* @msg: a pointer to the string in which to escape '%' characters.
994	* Must be a heap-allocated buffer created by libxml2 that may be
995	* returned, or that may be freed and replaced.
996	*
997	* Replaces the string pointed to by 'msg' with an escaped string.
998	* Returns the same string with all '%' characters escaped.
999	*/
1000	xmlChar *
1001	xmlEscapeFormatString(xmlChar **msg)
1002	{
1003	xmlChar *msgPtr = NULL;
1004	xmlChar *result = NULL;
1005	xmlChar *resultPtr = NULL;
1006	size_t count = 0;
1007	size_t msgLen = 0;
1008	size_t resultLen = 0;
1009
1010	if (!msg \|\| !*msg)
1011	return(NULL);
1012
1013	for (msgPtr = msg; msgPtr != '\0'; ++msgPtr) {
1014	++msgLen;
1015	if (*msgPtr == '%')
1016	++count;
1017	}
1018
1019	if (count == 0)
1020	return(*msg);
1021
1022	if ((count > INT_MAX) \|\| (msgLen > INT_MAX - count))
1023	return(NULL);
1024	resultLen = msgLen + count + 1;
1025	result = (xmlChar *) xmlMallocAtomic(resultLen);
1026	if (result == NULL) {
1027	/* Clear *msg to prevent format string vulnerabilities in
1028	out-of-memory situations. */
1029	xmlFree(*msg);
1030	*msg = NULL;
1031	return(NULL);
1032	}
1033
1034	for (msgPtr = msg, resultPtr = result; msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1035	resultPtr = msgPtr;
1036	if (*msgPtr == '%')
1037	*(++resultPtr) = '%';
1038	}
1039	result[resultLen - 1] = '\0';
1040
1041	xmlFree(*msg);
1042	*msg = result;
1043
1044	return *msg;
1045	}
1046

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vbox/trunk/src/libs/libxml2-2.12.6/xmlstring.c

Download in other formats: