Context Navigation

HTMLparser.c

Last change on this file was 104106, checked in by vboxsync, 2 months ago
libxml2-2.9.14: Applied and adjusted our libxml2 changes to 2.9.14. bugref:10640
Property svn:eol-style set to `native`
File size: 200.7 KB

Line
1	/*
2	* HTMLparser.c : an HTML 4.0 non-verifying parser
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9	#define IN_LIBXML
10	#include "libxml.h"
11	#ifdef LIBXML_HTML_ENABLED
12
13	#include <string.h>
14	#include <ctype.h>
15	#include <stdlib.h>
16
17	#include <libxml/HTMLparser.h>
18	#include <libxml/xmlmemory.h>
19	#include <libxml/tree.h>
20	#include <libxml/parser.h>
21	#include <libxml/parserInternals.h>
22	#include <libxml/xmlerror.h>
23	#include <libxml/HTMLtree.h>
24	#include <libxml/entities.h>
25	#include <libxml/encoding.h>
26	#include <libxml/xmlIO.h>
27	#include <libxml/uri.h>
28
29	#include "private/buf.h"
30	#include "private/enc.h"
31	#include "private/error.h"
32	#include "private/html.h"
33	#include "private/io.h"
34	#include "private/parser.h"
35	#include "private/tree.h"
36
37	#define HTML_MAX_NAMELEN 1000
38	#define HTML_PARSER_BIG_BUFFER_SIZE 1000
39	#define HTML_PARSER_BUFFER_SIZE 100
40
41	static int htmlOmittedDefaultValue = 1;
42
43	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44	xmlChar end, xmlChar end2, xmlChar end3);
45	static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
/************************************************************************
 *                                                                      *
 *                Some factorized error routines                        *
 *                                                                      *
 ************************************************************************/
52
53	/**
54	* htmlErrMemory:
55	* @ctxt: an HTML parser context
56	* @extra: extra information
57	*
58	* Handle a redefinition of attribute error
59	*/
60	static void
61	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
62	{
63	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
64	(ctxt->instate == XML_PARSER_EOF))
65	return;
66	if (ctxt != NULL) {
67	ctxt->errNo = XML_ERR_NO_MEMORY;
68	ctxt->instate = XML_PARSER_EOF;
69	ctxt->disableSAX = 1;
70	}
71	if (extra)
72	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
73	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
74	NULL, NULL, 0, 0,
75	"Memory allocation failed : %s\n", extra);
76	else
77	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
78	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
79	NULL, NULL, 0, 0, "Memory allocation failed\n");
80	}
81
82	/**
83	* htmlParseErr:
84	* @ctxt: an HTML parser context
85	* @error: the error number
86	* @msg: the error message
87	* @str1: string infor
88	* @str2: string infor
89	*
90	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
91	*/
92	static void LIBXML_ATTR_FORMAT(3,0)
93	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
94	const char msg, const xmlChar str1, const xmlChar *str2)
95	{
96	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
97	(ctxt->instate == XML_PARSER_EOF))
98	return;
99	if (ctxt != NULL)
100	ctxt->errNo = error;
101	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
102	XML_ERR_ERROR, NULL, 0,
103	(const char ) str1, (const char ) str2,
104	NULL, 0, 0,
105	msg, str1, str2);
106	if (ctxt != NULL)
107	ctxt->wellFormed = 0;
108	}
109
110	/**
111	* htmlParseErrInt:
112	* @ctxt: an HTML parser context
113	* @error: the error number
114	* @msg: the error message
115	* @val: integer info
116	*
117	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
118	*/
119	static void LIBXML_ATTR_FORMAT(3,0)
120	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
121	const char *msg, int val)
122	{
123	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
124	(ctxt->instate == XML_PARSER_EOF))
125	return;
126	if (ctxt != NULL)
127	ctxt->errNo = error;
128	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
129	XML_ERR_ERROR, NULL, 0, NULL, NULL,
130	NULL, val, 0, msg, val);
131	if (ctxt != NULL)
132	ctxt->wellFormed = 0;
133	}
134
/************************************************************************
 *                                                                      *
 *                Parser stacks related functions and macros            *
 *                                                                      *
 ************************************************************************/
140
141	/**
142	* htmlnamePush:
143	* @ctxt: an HTML parser context
144	* @value: the element name
145	*
146	* Pushes a new element name on top of the name stack
147	*
148	* Returns -1 in case of error, the index in the stack otherwise
149	*/
150	static int
151	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
152	{
153	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
154	ctxt->html = 3;
155	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
156	ctxt->html = 10;
157	if (ctxt->nameNr >= ctxt->nameMax) {
158	size_t newSize = ctxt->nameMax * 2;
159	const xmlChar **tmp;
160
161	tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
162	newSize * sizeof(ctxt->nameTab[0]));
163	if (tmp == NULL) {
164	htmlErrMemory(ctxt, NULL);
165	return (-1);
166	}
167	ctxt->nameTab = tmp;
168	ctxt->nameMax = newSize;
169	}
170	ctxt->nameTab[ctxt->nameNr] = value;
171	ctxt->name = value;
172	return (ctxt->nameNr++);
173	}
174	/**
175	* htmlnamePop:
176	* @ctxt: an HTML parser context
177	*
178	* Pops the top element name from the name stack
179	*
180	* Returns the name just removed
181	*/
182	static const xmlChar *
183	htmlnamePop(htmlParserCtxtPtr ctxt)
184	{
185	const xmlChar *ret;
186
187	if (ctxt->nameNr <= 0)
188	return (NULL);
189	ctxt->nameNr--;
190	if (ctxt->nameNr < 0)
191	return (NULL);
192	if (ctxt->nameNr > 0)
193	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
194	else
195	ctxt->name = NULL;
196	ret = ctxt->nameTab[ctxt->nameNr];
197	ctxt->nameTab[ctxt->nameNr] = NULL;
198	return (ret);
199	}
200
201	/**
202	* htmlNodeInfoPush:
203	* @ctxt: an HTML parser context
204	* @value: the node info
205	*
206	* Pushes a new element name on top of the node info stack
207	*
208	* Returns 0 in case of error, the index in the stack otherwise
209	*/
210	static int
211	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
212	{
213	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
214	if (ctxt->nodeInfoMax == 0)
215	ctxt->nodeInfoMax = 5;
216	ctxt->nodeInfoMax *= 2;
217	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
218	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
219	ctxt->nodeInfoMax *
220	sizeof(ctxt->nodeInfoTab[0]));
221	if (ctxt->nodeInfoTab == NULL) {
222	htmlErrMemory(ctxt, NULL);
223	return (0);
224	}
225	}
226	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
227	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
228	return (ctxt->nodeInfoNr++);
229	}
230
231	/**
232	* htmlNodeInfoPop:
233	* @ctxt: an HTML parser context
234	*
235	* Pops the top element name from the node info stack
236	*
237	* Returns 0 in case of error, the pointer to NodeInfo otherwise
238	*/
239	static htmlParserNodeInfo *
240	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
241	{
242	if (ctxt->nodeInfoNr <= 0)
243	return (NULL);
244	ctxt->nodeInfoNr--;
245	if (ctxt->nodeInfoNr < 0)
246	return (NULL);
247	if (ctxt->nodeInfoNr > 0)
248	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
249	else
250	ctxt->nodeInfo = NULL;
251	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
252	}
253
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named `ctxt` (the parser context) to be in
 * scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * Statement-like macros (SHRINK, GROW, NEXTL, COPY_BUF) are wrapped in
 * do { } while (0) so they behave as a single statement, including when
 * followed by `else`; they must be terminated with a semicolon.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Advance the cursor and the column counter by the same amount. */
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Call xmlParserShrink() once more than 2 * INPUT_CHUNK bytes were consumed
 * and fewer than 2 * INPUT_CHUNK bytes remain.
 */
#define SHRINK do { \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt); \
  } while (0)

/*
 * Call xmlParserGrow() when not in progressive mode and fewer than
 * INPUT_CHUNK bytes remain.
 */
#define GROW do { \
    if ((ctxt->progressive == 0) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt); \
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, else the current byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/* Update line/column for the current byte, clear the token, advance by l. */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

/* Append value v of encoded length l to buffer b at index i, advancing i. */
#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
328
329	/**
330	* htmlFindEncoding:
331	* @the HTML parser context
332	*
333	* Ty to find and encoding in the current data available in the input
334	* buffer this is needed to try to switch to the proper encoding when
335	* one face a character error.
336	* That's an heuristic, since it's operating outside of parsing it could
337	* try to use a meta which had been commented out, that's the reason it
338	* should only be used in case of error, not as a default.
339	*
340	* Returns an encoding string or NULL if not found, the string need to
341	* be freed
342	*/
343	static xmlChar *
344	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
345	const xmlChar start, cur, *end;
346
347	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
348	(ctxt->input->flags & XML_INPUT_HAS_ENCODING))
349	return(NULL);
350	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
351	return(NULL);
352
353	start = ctxt->input->cur;
354	end = ctxt->input->end;
355	/* we also expect the input buffer to be zero terminated */
356	if (*end != 0)
357	return(NULL);
358
359	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
360	if (cur == NULL)
361	return(NULL);
362	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
363	if (cur == NULL)
364	return(NULL);
365	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
366	if (cur == NULL)
367	return(NULL);
368	cur += 8;
369	start = cur;
370	while (((cur >= 'A') && (cur <= 'Z')) \|\|
371	((cur >= 'a') && (cur <= 'z')) \|\|
372	((cur >= '0') && (cur <= '9')) \|\|
373	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
374	cur++;
375	if (cur == start)
376	return(NULL);
377	return(xmlStrndup(start, cur - start));
378	}
379
380	/**
381	* htmlCurrentChar:
382	* @ctxt: the HTML parser context
383	* @len: pointer to the length of the char read
384	*
385	* The current char value, if using UTF-8 this may actually span multiple
386	* bytes in the input buffer. Implement the end of line normalization:
387	* 2.11 End-of-Line Handling
388	* If the encoding is unspecified, in the case we find an ISO-Latin-1
389	* char, then the encoding converter is plugged in automatically.
390	*
391	* Returns the current char value and its length
392	*/
393
394	static int
395	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
396	const unsigned char *cur;
397	unsigned char c;
398	unsigned int val;
399
400	if (ctxt->instate == XML_PARSER_EOF)
401	return(0);
402
403	if (ctxt->token != 0) {
404	*len = 0;
405	return(ctxt->token);
406	}
407
408	if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
409	xmlParserGrow(ctxt);
410	if (ctxt->instate == XML_PARSER_EOF)
411	return(0);
412	}
413
414	if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
415	xmlChar * guess;
416	xmlCharEncodingHandlerPtr handler;
417
418	/*
419	* Assume it's a fixed length encoding (1) with
420	* a compatible encoding for the ASCII set, since
421	* HTML constructs only use < 128 chars
422	*/
423	if (*ctxt->input->cur < 0x80) {
424	*len = 1;
425	if ((*ctxt->input->cur == 0) &&
426	(ctxt->input->cur < ctxt->input->end)) {
427	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
428	"Char 0x%X out of allowed range\n", 0);
429	return(' ');
430	}
431	return(*ctxt->input->cur);
432	}
433
434	/*
435	* Humm this is bad, do an automatic flow conversion
436	*/
437	guess = htmlFindEncoding(ctxt);
438	if (guess == NULL) {
439	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
440	} else {
441	handler = xmlFindCharEncodingHandler((const char *) guess);
442	if (handler != NULL) {
443	/*
444	* Don't use UTF-8 encoder which isn't required and
445	* can produce invalid UTF-8.
446	*/
447	if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448	xmlSwitchToEncoding(ctxt, handler);
449	} else {
450	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451	"Unsupported encoding %s", guess, NULL);
452	}
453	xmlFree(guess);
454	}
455	ctxt->input->flags \|= XML_INPUT_HAS_ENCODING;
456	}
457
458	/*
459	* We are supposed to handle UTF8, check it's valid
460	* From rfc2044: encoding of the Unicode values on UTF-8:
461	*
462	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
463	* 0000 0000-0000 007F 0xxxxxxx
464	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
465	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
466	*
467	* Check for the 0x110000 limit too
468	*/
469	cur = ctxt->input->cur;
470	c = *cur;
471	if (c & 0x80) {
472	size_t avail;
473
474	if ((c & 0x40) == 0)
475	goto encoding_error;
476
477	avail = ctxt->input->end - ctxt->input->cur;
478
479	if ((avail < 2) \|\| ((cur[1] & 0xc0) != 0x80))
480	goto encoding_error;
481	if ((c & 0xe0) == 0xe0) {
482	if ((avail < 3) \|\| ((cur[2] & 0xc0) != 0x80))
483	goto encoding_error;
484	if ((c & 0xf0) == 0xf0) {
485	if (((c & 0xf8) != 0xf0) \|\|
486	(avail < 4) \|\| ((cur[3] & 0xc0) != 0x80))
487	goto encoding_error;
488	/* 4-byte code */
489	*len = 4;
490	val = (cur[0] & 0x7) << 18;
491	val \|= (cur[1] & 0x3f) << 12;
492	val \|= (cur[2] & 0x3f) << 6;
493	val \|= cur[3] & 0x3f;
494	if (val < 0x10000)
495	goto encoding_error;
496	} else {
497	/* 3-byte code */
498	*len = 3;
499	val = (cur[0] & 0xf) << 12;
500	val \|= (cur[1] & 0x3f) << 6;
501	val \|= cur[2] & 0x3f;
502	if (val < 0x800)
503	goto encoding_error;
504	}
505	} else {
506	/* 2-byte code */
507	*len = 2;
508	val = (cur[0] & 0x1f) << 6;
509	val \|= cur[1] & 0x3f;
510	if (val < 0x80)
511	goto encoding_error;
512	}
513	if (!IS_CHAR(val)) {
514	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
515	"Char 0x%X out of allowed range\n", val);
516	}
517	return(val);
518	} else {
519	if ((*ctxt->input->cur == 0) &&
520	(ctxt->input->cur < ctxt->input->end)) {
521	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522	"Char 0x%X out of allowed range\n", 0);
523	*len = 1;
524	return(' ');
525	}
526	/* 1-byte code */
527	*len = 1;
528	return(*ctxt->input->cur);
529	}
530
531	encoding_error:
532	{
533	char buffer[150];
534
535	if (ctxt->input->end - ctxt->input->cur >= 4) {
536	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
537	ctxt->input->cur[0], ctxt->input->cur[1],
538	ctxt->input->cur[2], ctxt->input->cur[3]);
539	} else {
540	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
541	}
542	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
543	"Input is not proper UTF-8, indicate encoding !\n",
544	BAD_CAST buffer, NULL);
545	}
546
547	if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
548	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549	*len = 1;
550	return(*ctxt->input->cur);
551	}
552
553	/**
554	* htmlSkipBlankChars:
555	* @ctxt: the HTML parser context
556	*
557	* skip all blanks character found at that point in the input streams.
558	*
559	* Returns the number of space chars skipped
560	*/
561
562	static int
563	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
564	int res = 0;
565
566	while (IS_BLANK_CH(*(ctxt->input->cur))) {
567	if (*(ctxt->input->cur) == '\n') {
568	ctxt->input->line++; ctxt->input->col = 1;
569	} else ctxt->input->col++;
570	ctxt->input->cur++;
571	if (*ctxt->input->cur == 0)
572	xmlParserGrow(ctxt);
573	if (res < INT_MAX)
574	res++;
575	}
576	return(res);
577	}
578
579
580
/************************************************************************
 *                                                                      *
 *                The list of HTML elements and their properties        *
 *                                                                      *
 ************************************************************************/
586
587	/*
588	* Start Tag: 1 means the start tag can be omitted
589	* End Tag: 1 means the end tag can be omitted
590	* 2 means it's forbidden (empty elements)
591	* 3 means the tag is stylistic and should be closed easily
592	* Depr: this element is deprecated
593	* DTD: 1 means that this element is valid only in the Loose DTD
594	* 2 means that this element is valid only in the Frameset DTD
595	*
596	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
597	, subElements , impliedsubelt , Attributes, userdata
598	*/
599
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of element names;
 * the matching NB_* macro is the number of names in that list. The NB_*
 * sums are parenthesized so they can be used safely inside larger
 * expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated element name lists */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
633
634
/* ... and for HTML Attributes */

/*
 * Each group macro expands to a comma-separated list of attribute names;
 * the matching NB_* macro is the number of names in that list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
654
655
656	/* Other declarations that should go inline ... */
657	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
658	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
659	"tabindex", "onfocus", "onblur", NULL } ;
660	static const char* const target_attr[] = { "target", NULL } ;
661	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
662	static const char* const alt_attr[] = { "alt", NULL } ;
663	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
664	static const char* const href_attrs[] = { "href", NULL } ;
665	static const char* const clear_attrs[] = { "clear", NULL } ;
666	static const char* const inline_p[] = { INLINE, "p", NULL } ;
667
668	static const char* const flow_param[] = { FLOW, "param", NULL } ;
669	static const char* const applet_attrs[] = { COREATTRS , "codebase",
670	"archive", "alt", "name", "height", "width", "align",
671	"hspace", "vspace", NULL } ;
672	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
673	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
674	static const char* const basefont_attrs[] =
675	{ "id", "size", "color", "face", NULL } ;
676	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
677	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
678	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
679	static const char* const body_depr[] = { "background", "bgcolor", "text",
680	"link", "vlink", "alink", NULL } ;
681	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
682	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
683
684
685	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
686	static const char* const col_elt[] = { "col", NULL } ;
687	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
688	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
689	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
690	static const char* const compact_attr[] = { "compact", NULL } ;
691	static const char* const label_attr[] = { "label", NULL } ;
692	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
693	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
694	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
695	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
696	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
697	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
698	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
699	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
700	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
701	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
702	static const char* const version_attr[] = { "version", NULL } ;
703	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
704	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
705	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
706	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
707	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
708	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
709	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
710	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
711	static const char* const align_attr[] = { "align", NULL } ;
712	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
713	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
714	static const char* const name_attr[] = { "name", NULL } ;
715	static const char* const action_attr[] = { "action", NULL } ;
716	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
717	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
718	static const char* const content_attr[] = { "content", NULL } ;
719	static const char* const type_attr[] = { "type", NULL } ;
720	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
721	static const char* const object_contents[] = { FLOW, "param", NULL } ;
722	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
723	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
724	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
725	static const char* const option_elt[] = { "option", NULL } ;
726	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
727	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
728	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
729	static const char* const width_attr[] = { "width", NULL } ;
730	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
731	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
732	static const char* const language_attr[] = { "language", NULL } ;
733	static const char* const select_content[] = { "optgroup", "option", NULL } ;
734	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
735	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
736	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
737	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
738	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
739	static const char* const tr_elt[] = { "tr", NULL } ;
740	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
741	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
742	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
743	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
744	static const char* const tr_contents[] = { "th", "td", NULL } ;
745	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
746	static const char* const li_elt[] = { "li", NULL } ;
747	static const char* const ul_depr[] = { "type", "compact", NULL} ;
748	static const char* const dir_attr[] = { "dir", NULL} ;
749
750	#define DECL (const char**)
751
752	static const htmlElemDesc
753	html40ElementTable[] = {
754	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
755	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
756	},
757	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
758	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
759	},
760	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
761	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
762	},
763	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
764	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
765	},
766	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
767	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
768	},
769	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
770	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
771	},
772	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
773	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
774	},
775	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
776	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
777	},
778	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
779	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
780	},
781	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
782	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
783	},
784	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
785	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786	},
787	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
788	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
789	},
790	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
791	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
792	},
793	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
794	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
795	},
796	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
797	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
798	},
799	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
800	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801	},
802	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
803	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
804	},
805	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
806	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
807	},
808	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
809	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
810	},
811	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
812	EMPTY , NULL , DECL col_attrs , NULL, NULL
813	},
814	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
815	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
816	},
817	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
818	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
819	},
820	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
821	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
822	},
823	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
824	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
825	},
826	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
827	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
828	},
829	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
830	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
831	},
832	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
833	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
834	},
835	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
836	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837	},
838	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
839	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840	},
841	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
842	EMPTY, NULL, DECL embed_attrs, NULL, NULL
843	},
844	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
845	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
846	},
847	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
848	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
849	},
850	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
851	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
852	},
853	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
854	EMPTY, NULL, NULL, DECL frame_attrs, NULL
855	},
856	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
857	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
858	},
859	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
860	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
861	},
862	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
863	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
864	},
865	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
866	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
867	},
868	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
869	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
870	},
871	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
872	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
873	},
874	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
875	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
876	},
877	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
878	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
879	},
880	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
881	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
882	},
883	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
884	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
885	},
886	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
887	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
888	},
889	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
890	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
891	},
892	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
893	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
894	},
895	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
896	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
897	},
898	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
899	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
900	},
901	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
902	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
903	},
904	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
905	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906	},
907	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
908	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
909	},
910	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
911	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
912	},
913	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
914	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
915	},
916	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
917	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
918	},
919	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
920	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
921	},
922	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
923	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
924	},
925	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
926	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
927	},
928	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
929	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
930	},
931	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
932	DECL html_flow, "div", DECL html_attrs, NULL, NULL
933	},
934	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
935	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
936	},
937	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
938	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
939	},
940	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
941	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
942	},
943	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
944	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
945	},
946	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
947	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
948	},
949	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
950	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
951	},
952	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
953	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
954	},
955	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
956	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
957	},
958	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
959	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
960	},
961	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
962	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
963	},
964	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
965	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
966	},
967	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
968	DECL select_content, NULL, DECL select_attrs, NULL, NULL
969	},
970	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
971	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
972	},
973	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
974	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975	},
976	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
977	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
978	},
979	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
980	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
981	},
982	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
983	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
984	},
985	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
986	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
987	},
988	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
989	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990	},
991	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
992	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
993	},
994	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
995	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
996	},
997	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
998	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
999	},
1000	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1001	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1002	},
1003	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1004	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1005	},
1006	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1007	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1008	},
1009	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1010	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1011	},
1012	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1013	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1014	},
1015	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1016	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1017	},
1018	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1019	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1020	},
1021	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1022	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1023	},
1024	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1025	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1026	},
1027	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1028	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1029	}
1030	};
1031
/*
 * One auto-close rule: an open element named oldTag is implicitly
 * ended when a start tag named newTag is encountered.
 */
typedef struct {
    const char *oldTag;     /* name of the currently open element */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;
1036
1037	/*
1038	* start tags that imply the end of current element
1039	*/
1040	static const htmlStartCloseEntry htmlStartClose[] = {
1041	{ "a", "a" },
1042	{ "a", "fieldset" },
1043	{ "a", "table" },
1044	{ "a", "td" },
1045	{ "a", "th" },
1046	{ "address", "dd" },
1047	{ "address", "dl" },
1048	{ "address", "dt" },
1049	{ "address", "form" },
1050	{ "address", "li" },
1051	{ "address", "ul" },
1052	{ "b", "center" },
1053	{ "b", "p" },
1054	{ "b", "td" },
1055	{ "b", "th" },
1056	{ "big", "p" },
1057	{ "caption", "col" },
1058	{ "caption", "colgroup" },
1059	{ "caption", "tbody" },
1060	{ "caption", "tfoot" },
1061	{ "caption", "thead" },
1062	{ "caption", "tr" },
1063	{ "col", "col" },
1064	{ "col", "colgroup" },
1065	{ "col", "tbody" },
1066	{ "col", "tfoot" },
1067	{ "col", "thead" },
1068	{ "col", "tr" },
1069	{ "colgroup", "colgroup" },
1070	{ "colgroup", "tbody" },
1071	{ "colgroup", "tfoot" },
1072	{ "colgroup", "thead" },
1073	{ "colgroup", "tr" },
1074	{ "dd", "dt" },
1075	{ "dir", "dd" },
1076	{ "dir", "dl" },
1077	{ "dir", "dt" },
1078	{ "dir", "form" },
1079	{ "dir", "ul" },
1080	{ "dl", "form" },
1081	{ "dl", "li" },
1082	{ "dt", "dd" },
1083	{ "dt", "dl" },
1084	{ "font", "center" },
1085	{ "font", "td" },
1086	{ "font", "th" },
1087	{ "form", "form" },
1088	{ "h1", "fieldset" },
1089	{ "h1", "form" },
1090	{ "h1", "li" },
1091	{ "h1", "p" },
1092	{ "h1", "table" },
1093	{ "h2", "fieldset" },
1094	{ "h2", "form" },
1095	{ "h2", "li" },
1096	{ "h2", "p" },
1097	{ "h2", "table" },
1098	{ "h3", "fieldset" },
1099	{ "h3", "form" },
1100	{ "h3", "li" },
1101	{ "h3", "p" },
1102	{ "h3", "table" },
1103	{ "h4", "fieldset" },
1104	{ "h4", "form" },
1105	{ "h4", "li" },
1106	{ "h4", "p" },
1107	{ "h4", "table" },
1108	{ "h5", "fieldset" },
1109	{ "h5", "form" },
1110	{ "h5", "li" },
1111	{ "h5", "p" },
1112	{ "h5", "table" },
1113	{ "h6", "fieldset" },
1114	{ "h6", "form" },
1115	{ "h6", "li" },
1116	{ "h6", "p" },
1117	{ "h6", "table" },
1118	{ "head", "a" },
1119	{ "head", "abbr" },
1120	{ "head", "acronym" },
1121	{ "head", "address" },
1122	{ "head", "b" },
1123	{ "head", "bdo" },
1124	{ "head", "big" },
1125	{ "head", "blockquote" },
1126	{ "head", "body" },
1127	{ "head", "br" },
1128	{ "head", "center" },
1129	{ "head", "cite" },
1130	{ "head", "code" },
1131	{ "head", "dd" },
1132	{ "head", "dfn" },
1133	{ "head", "dir" },
1134	{ "head", "div" },
1135	{ "head", "dl" },
1136	{ "head", "dt" },
1137	{ "head", "em" },
1138	{ "head", "fieldset" },
1139	{ "head", "font" },
1140	{ "head", "form" },
1141	{ "head", "frameset" },
1142	{ "head", "h1" },
1143	{ "head", "h2" },
1144	{ "head", "h3" },
1145	{ "head", "h4" },
1146	{ "head", "h5" },
1147	{ "head", "h6" },
1148	{ "head", "hr" },
1149	{ "head", "i" },
1150	{ "head", "iframe" },
1151	{ "head", "img" },
1152	{ "head", "kbd" },
1153	{ "head", "li" },
1154	{ "head", "listing" },
1155	{ "head", "map" },
1156	{ "head", "menu" },
1157	{ "head", "ol" },
1158	{ "head", "p" },
1159	{ "head", "pre" },
1160	{ "head", "q" },
1161	{ "head", "s" },
1162	{ "head", "samp" },
1163	{ "head", "small" },
1164	{ "head", "span" },
1165	{ "head", "strike" },
1166	{ "head", "strong" },
1167	{ "head", "sub" },
1168	{ "head", "sup" },
1169	{ "head", "table" },
1170	{ "head", "tt" },
1171	{ "head", "u" },
1172	{ "head", "ul" },
1173	{ "head", "var" },
1174	{ "head", "xmp" },
1175	{ "hr", "form" },
1176	{ "i", "center" },
1177	{ "i", "p" },
1178	{ "i", "td" },
1179	{ "i", "th" },
1180	{ "legend", "fieldset" },
1181	{ "li", "li" },
1182	{ "link", "body" },
1183	{ "link", "frameset" },
1184	{ "listing", "dd" },
1185	{ "listing", "dl" },
1186	{ "listing", "dt" },
1187	{ "listing", "fieldset" },
1188	{ "listing", "form" },
1189	{ "listing", "li" },
1190	{ "listing", "table" },
1191	{ "listing", "ul" },
1192	{ "menu", "dd" },
1193	{ "menu", "dl" },
1194	{ "menu", "dt" },
1195	{ "menu", "form" },
1196	{ "menu", "ul" },
1197	{ "ol", "form" },
1198	{ "option", "optgroup" },
1199	{ "option", "option" },
1200	{ "p", "address" },
1201	{ "p", "blockquote" },
1202	{ "p", "body" },
1203	{ "p", "caption" },
1204	{ "p", "center" },
1205	{ "p", "col" },
1206	{ "p", "colgroup" },
1207	{ "p", "dd" },
1208	{ "p", "dir" },
1209	{ "p", "div" },
1210	{ "p", "dl" },
1211	{ "p", "dt" },
1212	{ "p", "fieldset" },
1213	{ "p", "form" },
1214	{ "p", "frameset" },
1215	{ "p", "h1" },
1216	{ "p", "h2" },
1217	{ "p", "h3" },
1218	{ "p", "h4" },
1219	{ "p", "h5" },
1220	{ "p", "h6" },
1221	{ "p", "head" },
1222	{ "p", "hr" },
1223	{ "p", "li" },
1224	{ "p", "listing" },
1225	{ "p", "menu" },
1226	{ "p", "ol" },
1227	{ "p", "p" },
1228	{ "p", "pre" },
1229	{ "p", "table" },
1230	{ "p", "tbody" },
1231	{ "p", "td" },
1232	{ "p", "tfoot" },
1233	{ "p", "th" },
1234	{ "p", "title" },
1235	{ "p", "tr" },
1236	{ "p", "ul" },
1237	{ "p", "xmp" },
1238	{ "pre", "dd" },
1239	{ "pre", "dl" },
1240	{ "pre", "dt" },
1241	{ "pre", "fieldset" },
1242	{ "pre", "form" },
1243	{ "pre", "li" },
1244	{ "pre", "table" },
1245	{ "pre", "ul" },
1246	{ "s", "p" },
1247	{ "script", "noscript" },
1248	{ "small", "p" },
1249	{ "span", "td" },
1250	{ "span", "th" },
1251	{ "strike", "p" },
1252	{ "style", "body" },
1253	{ "style", "frameset" },
1254	{ "tbody", "tbody" },
1255	{ "tbody", "tfoot" },
1256	{ "td", "tbody" },
1257	{ "td", "td" },
1258	{ "td", "tfoot" },
1259	{ "td", "th" },
1260	{ "td", "tr" },
1261	{ "tfoot", "tbody" },
1262	{ "th", "tbody" },
1263	{ "th", "td" },
1264	{ "th", "tfoot" },
1265	{ "th", "th" },
1266	{ "th", "tr" },
1267	{ "thead", "tbody" },
1268	{ "thead", "tfoot" },
1269	{ "title", "body" },
1270	{ "title", "frameset" },
1271	{ "tr", "tbody" },
1272	{ "tr", "tfoot" },
1273	{ "tr", "tr" },
1274	{ "tt", "p" },
1275	{ "u", "p" },
1276	{ "u", "td" },
1277	{ "u", "th" },
1278	{ "ul", "address" },
1279	{ "ul", "form" },
1280	{ "ul", "menu" },
1281	{ "ul", "pre" },
1282	{ "xmp", "dd" },
1283	{ "xmp", "dl" },
1284	{ "xmp", "dt" },
1285	{ "xmp", "fieldset" },
1286	{ "xmp", "form" },
1287	{ "xmp", "li" },
1288	{ "xmp", "table" },
1289	{ "xmp", "ul" }
1290	};
1291
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is NULL-terminated; htmlCheckParagraph() walks it until
 * it reaches the NULL entry.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL    /* end-of-list marker */
};
1304
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'.
 * The array has no terminator; its length is derived with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1330
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. Every element gets a priority, and the parser uses
 * those priorities to decide how to handle extra end tags: an end tag
 * is only allowed to close elements with lower or equal priority.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

/*
 * The final entry has a NULL name; it ends the list and carries the
 * priority returned for every element that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
1358
1359	/************************************************************************
1360	* *
1361	* functions to handle HTML specific data *
1362	* *
1363	************************************************************************/
1364
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function takes no argument, returns nothing and has an empty body.
 */
void
htmlInitAutoClose(void)
{
}
1373
1374	static int
1375	htmlCompareTags(const void key, const void member) {
1376	const xmlChar tag = (const xmlChar ) key;
1377	const htmlElemDesc desc = (const htmlElemDesc ) member;
1378
1379	return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1380	}
1381
1382	/**
1383	* htmlTagLookup:
1384	* @tag: The tag name in lowercase
1385	*
1386	* Lookup the HTML tag in the ElementTable
1387	*
1388	* Returns the related htmlElemDescPtr or NULL if not found.
1389	*/
1390	const htmlElemDesc *
1391	htmlTagLookup(const xmlChar *tag) {
1392	if (tag == NULL)
1393	return(NULL);
1394
1395	return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1396	sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1397	sizeof(htmlElemDesc), htmlCompareTags));
1398	}
1399
1400	/**
1401	* htmlGetEndPriority:
1402	* @name: The name of the element to look up the priority for.
1403	*
1404	* Return value: The "endtag" priority.
1405	**/
1406	static int
1407	htmlGetEndPriority (const xmlChar *name) {
1408	int i = 0;
1409
1410	while ((htmlEndPriority[i].name != NULL) &&
1411	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1412	i++;
1413
1414	return(htmlEndPriority[i].priority);
1415	}
1416
1417
1418	static int
1419	htmlCompareStartClose(const void vkey, const void member) {
1420	const htmlStartCloseEntry key = (const htmlStartCloseEntry ) vkey;
1421	const htmlStartCloseEntry entry = (const htmlStartCloseEntry ) member;
1422	int ret;
1423
1424	ret = strcmp(key->oldTag, entry->oldTag);
1425	if (ret == 0)
1426	ret = strcmp(key->newTag, entry->newTag);
1427
1428	return(ret);
1429	}
1430
1431	/**
1432	* htmlCheckAutoClose:
1433	* @newtag: The new tag name
1434	* @oldtag: The old tag name
1435	*
1436	* Checks whether the new tag is one of the registered valid tags for
1437	* closing old.
1438	*
1439	* Returns 0 if no, 1 if yes.
1440	*/
1441	static int
1442	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1443	{
1444	htmlStartCloseEntry key;
1445	void *res;
1446
1447	key.oldTag = (const char *) oldtag;
1448	key.newTag = (const char *) newtag;
1449	res = bsearch(&key, htmlStartClose,
1450	sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1451	sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1452	return(res != NULL);
1453	}
1454
1455	/**
1456	* htmlAutoCloseOnClose:
1457	* @ctxt: an HTML parser context
1458	* @newtag: The new tag name
1459	* @force: force the tag closure
1460	*
1461	* The HTML DTD allows an ending tag to implicitly close other tags.
1462	*/
1463	static void
1464	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1465	{
1466	const htmlElemDesc *info;
1467	int i, priority;
1468
1469	priority = htmlGetEndPriority(newtag);
1470
1471	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1472
1473	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1474	break;
1475	/*
1476	* A misplaced endtag can only close elements with lower
1477	* or equal priority, so if we find an element with higher
1478	* priority before we find an element with
1479	* matching name, we just ignore this endtag
1480	*/
1481	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1482	return;
1483	}
1484	if (i < 0)
1485	return;
1486
1487	while (!xmlStrEqual(newtag, ctxt->name)) {
1488	info = htmlTagLookup(ctxt->name);
1489	if ((info != NULL) && (info->endTag == 3)) {
1490	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1491	"Opening and ending tag mismatch: %s and %s\n",
1492	newtag, ctxt->name);
1493	}
1494	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1495	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1496	htmlnamePop(ctxt);
1497	}
1498	}
1499
1500	/**
1501	* htmlAutoCloseOnEnd:
1502	* @ctxt: an HTML parser context
1503	*
1504	* Close all remaining tags at the end of the stream
1505	*/
1506	static void
1507	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1508	{
1509	int i;
1510
1511	if (ctxt->nameNr == 0)
1512	return;
1513	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1514	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1515	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1516	htmlnamePop(ctxt);
1517	}
1518	}
1519
1520	/**
1521	* htmlAutoClose:
1522	* @ctxt: an HTML parser context
1523	* @newtag: The new tag name or NULL
1524	*
1525	* The HTML DTD allows a tag to implicitly close other tags.
1526	* The list is kept in htmlStartClose array. This function is
1527	* called when a new tag has been detected and generates the
1528	* appropriates closes if possible/needed.
1529	* If newtag is NULL this mean we are at the end of the resource
1530	* and we should check
1531	*/
1532	static void
1533	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1534	{
1535	if (newtag == NULL)
1536	return;
1537
1538	while ((ctxt->name != NULL) &&
1539	(htmlCheckAutoClose(newtag, ctxt->name))) {
1540	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1541	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1542	htmlnamePop(ctxt);
1543	}
1544	}
1545
1546	/**
1547	* htmlAutoCloseTag:
1548	* @doc: the HTML document
1549	* @name: The tag name
1550	* @elem: the HTML element
1551	*
1552	* The HTML DTD allows a tag to implicitly close other tags.
1553	* The list is kept in htmlStartClose array. This function checks
1554	* if the element or one of it's children would autoclose the
1555	* given tag.
1556	*
1557	* Returns 1 if autoclose, 0 otherwise
1558	*/
1559	int
1560	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1561	htmlNodePtr child;
1562
1563	if (elem == NULL) return(1);
1564	if (xmlStrEqual(name, elem->name)) return(0);
1565	if (htmlCheckAutoClose(elem->name, name)) return(1);
1566	child = elem->children;
1567	while (child != NULL) {
1568	if (htmlAutoCloseTag(doc, name, child)) return(1);
1569	child = child->next;
1570	}
1571	return(0);
1572	}
1573
1574	/**
1575	* htmlIsAutoClosed:
1576	* @doc: the HTML document
1577	* @elem: the HTML element
1578	*
1579	* The HTML DTD allows a tag to implicitly close other tags.
1580	* The list is kept in htmlStartClose array. This function checks
1581	* if a tag is autoclosed by one of it's child
1582	*
1583	* Returns 1 if autoclosed, 0 otherwise
1584	*/
1585	int
1586	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1587	htmlNodePtr child;
1588
1589	if (elem == NULL) return(1);
1590	child = elem->children;
1591	while (child != NULL) {
1592	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1593	child = child->next;
1594	}
1595	return(0);
1596	}
1597
1598	/**
1599	* htmlCheckImplied:
1600	* @ctxt: an HTML parser context
1601	* @newtag: The new tag name
1602	*
1603	* The HTML DTD allows a tag to exists only implicitly
1604	* called when a new tag has been detected and generates the
1605	* appropriates implicit tags if missing
1606	*/
1607	static void
1608	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1609	int i;
1610
1611	if (ctxt->options & HTML_PARSE_NOIMPLIED)
1612	return;
1613	if (!htmlOmittedDefaultValue)
1614	return;
1615	if (xmlStrEqual(newtag, BAD_CAST"html"))
1616	return;
1617	if (ctxt->nameNr <= 0) {
1618	htmlnamePush(ctxt, BAD_CAST"html");
1619	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1620	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1621	}
1622	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
1623	return;
1624	if ((ctxt->nameNr <= 1) &&
1625	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
1626	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
1627	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
1628	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
1629	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
1630	(xmlStrEqual(newtag, BAD_CAST"base")))) {
1631	if (ctxt->html >= 3) {
1632	/* we already saw or generated an <head> before */
1633	return;
1634	}
1635	/*
1636	* dropped OBJECT ... i you put it first BODY will be
1637	* assumed !
1638	*/
1639	htmlnamePush(ctxt, BAD_CAST"head");
1640	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1641	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1642	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1643	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1644	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1645	if (ctxt->html >= 10) {
1646	/* we already saw or generated a <body> before */
1647	return;
1648	}
1649	for (i = 0;i < ctxt->nameNr;i++) {
1650	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1651	return;
1652	}
1653	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1654	return;
1655	}
1656	}
1657
1658	htmlnamePush(ctxt, BAD_CAST"body");
1659	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1660	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1661	}
1662	}
1663
1664	/**
1665	* htmlCheckParagraph
1666	* @ctxt: an HTML parser context
1667	*
1668	* Check whether a p element need to be implied before inserting
1669	* characters in the current element.
1670	*
1671	* Returns 1 if a paragraph has been inserted, 0 if not and -1
1672	* in case of error.
1673	*/
1674
1675	static int
1676	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1677	const xmlChar *tag;
1678	int i;
1679
1680	if (ctxt == NULL)
1681	return(-1);
1682	tag = ctxt->name;
1683	if (tag == NULL) {
1684	htmlAutoClose(ctxt, BAD_CAST"p");
1685	htmlCheckImplied(ctxt, BAD_CAST"p");
1686	htmlnamePush(ctxt, BAD_CAST"p");
1687	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1688	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1689	return(1);
1690	}
1691	if (!htmlOmittedDefaultValue)
1692	return(0);
1693	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1694	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1695	htmlAutoClose(ctxt, BAD_CAST"p");
1696	htmlCheckImplied(ctxt, BAD_CAST"p");
1697	htmlnamePush(ctxt, BAD_CAST"p");
1698	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1699	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1700	return(1);
1701	}
1702	}
1703	return(0);
1704	}
1705
1706	/**
1707	* htmlIsScriptAttribute:
1708	* @name: an attribute name
1709	*
1710	* Check if an attribute is of content type Script
1711	*
1712	* Returns 1 is the attribute is a script 0 otherwise
1713	*/
1714	int
1715	htmlIsScriptAttribute(const xmlChar *name) {
1716	unsigned int i;
1717
1718	if (name == NULL)
1719	return(0);
1720	/*
1721	* all script attributes start with 'on'
1722	*/
1723	if ((name[0] != 'o') \|\| (name[1] != 'n'))
1724	return(0);
1725	for (i = 0;
1726	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1727	i++) {
1728	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1729	return(1);
1730	}
1731	return(0);
1732	}
1733
1734	/************************************************************************
1735	* *
1736	* The list of HTML predefined entities *
1737	* *
1738	************************************************************************/
1739
1740
1741	static const htmlEntityDesc html40EntitiesTable[] = {
1742	/*
1743	* the 4 absolute ones, plus apostrophe.
1744	*/
1745	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1746	{ 38, "amp", "ampersand, U+0026 ISOnum" },
1747	{ 39, "apos", "single quote" },
1748	{ 60, "lt", "less-than sign, U+003C ISOnum" },
1749	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1750
1751	/*
1752	* A bunch still in the 128-255 range
1753	* Replacing them depend really on the charset used.
1754	*/
1755	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1756	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1757	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1758	{ 163, "pound","pound sign, U+00A3 ISOnum" },
1759	{ 164, "curren","currency sign, U+00A4 ISOnum" },
1760	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1761	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1762	{ 167, "sect", "section sign, U+00A7 ISOnum" },
1763	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1764	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1765	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1766	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1767	{ 172, "not", "not sign, U+00AC ISOnum" },
1768	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1769	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1770	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1771	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1772	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1773	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1774	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1775	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1776	{ 181, "micro","micro sign, U+00B5 ISOnum" },
1777	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1778	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1779	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1780	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1781	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1782	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1783	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1784	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1785	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1786	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1787	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1788	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1789	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1790	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1791	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1792	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1793	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1794	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1795	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1796	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1797	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1798	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1799	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1800	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1801	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1802	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1803	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1804	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1805	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1806	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1807	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1808	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1809	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1810	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1811	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1812	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1813	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1814	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1815	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1816	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1817	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1818	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1819	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1820	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1821	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1822	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1823	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1824	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1825	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1826	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1827	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1828	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1829	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1830	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1831	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1832	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1833	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1834	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1835	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1836	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1837	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1838	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1839	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1840	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1841	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1842	{ 247, "divide","division sign, U+00F7 ISOnum" },
1843	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1844	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1845	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1846	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1847	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1848	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1849	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1850	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1851
1852	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1853	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1854	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1855	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1856	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1857
1858	/*
1859	* Anything below should really be kept as entities references
1860	*/
1861	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1862
1863	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1864	{ 732, "tilde","small tilde, U+02DC ISOdia" },
1865
1866	{ 913, "Alpha","greek capital letter alpha, U+0391" },
1867	{ 914, "Beta", "greek capital letter beta, U+0392" },
1868	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1869	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1870	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1871	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1872	{ 919, "Eta", "greek capital letter eta, U+0397" },
1873	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1874	{ 921, "Iota", "greek capital letter iota, U+0399" },
1875	{ 922, "Kappa","greek capital letter kappa, U+039A" },
1876	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1877	{ 924, "Mu", "greek capital letter mu, U+039C" },
1878	{ 925, "Nu", "greek capital letter nu, U+039D" },
1879	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1880	{ 927, "Omicron","greek capital letter omicron, U+039F" },
1881	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1882	{ 929, "Rho", "greek capital letter rho, U+03A1" },
1883	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1884	{ 932, "Tau", "greek capital letter tau, U+03A4" },
1885	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1886	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1887	{ 935, "Chi", "greek capital letter chi, U+03A7" },
1888	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1889	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1890
1891	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1892	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1893	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1894	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1895	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1896	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1897	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1898	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1899	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1900	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1901	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1902	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1903	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1904	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1905	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1906	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1907	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1908	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1909	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1910	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1911	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1912	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1913	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1914	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1915	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1916	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1917	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1918	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1919
1920	{ 8194, "ensp", "en space, U+2002 ISOpub" },
1921	{ 8195, "emsp", "em space, U+2003 ISOpub" },
1922	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1923	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1924	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1925	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1926	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1927	{ 8211, "ndash","en dash, U+2013 ISOpub" },
1928	{ 8212, "mdash","em dash, U+2014 ISOpub" },
1929	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1930	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1931	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1932	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1933	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1934	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1935	{ 8224, "dagger","dagger, U+2020 ISOpub" },
1936	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1937
1938	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1939	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1940
1941	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1942
1943	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1944	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1945
1946	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1947	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1948
1949	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1950	{ 8260, "frasl","fraction slash, U+2044 NEW" },
1951
1952	{ 8364, "euro", "euro sign, U+20AC NEW" },
1953
1954	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1955	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1956	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1957	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1958	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1959	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1960	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1961	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1962	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1963	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1964	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1965	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1966	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1967	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1968	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1969	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1970
1971	{ 8704, "forall","for all, U+2200 ISOtech" },
1972	{ 8706, "part", "partial differential, U+2202 ISOtech" },
1973	{ 8707, "exist","there exists, U+2203 ISOtech" },
1974	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1975	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1976	{ 8712, "isin", "element of, U+2208 ISOtech" },
1977	{ 8713, "notin","not an element of, U+2209 ISOtech" },
1978	{ 8715, "ni", "contains as member, U+220B ISOtech" },
1979	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1980	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1981	{ 8722, "minus","minus sign, U+2212 ISOtech" },
1982	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1983	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1984	{ 8733, "prop", "proportional to, U+221D ISOtech" },
1985	{ 8734, "infin","infinity, U+221E ISOtech" },
1986	{ 8736, "ang", "angle, U+2220 ISOamso" },
1987	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1988	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1989	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1990	{ 8746, "cup", "union = cup, U+222A ISOtech" },
1991	{ 8747, "int", "integral, U+222B ISOtech" },
1992	{ 8756, "there4","therefore, U+2234 ISOtech" },
1993	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1994	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1995	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1996	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1997	{ 8801, "equiv","identical to, U+2261 ISOtech" },
1998	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1999	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2000	{ 8834, "sub", "subset of, U+2282 ISOtech" },
2001	{ 8835, "sup", "superset of, U+2283 ISOtech" },
2002	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2003	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2004	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2005	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2006	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2007	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2008	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2009	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2010	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2011	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2012	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
2013	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2014	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2015	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
2016
2017	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
2018	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2019	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2020	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
2021
2022	};
2023
2024	/************************************************************************
2025	* *
2026	* Commodity functions to handle entities *
2027	* *
2028	************************************************************************/
2029
/*
 * growBuffer:
 *
 * Double the capacity of a heap buffer. The macro expects, in the scope
 * where it is expanded, an int-like variable named <buffer>_size holding
 * the current capacity and a parser context named ctxt.
 *
 * If the reallocation fails the error is reported through htmlErrMemory(),
 * the original buffer is released and the ENCLOSING FUNCTION returns NULL,
 * so this macro may only be used in functions returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size);		\
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
2044
2045	/**
2046	* htmlEntityLookup:
2047	* @name: the entity name
2048	*
2049	* Lookup the given entity in EntitiesTable
2050	*
2051	* TODO: the linear scan is really ugly, an hash table is really needed.
2052	*
2053	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2054	*/
2055	const htmlEntityDesc *
2056	htmlEntityLookup(const xmlChar *name) {
2057	unsigned int i;
2058
2059	for (i = 0;i < (sizeof(html40EntitiesTable)/
2060	sizeof(html40EntitiesTable[0]));i++) {
2061	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2062	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2063	}
2064	}
2065	return(NULL);
2066	}
2067
2068	/**
2069	* htmlEntityValueLookup:
2070	* @value: the entity's unicode value
2071	*
2072	* Lookup the given entity in EntitiesTable
2073	*
2074	* TODO: the linear scan is really ugly, an hash table is really needed.
2075	*
2076	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2077	*/
2078	const htmlEntityDesc *
2079	htmlEntityValueLookup(unsigned int value) {
2080	unsigned int i;
2081
2082	for (i = 0;i < (sizeof(html40EntitiesTable)/
2083	sizeof(html40EntitiesTable[0]));i++) {
2084	if (html40EntitiesTable[i].value >= value) {
2085	if (html40EntitiesTable[i].value > value)
2086	break;
2087	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2088	}
2089	}
2090	return(NULL);
2091	}
2092
2093	/**
2094	* UTF8ToHtml:
2095	* @out: a pointer to an array of bytes to store the result
2096	* @outlen: the length of @out
2097	* @in: a pointer to an array of UTF-8 chars
2098	* @inlen: the length of @in
2099	*
2100	* Take a block of UTF-8 chars in and try to convert it to an ASCII
2101	* plus HTML entities block of chars out.
2102	*
2103	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104	* The value of @inlen after return is the number of octets consumed
2105	* as the return value is positive, else unpredictable.
2106	* The value of @outlen after return is the number of octets consumed.
2107	*/
2108	int
2109	UTF8ToHtml(unsigned char* out, int *outlen,
2110	const unsigned char* in, int *inlen) {
2111	const unsigned char* processed = in;
2112	const unsigned char* outend;
2113	const unsigned char* outstart = out;
2114	const unsigned char* instart = in;
2115	const unsigned char* inend;
2116	unsigned int c, d;
2117	int trailing;
2118
2119	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
2120	if (in == NULL) {
2121	/*
2122	* initialization nothing to do
2123	*/
2124	*outlen = 0;
2125	*inlen = 0;
2126	return(0);
2127	}
2128	inend = in + (*inlen);
2129	outend = out + (*outlen);
2130	while (in < inend) {
2131	d = *in++;
2132	if (d < 0x80) { c= d; trailing= 0; }
2133	else if (d < 0xC0) {
2134	/* trailing byte in leading position */
2135	*outlen = out - outstart;
2136	*inlen = processed - instart;
2137	return(-2);
2138	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2139	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2140	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2141	else {
2142	/* no chance for this in Ascii */
2143	*outlen = out - outstart;
2144	*inlen = processed - instart;
2145	return(-2);
2146	}
2147
2148	if (inend - in < trailing) {
2149	break;
2150	}
2151
2152	for ( ; trailing; trailing--) {
2153	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
2154	break;
2155	c <<= 6;
2156	c \|= d & 0x3F;
2157	}
2158
2159	/* assertion: c is a single UTF-4 value */
2160	if (c < 0x80) {
2161	if (out + 1 >= outend)
2162	break;
2163	*out++ = c;
2164	} else {
2165	int len;
2166	const htmlEntityDesc * ent;
2167	const char *cp;
2168	char nbuf[16];
2169
2170	/*
2171	* Try to lookup a predefined HTML entity for it
2172	*/
2173
2174	ent = htmlEntityValueLookup(c);
2175	if (ent == NULL) {
2176	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2177	cp = nbuf;
2178	}
2179	else
2180	cp = ent->name;
2181	len = strlen(cp);
2182	if (out + 2 + len >= outend)
2183	break;
2184	*out++ = '&';
2185	memcpy(out, cp, len);
2186	out += len;
2187	*out++ = ';';
2188	}
2189	processed = in;
2190	}
2191	*outlen = out - outstart;
2192	*inlen = processed - instart;
2193	return(0);
2194	}
2195
2196	/**
2197	* htmlEncodeEntities:
2198	* @out: a pointer to an array of bytes to store the result
2199	* @outlen: the length of @out
2200	* @in: a pointer to an array of UTF-8 chars
2201	* @inlen: the length of @in
2202	* @quoteChar: the quote character to escape (' or ") or zero.
2203	*
2204	* Take a block of UTF-8 chars in and try to convert it to an ASCII
2205	* plus HTML entities block of chars out.
2206	*
2207	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2208	* The value of @inlen after return is the number of octets consumed
2209	* as the return value is positive, else unpredictable.
2210	* The value of @outlen after return is the number of octets consumed.
2211	*/
2212	int
2213	htmlEncodeEntities(unsigned char* out, int *outlen,
2214	const unsigned char* in, int *inlen, int quoteChar) {
2215	const unsigned char* processed = in;
2216	const unsigned char* outend;
2217	const unsigned char* outstart = out;
2218	const unsigned char* instart = in;
2219	const unsigned char* inend;
2220	unsigned int c, d;
2221	int trailing;
2222
2223	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
2224	return(-1);
2225	outend = out + (*outlen);
2226	inend = in + (*inlen);
2227	while (in < inend) {
2228	d = *in++;
2229	if (d < 0x80) { c= d; trailing= 0; }
2230	else if (d < 0xC0) {
2231	/* trailing byte in leading position */
2232	*outlen = out - outstart;
2233	*inlen = processed - instart;
2234	return(-2);
2235	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2236	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2237	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2238	else {
2239	/* no chance for this in Ascii */
2240	*outlen = out - outstart;
2241	*inlen = processed - instart;
2242	return(-2);
2243	}
2244
2245	if (inend - in < trailing)
2246	break;
2247
2248	while (trailing--) {
2249	if (((d= *in++) & 0xC0) != 0x80) {
2250	*outlen = out - outstart;
2251	*inlen = processed - instart;
2252	return(-2);
2253	}
2254	c <<= 6;
2255	c \|= d & 0x3F;
2256	}
2257
2258	/* assertion: c is a single UTF-4 value */
2259	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2260	(c != '&') && (c != '<') && (c != '>')) {
2261	if (out >= outend)
2262	break;
2263	*out++ = c;
2264	} else {
2265	const htmlEntityDesc * ent;
2266	const char *cp;
2267	char nbuf[16];
2268	int len;
2269
2270	/*
2271	* Try to lookup a predefined HTML entity for it
2272	*/
2273	ent = htmlEntityValueLookup(c);
2274	if (ent == NULL) {
2275	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2276	cp = nbuf;
2277	}
2278	else
2279	cp = ent->name;
2280	len = strlen(cp);
2281	if (outend - out < len + 2)
2282	break;
2283	*out++ = '&';
2284	memcpy(out, cp, len);
2285	out += len;
2286	*out++ = ';';
2287	}
2288	processed = in;
2289	}
2290	*outlen = out - outstart;
2291	*inlen = processed - instart;
2292	return(0);
2293	}
2294
2295	/************************************************************************
2296	* *
2297	* Commodity functions to handle streams *
2298	* *
2299	************************************************************************/
2300
2301	#ifdef LIBXML_PUSH_ENABLED
2302	/**
2303	* htmlNewInputStream:
2304	* @ctxt: an HTML parser context
2305	*
2306	* Create a new input stream structure
2307	* Returns the new input stream or NULL
2308	*/
2309	static htmlParserInputPtr
2310	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2311	htmlParserInputPtr input;
2312
2313	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2314	if (input == NULL) {
2315	htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2316	return(NULL);
2317	}
2318	memset(input, 0, sizeof(htmlParserInput));
2319	input->filename = NULL;
2320	input->directory = NULL;
2321	input->base = NULL;
2322	input->cur = NULL;
2323	input->buf = NULL;
2324	input->line = 1;
2325	input->col = 1;
2326	input->buf = NULL;
2327	input->free = NULL;
2328	input->version = NULL;
2329	input->consumed = 0;
2330	input->length = 0;
2331	return(input);
2332	}
2333	#endif
2334
2335
2336	/************************************************************************
2337	* *
2338	* Commodity functions, cleanup needed ? *
2339	* *
2340	************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 *
 * The table is read-only, hence both the strings and the array of
 * pointers are const.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2355
2356	/**
2357	* areBlanks:
2358	* @ctxt: an HTML parser context
2359	* @str: a xmlChar *
2360	* @len: the size of @str
2361	*
2362	* Is this a sequence of blank chars that one can ignore ?
2363	*
2364	* Returns 1 if ignorable 0 otherwise.
2365	*/
2366
2367	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2368	unsigned int i;
2369	int j;
2370	xmlNodePtr lastChild;
2371	xmlDtdPtr dtd;
2372
2373	for (j = 0;j < len;j++)
2374	if (!(IS_BLANK_CH(str[j]))) return(0);
2375
2376	if (CUR == 0) return(1);
2377	if (CUR != '<') return(0);
2378	if (ctxt->name == NULL)
2379	return(1);
2380	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2381	return(1);
2382	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2383	return(1);
2384
2385	/* Only strip CDATA children of the body tag for strict HTML DTDs */
2386	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2387	dtd = xmlGetIntSubset(ctxt->myDoc);
2388	if (dtd != NULL && dtd->ExternalID != NULL) {
2389	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
2390	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2391	return(1);
2392	}
2393	}
2394
2395	if (ctxt->node == NULL) return(0);
2396	lastChild = xmlGetLastChild(ctxt->node);
2397	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2398	lastChild = lastChild->prev;
2399	if (lastChild == NULL) {
2400	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2401	(ctxt->node->content != NULL)) return(0);
2402	/* keep ws in constructs like ...<b> </b>...
2403	for all tags "b" allowing PCDATA */
2404	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2405	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2406	return(0);
2407	}
2408	}
2409	} else if (xmlNodeIsText(lastChild)) {
2410	return(0);
2411	} else {
2412	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2413	for all tags "p" allowing PCDATA */
2414	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2415	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2416	return(0);
2417	}
2418	}
2419	}
2420	return(1);
2421	}
2422
2423	/**
2424	* htmlNewDocNoDtD:
2425	* @URI: URI for the dtd, or NULL
2426	* @ExternalID: the external ID of the DTD, or NULL
2427	*
2428	* Creates a new HTML document without a DTD node if @URI and @ExternalID
2429	* are NULL
2430	*
2431	* Returns a new document, do not initialize the DTD if not provided
2432	*/
2433	htmlDocPtr
2434	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
2435	xmlDocPtr cur;
2436
2437	/*
2438	* Allocate a new document and fill the fields.
2439	*/
2440	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2441	if (cur == NULL) {
2442	htmlErrMemory(NULL, "HTML document creation failed\n");
2443	return(NULL);
2444	}
2445	memset(cur, 0, sizeof(xmlDoc));
2446
2447	cur->type = XML_HTML_DOCUMENT_NODE;
2448	cur->version = NULL;
2449	cur->intSubset = NULL;
2450	cur->doc = cur;
2451	cur->name = NULL;
2452	cur->children = NULL;
2453	cur->extSubset = NULL;
2454	cur->oldNs = NULL;
2455	cur->encoding = NULL;
2456	cur->standalone = 1;
2457	cur->compression = 0;
2458	cur->ids = NULL;
2459	cur->refs = NULL;
2460	cur->_private = NULL;
2461	cur->charset = XML_CHAR_ENCODING_UTF8;
2462	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
2463	if ((ExternalID != NULL) \|\|
2464	(URI != NULL))
2465	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2466	if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2467	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2468	return(cur);
2469	}
2470
2471	/**
2472	* htmlNewDoc:
2473	* @URI: URI for the dtd, or NULL
2474	* @ExternalID: the external ID of the DTD, or NULL
2475	*
2476	* Creates a new HTML document
2477	*
2478	* Returns a new document
2479	*/
2480	htmlDocPtr
2481	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
2482	if ((URI == NULL) && (ExternalID == NULL))
2483	return(htmlNewDocNoDtD(
2484	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2485	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2486
2487	return(htmlNewDocNoDtD(URI, ExternalID));
2488	}
2489
2490
2491	/************************************************************************
2492	* *
2493	* The parser itself *
2494	* Relates to http://www.w3.org/TR/html40 *
2495	* *
2496	************************************************************************/
2497
2498	/************************************************************************
2499	* *
2500	* The parser itself *
2501	* *
2502	************************************************************************/
2503
2504	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2505
2506	static void
2507	htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2508	int c;
2509
2510	htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2511	"Incorrectly opened comment\n", NULL, NULL);
2512
2513	do {
2514	c = CUR;
2515	if (c == 0)
2516	break;
2517	NEXT;
2518	} while (c != '>');
2519	}
2520
2521	/**
2522	* htmlParseHTMLName:
2523	* @ctxt: an HTML parser context
2524	*
2525	* parse an HTML tag or attribute name, note that we convert it to lowercase
2526	* since HTML names are not case-sensitive.
2527	*
2528	* Returns the Tag Name parsed or NULL
2529	*/
2530
2531	static const xmlChar *
2532	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2533	const xmlChar *ret;
2534	int i = 0;
2535	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2536
2537	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2538	(CUR != ':') && (CUR != '.')) return(NULL);
2539
2540	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2541	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
2542	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
2543	(CUR == '.'))) {
2544	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2545	else loc[i] = CUR;
2546	i++;
2547
2548	NEXT;
2549	}
2550
2551	ret = xmlDictLookup(ctxt->dict, loc, i);
2552	if (ret == NULL)
2553	htmlErrMemory(ctxt, NULL);
2554
2555	return(ret);
2556	}
2557
2558
2559	/**
2560	* htmlParseHTMLName_nonInvasive:
2561	* @ctxt: an HTML parser context
2562	*
2563	* parse an HTML tag or attribute name, note that we convert it to lowercase
2564	* since HTML names are not case-sensitive, this doesn't consume the data
2565	* from the stream, it's a look-ahead
2566	*
2567	* Returns the Tag Name parsed or NULL
2568	*/
2569
2570	static const xmlChar *
2571	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2572	int i = 0;
2573	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574
2575	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2576	(NXT(1) != ':')) return(NULL);
2577
2578	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
2580	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
2581	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2582	else loc[i] = NXT(1+i);
2583	i++;
2584	}
2585
2586	return(xmlDictLookup(ctxt->dict, loc, i));
2587	}
2588
2589
2590	/**
2591	* htmlParseName:
2592	* @ctxt: an HTML parser context
2593	*
2594	* parse an HTML name, this routine is case sensitive.
2595	*
2596	* Returns the Name parsed or NULL
2597	*/
2598
2599	static const xmlChar *
2600	htmlParseName(htmlParserCtxtPtr ctxt) {
2601	const xmlChar *in;
2602	const xmlChar *ret;
2603	int count = 0;
2604
2605	GROW;
2606
2607	/*
2608	* Accelerator for simple ASCII names
2609	*/
2610	in = ctxt->input->cur;
2611	if (((in >= 0x61) && (in <= 0x7A)) \|\|
2612	((in >= 0x41) && (in <= 0x5A)) \|\|
2613	(in == '_') \|\| (in == ':')) {
2614	in++;
2615	while (((in >= 0x61) && (in <= 0x7A)) \|\|
2616	((in >= 0x41) && (in <= 0x5A)) \|\|
2617	((in >= 0x30) && (in <= 0x39)) \|\|
2618	(in == '_') \|\| (in == '-') \|\|
2619	(in == ':') \|\| (in == '.'))
2620	in++;
2621
2622	if (in == ctxt->input->end)
2623	return(NULL);
2624
2625	if ((in > 0) && (in < 0x80)) {
2626	count = in - ctxt->input->cur;
2627	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2628	ctxt->input->cur = in;
2629	ctxt->input->col += count;
2630	return(ret);
2631	}
2632	}
2633	return(htmlParseNameComplex(ctxt));
2634	}
2635
2636	static const xmlChar *
2637	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2638	int len = 0, l;
2639	int c;
2640	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2641	XML_MAX_TEXT_LENGTH :
2642	XML_MAX_NAME_LENGTH;
2643	const xmlChar *base = ctxt->input->base;
2644
2645	/*
2646	* Handler for more complex cases
2647	*/
2648	c = CUR_CHAR(l);
2649	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
2650	(!IS_LETTER(c) && (c != '_') &&
2651	(c != ':'))) {
2652	return(NULL);
2653	}
2654
2655	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2656	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
2657	(c == '.') \|\| (c == '-') \|\|
2658	(c == '_') \|\| (c == ':') \|\|
2659	(IS_COMBINING(c)) \|\|
2660	(IS_EXTENDER(c)))) {
2661	len += l;
2662	if (len > maxLength) {
2663	htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2664	return(NULL);
2665	}
2666	NEXTL(l);
2667	c = CUR_CHAR(l);
2668	if (ctxt->input->base != base) {
2669	/*
2670	* We changed encoding from an unknown encoding
2671	* Input buffer changed location, so we better start again
2672	*/
2673	return(htmlParseNameComplex(ctxt));
2674	}
2675	}
2676	if (ctxt->instate == XML_PARSER_EOF)
2677	return(NULL);
2678
2679	if (ctxt->input->cur - ctxt->input->base < len) {
2680	/* Sanity check */
2681	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2682	"unexpected change of input buffer", NULL, NULL);
2683	return (NULL);
2684	}
2685
2686	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2687	}
2688
2689
2690	/**
2691	* htmlParseHTMLAttribute:
2692	* @ctxt: an HTML parser context
2693	* @stop: a char stop value
2694	*
2695	* parse an HTML attribute value till the stop (quote), if
2696	* stop is 0 then it stops at the first space
2697	*
2698	* Returns the attribute parsed or NULL
2699	*/
2700
2701	static xmlChar *
2702	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2703	xmlChar *buffer = NULL;
2704	int buffer_size = 0;
2705	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2706	XML_MAX_HUGE_LENGTH :
2707	XML_MAX_TEXT_LENGTH;
2708	xmlChar *out = NULL;
2709	const xmlChar *name = NULL;
2710	const xmlChar *cur = NULL;
2711	const htmlEntityDesc * ent;
2712
2713	/*
2714	* allocate a translation buffer.
2715	*/
2716	buffer_size = HTML_PARSER_BUFFER_SIZE;
2717	buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2718	if (buffer == NULL) {
2719	htmlErrMemory(ctxt, "buffer allocation failed\n");
2720	return(NULL);
2721	}
2722	out = buffer;
2723
2724	/*
2725	* Ok loop until we reach one of the ending chars
2726	*/
2727	while ((CUR != 0) && (CUR != stop)) {
2728	if ((stop == 0) && (CUR == '>')) break;
2729	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2730	if (CUR == '&') {
2731	if (NXT(1) == '#') {
2732	unsigned int c;
2733	int bits;
2734
2735	c = htmlParseCharRef(ctxt);
2736	if (c < 0x80)
2737	{ *out++ = c; bits= -6; }
2738	else if (c < 0x800)
2739	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2740	else if (c < 0x10000)
2741	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2742	else
2743	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2744
2745	for ( ; bits >= 0; bits-= 6) {
2746	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2747	}
2748
2749	if (out - buffer > buffer_size - 100) {
2750	int indx = out - buffer;
2751
2752	growBuffer(buffer);
2753	out = &buffer[indx];
2754	}
2755	} else {
2756	ent = htmlParseEntityRef(ctxt, &name);
2757	if (name == NULL) {
2758	*out++ = '&';
2759	if (out - buffer > buffer_size - 100) {
2760	int indx = out - buffer;
2761
2762	growBuffer(buffer);
2763	out = &buffer[indx];
2764	}
2765	} else if (ent == NULL) {
2766	*out++ = '&';
2767	cur = name;
2768	while (*cur != 0) {
2769	if (out - buffer > buffer_size - 100) {
2770	int indx = out - buffer;
2771
2772	growBuffer(buffer);
2773	out = &buffer[indx];
2774	}
2775	out++ = cur++;
2776	}
2777	} else {
2778	unsigned int c;
2779	int bits;
2780
2781	if (out - buffer > buffer_size - 100) {
2782	int indx = out - buffer;
2783
2784	growBuffer(buffer);
2785	out = &buffer[indx];
2786	}
2787	c = ent->value;
2788	if (c < 0x80)
2789	{ *out++ = c; bits= -6; }
2790	else if (c < 0x800)
2791	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2792	else if (c < 0x10000)
2793	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2794	else
2795	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2796
2797	for ( ; bits >= 0; bits-= 6) {
2798	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2799	}
2800	}
2801	}
2802	} else {
2803	unsigned int c;
2804	int bits, l;
2805
2806	if (out - buffer > buffer_size - 100) {
2807	int indx = out - buffer;
2808
2809	growBuffer(buffer);
2810	out = &buffer[indx];
2811	}
2812	c = CUR_CHAR(l);
2813	if (ctxt->instate == XML_PARSER_EOF) {
2814	xmlFree(buffer);
2815	return(NULL);
2816	}
2817	if (c < 0x80)
2818	{ *out++ = c; bits= -6; }
2819	else if (c < 0x800)
2820	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2821	else if (c < 0x10000)
2822	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2823	else
2824	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2825
2826	for ( ; bits >= 0; bits-= 6) {
2827	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2828	}
2829	NEXTL(l);
2830	}
2831	if (out - buffer > maxLength) {
2832	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2833	"attribute value too long\n", NULL, NULL);
2834	xmlFree(buffer);
2835	return(NULL);
2836	}
2837	}
2838	*out = 0;
2839	return(buffer);
2840	}
2841
2842	/**
2843	* htmlParseEntityRef:
2844	* @ctxt: an HTML parser context
2845	* @str: location to store the entity name
2846	*
2847	* DEPRECATED: Internal function, don't use.
2848	*
2849	* parse an HTML ENTITY references
2850	*
2851	* [68] EntityRef ::= '&' Name ';'
2852	*
2853	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2854	* if non-NULL *str will have to be freed by the caller.
2855	*/
2856	const htmlEntityDesc *
2857	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2858	const xmlChar *name;
2859	const htmlEntityDesc * ent = NULL;
2860
2861	if (str != NULL) *str = NULL;
2862	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
2863
2864	if (CUR == '&') {
2865	NEXT;
2866	name = htmlParseName(ctxt);
2867	if (name == NULL) {
2868	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2869	"htmlParseEntityRef: no name\n", NULL, NULL);
2870	} else {
2871	GROW;
2872	if (CUR == ';') {
2873	if (str != NULL)
2874	*str = name;
2875
2876	/*
2877	* Lookup the entity in the table.
2878	*/
2879	ent = htmlEntityLookup(name);
2880	if (ent != NULL) /* OK that's ugly !!! */
2881	NEXT;
2882	} else {
2883	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2884	"htmlParseEntityRef: expecting ';'\n",
2885	NULL, NULL);
2886	if (str != NULL)
2887	*str = name;
2888	}
2889	}
2890	}
2891	return(ent);
2892	}
2893
2894	/**
2895	* htmlParseAttValue:
2896	* @ctxt: an HTML parser context
2897	*
2898	* parse a value for an attribute
2899	* Note: the parser won't do substitution of entities here, this
2900	* will be handled later in xmlStringGetNodeList, unless it was
2901	* asked for ctxt->replaceEntities != 0
2902	*
2903	* Returns the AttValue parsed or NULL.
2904	*/
2905
2906	static xmlChar *
2907	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2908	xmlChar *ret = NULL;
2909
2910	if (CUR == '"') {
2911	NEXT;
2912	ret = htmlParseHTMLAttribute(ctxt, '"');
2913	if (CUR != '"') {
2914	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2915	"AttValue: \" expected\n", NULL, NULL);
2916	} else
2917	NEXT;
2918	} else if (CUR == '\'') {
2919	NEXT;
2920	ret = htmlParseHTMLAttribute(ctxt, '\'');
2921	if (CUR != '\'') {
2922	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2923	"AttValue: ' expected\n", NULL, NULL);
2924	} else
2925	NEXT;
2926	} else {
2927	/*
2928	* That's an HTMLism, the attribute value may not be quoted
2929	*/
2930	ret = htmlParseHTMLAttribute(ctxt, 0);
2931	if (ret == NULL) {
2932	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2933	"AttValue: no value found\n", NULL, NULL);
2934	}
2935	}
2936	return(ret);
2937	}
2938
2939	/**
2940	* htmlParseSystemLiteral:
2941	* @ctxt: an HTML parser context
2942	*
2943	* parse an HTML Literal
2944	*
2945	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
2946	*
2947	* Returns the SystemLiteral parsed or NULL
2948	*/
2949
2950	static xmlChar *
2951	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2952	size_t len = 0, startPosition = 0;
2953	int err = 0;
2954	int quote;
2955	xmlChar *ret = NULL;
2956
2957	if ((CUR != '"') && (CUR != '\'')) {
2958	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2959	"SystemLiteral \" or ' expected\n", NULL, NULL);
2960	return(NULL);
2961	}
2962	quote = CUR;
2963	NEXT;
2964
2965	if (CUR_PTR < BASE_PTR)
2966	return(ret);
2967	startPosition = CUR_PTR - BASE_PTR;
2968
2969	while ((CUR != 0) && (CUR != quote)) {
2970	/* TODO: Handle UTF-8 */
2971	if (!IS_CHAR_CH(CUR)) {
2972	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2973	"Invalid char in SystemLiteral 0x%X\n", CUR);
2974	err = 1;
2975	}
2976	NEXT;
2977	len++;
2978	}
2979	if (CUR != quote) {
2980	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2981	"Unfinished SystemLiteral\n", NULL, NULL);
2982	} else {
2983	if (err == 0)
2984	ret = xmlStrndup((BASE_PTR+startPosition), len);
2985	NEXT;
2986	}
2987
2988	return(ret);
2989	}
2990
2991	/**
2992	* htmlParsePubidLiteral:
2993	* @ctxt: an HTML parser context
2994	*
2995	* parse an HTML public literal
2996	*
2997	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
2998	*
2999	* Returns the PubidLiteral parsed or NULL.
3000	*/
3001
3002	static xmlChar *
3003	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3004	size_t len = 0, startPosition = 0;
3005	int err = 0;
3006	int quote;
3007	xmlChar *ret = NULL;
3008
3009	if ((CUR != '"') && (CUR != '\'')) {
3010	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3011	"PubidLiteral \" or ' expected\n", NULL, NULL);
3012	return(NULL);
3013	}
3014	quote = CUR;
3015	NEXT;
3016
3017	/*
3018	* Name ::= (Letter \| '_') (NameChar)*
3019	*/
3020	if (CUR_PTR < BASE_PTR)
3021	return(ret);
3022	startPosition = CUR_PTR - BASE_PTR;
3023
3024	while ((CUR != 0) && (CUR != quote)) {
3025	if (!IS_PUBIDCHAR_CH(CUR)) {
3026	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3027	"Invalid char in PubidLiteral 0x%X\n", CUR);
3028	err = 1;
3029	}
3030	len++;
3031	NEXT;
3032	}
3033
3034	if (CUR != quote) {
3035	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3036	"Unfinished PubidLiteral\n", NULL, NULL);
3037	} else {
3038	if (err == 0)
3039	ret = xmlStrndup((BASE_PTR + startPosition), len);
3040	NEXT;
3041	}
3042
3043	return(ret);
3044	}
3045
3046	/**
3047	* htmlParseScript:
3048	* @ctxt: an HTML parser context
3049	*
3050	* parse the content of an HTML SCRIPT or STYLE element
3051	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
3052	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3053	* http://www.w3.org/TR/html4/types.html#type-script
3054	* http://www.w3.org/TR/html4/types.html#h-6.15
3055	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3056	*
3057	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
3058	* element and the value of intrinsic event attributes. User agents must
3059	* not evaluate script data as HTML markup but instead must pass it on as
3060	* data to a script engine.
3061	* NOTES:
3062	* - The content is passed like CDATA
3063	* - the attributes for style and scripting "onXXX" are also described
3064	* as CDATA but SGML allows entities references in attributes so their
3065	* processing is identical as other attributes
3066	*/
3067	static void
3068	htmlParseScript(htmlParserCtxtPtr ctxt) {
3069	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3070	int nbchar = 0;
3071	int cur,l;
3072
3073	cur = CUR_CHAR(l);
3074	while (cur != 0) {
3075	if ((cur == '<') && (NXT(1) == '/')) {
3076	/*
3077	* One should break here, the specification is clear:
3078	* Authors should therefore escape "</" within the content.
3079	* Escape mechanisms are specific to each scripting or
3080	* style sheet language.
3081	*
3082	* In recovery mode, only break if end tag match the
3083	* current tag, effectively ignoring all tags inside the
3084	* script/style block and treating the entire block as
3085	* CDATA.
3086	*/
3087	if (ctxt->recovery) {
3088	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3089	xmlStrlen(ctxt->name)) == 0)
3090	{
3091	break; /* while */
3092	} else {
3093	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3094	"Element %s embeds close tag\n",
3095	ctxt->name, NULL);
3096	}
3097	} else {
3098	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
3099	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3100	{
3101	break; /* while */
3102	}
3103	}
3104	}
3105	if (IS_CHAR(cur)) {
3106	COPY_BUF(l,buf,nbchar,cur);
3107	} else {
3108	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3109	"Invalid char in CDATA 0x%X\n", cur);
3110	}
3111	NEXTL(l);
3112	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3113	buf[nbchar] = 0;
3114	if (ctxt->sax->cdataBlock!= NULL) {
3115	/*
3116	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3117	*/
3118	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3119	} else if (ctxt->sax->characters != NULL) {
3120	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121	}
3122	nbchar = 0;
3123	SHRINK;
3124	}
3125	cur = CUR_CHAR(l);
3126	}
3127
3128	if (ctxt->instate == XML_PARSER_EOF)
3129	return;
3130
3131	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3132	buf[nbchar] = 0;
3133	if (ctxt->sax->cdataBlock!= NULL) {
3134	/*
3135	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136	*/
3137	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138	} else if (ctxt->sax->characters != NULL) {
3139	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140	}
3141	}
3142	}
3143
3144
3145	/**
3146	* htmlParseCharDataInternal:
3147	* @ctxt: an HTML parser context
3148	* @readahead: optional read ahead character in ascii range
3149	*
3150	* parse a CharData section.
3151	* if we are within a CDATA section ']]>' marks an end of section.
3152	*
3153	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3154	*/
3155
3156	static void
3157	htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3158	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3159	int nbchar = 0;
3160	int cur, l;
3161
3162	if (readahead)
3163	buf[nbchar++] = readahead;
3164
3165	cur = CUR_CHAR(l);
3166	while (((cur != '<') \|\| (ctxt->token == '<')) &&
3167	((cur != '&') \|\| (ctxt->token == '&')) &&
3168	(cur != 0)) {
3169	if (!(IS_CHAR(cur))) {
3170	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3171	"Invalid char in CDATA 0x%X\n", cur);
3172	} else {
3173	COPY_BUF(l,buf,nbchar,cur);
3174	}
3175	NEXTL(l);
3176	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3177	buf[nbchar] = 0;
3178
3179	/*
3180	* Ok the segment is to be consumed as chars.
3181	*/
3182	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3183	if (areBlanks(ctxt, buf, nbchar)) {
3184	if (ctxt->keepBlanks) {
3185	if (ctxt->sax->characters != NULL)
3186	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3187	} else {
3188	if (ctxt->sax->ignorableWhitespace != NULL)
3189	ctxt->sax->ignorableWhitespace(ctxt->userData,
3190	buf, nbchar);
3191	}
3192	} else {
3193	htmlCheckParagraph(ctxt);
3194	if (ctxt->sax->characters != NULL)
3195	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3196	}
3197	}
3198	nbchar = 0;
3199	SHRINK;
3200	}
3201	cur = CUR_CHAR(l);
3202	}
3203	if (ctxt->instate == XML_PARSER_EOF)
3204	return;
3205	if (nbchar != 0) {
3206	buf[nbchar] = 0;
3207
3208	/*
3209	* Ok the segment is to be consumed as chars.
3210	*/
3211	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3212	if (areBlanks(ctxt, buf, nbchar)) {
3213	if (ctxt->keepBlanks) {
3214	if (ctxt->sax->characters != NULL)
3215	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3216	} else {
3217	if (ctxt->sax->ignorableWhitespace != NULL)
3218	ctxt->sax->ignorableWhitespace(ctxt->userData,
3219	buf, nbchar);
3220	}
3221	} else {
3222	htmlCheckParagraph(ctxt);
3223	if (ctxt->sax->characters != NULL)
3224	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3225	}
3226	}
3227	}
3228	}
3229
3230	/**
3231	* htmlParseCharData:
3232	* @ctxt: an HTML parser context
3233	*
3234	* parse a CharData section.
3235	* if we are within a CDATA section ']]>' marks an end of section.
3236	*
3237	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3238	*/
3239
3240	static void
3241	htmlParseCharData(htmlParserCtxtPtr ctxt) {
3242	htmlParseCharDataInternal(ctxt, 0);
3243	}
3244
3245	/**
3246	* htmlParseExternalID:
3247	* @ctxt: an HTML parser context
3248	* @publicID: a xmlChar** receiving PubidLiteral
3249	*
3250	* Parse an External ID or a Public ID
3251	*
3252	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3253	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
3254	*
3255	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
3256	*
3257	* Returns the function returns SystemLiteral and in the second
3258	* case publicID receives PubidLiteral, is strict is off
3259	* it is possible to return NULL and have publicID set.
3260	*/
3261
3262	static xmlChar *
3263	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3264	xmlChar *URI = NULL;
3265
3266	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3267	(UPP(2) == 'S') && (UPP(3) == 'T') &&
3268	(UPP(4) == 'E') && (UPP(5) == 'M')) {
3269	SKIP(6);
3270	if (!IS_BLANK_CH(CUR)) {
3271	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3272	"Space required after 'SYSTEM'\n", NULL, NULL);
3273	}
3274	SKIP_BLANKS;
3275	URI = htmlParseSystemLiteral(ctxt);
3276	if (URI == NULL) {
3277	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3278	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3279	}
3280	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3281	(UPP(2) == 'B') && (UPP(3) == 'L') &&
3282	(UPP(4) == 'I') && (UPP(5) == 'C')) {
3283	SKIP(6);
3284	if (!IS_BLANK_CH(CUR)) {
3285	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3286	"Space required after 'PUBLIC'\n", NULL, NULL);
3287	}
3288	SKIP_BLANKS;
3289	*publicID = htmlParsePubidLiteral(ctxt);
3290	if (*publicID == NULL) {
3291	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3292	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
3293	NULL, NULL);
3294	}
3295	SKIP_BLANKS;
3296	if ((CUR == '"') \|\| (CUR == '\'')) {
3297	URI = htmlParseSystemLiteral(ctxt);
3298	}
3299	}
3300	return(URI);
3301	}
3302
3303	/**
3304	* xmlParsePI:
3305	* @ctxt: an XML parser context
3306	*
3307	* parse an XML Processing Instruction.
3308	*
3309	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3310	*/
3311	static void
3312	htmlParsePI(htmlParserCtxtPtr ctxt) {
3313	xmlChar *buf = NULL;
3314	int len = 0;
3315	int size = HTML_PARSER_BUFFER_SIZE;
3316	int cur, l;
3317	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3318	XML_MAX_HUGE_LENGTH :
3319	XML_MAX_TEXT_LENGTH;
3320	const xmlChar *target;
3321	xmlParserInputState state;
3322
3323	if ((RAW == '<') && (NXT(1) == '?')) {
3324	state = ctxt->instate;
3325	ctxt->instate = XML_PARSER_PI;
3326	/*
3327	* this is a Processing Instruction.
3328	*/
3329	SKIP(2);
3330
3331	/*
3332	* Parse the target name and check for special support like
3333	* namespace.
3334	*/
3335	target = htmlParseName(ctxt);
3336	if (target != NULL) {
3337	if (RAW == '>') {
3338	SKIP(1);
3339
3340	/*
3341	* SAX: PI detected.
3342	*/
3343	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3344	(ctxt->sax->processingInstruction != NULL))
3345	ctxt->sax->processingInstruction(ctxt->userData,
3346	target, NULL);
3347	ctxt->instate = state;
3348	return;
3349	}
3350	buf = (xmlChar *) xmlMallocAtomic(size);
3351	if (buf == NULL) {
3352	htmlErrMemory(ctxt, NULL);
3353	ctxt->instate = state;
3354	return;
3355	}
3356	cur = CUR;
3357	if (!IS_BLANK(cur)) {
3358	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3359	"ParsePI: PI %s space expected\n", target, NULL);
3360	}
3361	SKIP_BLANKS;
3362	cur = CUR_CHAR(l);
3363	while ((cur != 0) && (cur != '>')) {
3364	if (len + 5 >= size) {
3365	xmlChar *tmp;
3366
3367	size *= 2;
3368	tmp = (xmlChar *) xmlRealloc(buf, size);
3369	if (tmp == NULL) {
3370	htmlErrMemory(ctxt, NULL);
3371	xmlFree(buf);
3372	ctxt->instate = state;
3373	return;
3374	}
3375	buf = tmp;
3376	}
3377	if (IS_CHAR(cur)) {
3378	COPY_BUF(l,buf,len,cur);
3379	} else {
3380	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3381	"Invalid char in processing instruction "
3382	"0x%X\n", cur);
3383	}
3384	if (len > maxLength) {
3385	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3386	"PI %s too long", target, NULL);
3387	xmlFree(buf);
3388	ctxt->instate = state;
3389	return;
3390	}
3391	NEXTL(l);
3392	cur = CUR_CHAR(l);
3393	}
3394	buf[len] = 0;
3395	if (ctxt->instate == XML_PARSER_EOF) {
3396	xmlFree(buf);
3397	return;
3398	}
3399	if (cur != '>') {
3400	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3401	"ParsePI: PI %s never end ...\n", target, NULL);
3402	} else {
3403	SKIP(1);
3404
3405	/*
3406	* SAX: PI detected.
3407	*/
3408	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3409	(ctxt->sax->processingInstruction != NULL))
3410	ctxt->sax->processingInstruction(ctxt->userData,
3411	target, buf);
3412	}
3413	xmlFree(buf);
3414	} else {
3415	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3416	"PI is not started correctly", NULL, NULL);
3417	}
3418	ctxt->instate = state;
3419	}
3420	}
3421
3422	/**
3423	* htmlParseComment:
3424	* @ctxt: an HTML parser context
3425	*
3426	* Parse an XML (SGML) comment <!-- .... -->
3427	*
3428	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
3429	*/
3430	static void
3431	htmlParseComment(htmlParserCtxtPtr ctxt) {
3432	xmlChar *buf = NULL;
3433	int len;
3434	int size = HTML_PARSER_BUFFER_SIZE;
3435	int q, ql;
3436	int r, rl;
3437	int cur, l;
3438	int next, nl;
3439	int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3440	XML_MAX_HUGE_LENGTH :
3441	XML_MAX_TEXT_LENGTH;
3442	xmlParserInputState state;
3443
3444	/*
3445	* Check that there is a comment right here.
3446	*/
3447	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
3448	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
3449
3450	state = ctxt->instate;
3451	ctxt->instate = XML_PARSER_COMMENT;
3452	SKIP(4);
3453	buf = (xmlChar *) xmlMallocAtomic(size);
3454	if (buf == NULL) {
3455	htmlErrMemory(ctxt, "buffer allocation failed\n");
3456	ctxt->instate = state;
3457	return;
3458	}
3459	len = 0;
3460	buf[len] = 0;
3461	q = CUR_CHAR(ql);
3462	if (q == 0)
3463	goto unfinished;
3464	if (q == '>') {
3465	htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3466	cur = '>';
3467	goto finished;
3468	}
3469	NEXTL(ql);
3470	r = CUR_CHAR(rl);
3471	if (r == 0)
3472	goto unfinished;
3473	if (q == '-' && r == '>') {
3474	htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3475	cur = '>';
3476	goto finished;
3477	}
3478	NEXTL(rl);
3479	cur = CUR_CHAR(l);
3480	while ((cur != 0) &&
3481	((cur != '>') \|\|
3482	(r != '-') \|\| (q != '-'))) {
3483	NEXTL(l);
3484	next = CUR_CHAR(nl);
3485
3486	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3487	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3488	"Comment incorrectly closed by '--!>'", NULL, NULL);
3489	cur = '>';
3490	break;
3491	}
3492
3493	if (len + 5 >= size) {
3494	xmlChar *tmp;
3495
3496	size *= 2;
3497	tmp = (xmlChar *) xmlRealloc(buf, size);
3498	if (tmp == NULL) {
3499	xmlFree(buf);
3500	htmlErrMemory(ctxt, "growing buffer failed\n");
3501	ctxt->instate = state;
3502	return;
3503	}
3504	buf = tmp;
3505	}
3506	if (IS_CHAR(q)) {
3507	COPY_BUF(ql,buf,len,q);
3508	} else {
3509	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3510	"Invalid char in comment 0x%X\n", q);
3511	}
3512	if (len > maxLength) {
3513	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3514	"comment too long", NULL, NULL);
3515	xmlFree(buf);
3516	ctxt->instate = state;
3517	return;
3518	}
3519
3520	q = r;
3521	ql = rl;
3522	r = cur;
3523	rl = l;
3524	cur = next;
3525	l = nl;
3526	}
3527	finished:
3528	buf[len] = 0;
3529	if (ctxt->instate == XML_PARSER_EOF) {
3530	xmlFree(buf);
3531	return;
3532	}
3533	if (cur == '>') {
3534	NEXT;
3535	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3536	(!ctxt->disableSAX))
3537	ctxt->sax->comment(ctxt->userData, buf);
3538	xmlFree(buf);
3539	ctxt->instate = state;
3540	return;
3541	}
3542
3543	unfinished:
3544	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3545	"Comment not terminated \n<!--%.50s\n", buf, NULL);
3546	xmlFree(buf);
3547	}
3548
3549	/**
3550	* htmlParseCharRef:
3551	* @ctxt: an HTML parser context
3552	*
3553	* DEPRECATED: Internal function, don't use.
3554	*
3555	* parse Reference declarations
3556	*
3557	* [66] CharRef ::= '&#' [0-9]+ ';' \|
3558	* '&#x' [0-9a-fA-F]+ ';'
3559	*
3560	* Returns the value parsed (as an int)
3561	*/
3562	int
3563	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3564	int val = 0;
3565
3566	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3567	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3568	"htmlParseCharRef: context error\n",
3569	NULL, NULL);
3570	return(0);
3571	}
3572	if ((CUR == '&') && (NXT(1) == '#') &&
3573	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
3574	SKIP(3);
3575	while (CUR != ';') {
3576	if ((CUR >= '0') && (CUR <= '9')) {
3577	if (val < 0x110000)
3578	val = val * 16 + (CUR - '0');
3579	} else if ((CUR >= 'a') && (CUR <= 'f')) {
3580	if (val < 0x110000)
3581	val = val * 16 + (CUR - 'a') + 10;
3582	} else if ((CUR >= 'A') && (CUR <= 'F')) {
3583	if (val < 0x110000)
3584	val = val * 16 + (CUR - 'A') + 10;
3585	} else {
3586	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3587	"htmlParseCharRef: missing semicolon\n",
3588	NULL, NULL);
3589	break;
3590	}
3591	NEXT;
3592	}
3593	if (CUR == ';')
3594	NEXT;
3595	} else if ((CUR == '&') && (NXT(1) == '#')) {
3596	SKIP(2);
3597	while (CUR != ';') {
3598	if ((CUR >= '0') && (CUR <= '9')) {
3599	if (val < 0x110000)
3600	val = val * 10 + (CUR - '0');
3601	} else {
3602	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3603	"htmlParseCharRef: missing semicolon\n",
3604	NULL, NULL);
3605	break;
3606	}
3607	NEXT;
3608	}
3609	if (CUR == ';')
3610	NEXT;
3611	} else {
3612	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3613	"htmlParseCharRef: invalid value\n", NULL, NULL);
3614	}
3615	/*
3616	* Check the value IS_CHAR ...
3617	*/
3618	if (IS_CHAR(val)) {
3619	return(val);
3620	} else if (val >= 0x110000) {
3621	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3622	"htmlParseCharRef: value too large\n", NULL, NULL);
3623	} else {
3624	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3625	"htmlParseCharRef: invalid xmlChar value %d\n",
3626	val);
3627	}
3628	return(0);
3629	}
3630
3631
3632	/**
3633	* htmlParseDocTypeDecl:
3634	* @ctxt: an HTML parser context
3635	*
3636	* parse a DOCTYPE declaration
3637	*
3638	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3639	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
3640	*/
3641
3642	static void
3643	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3644	const xmlChar *name;
3645	xmlChar *ExternalID = NULL;
3646	xmlChar *URI = NULL;
3647
3648	/*
3649	* We know that '<!DOCTYPE' has been detected.
3650	*/
3651	SKIP(9);
3652
3653	SKIP_BLANKS;
3654
3655	/*
3656	* Parse the DOCTYPE name.
3657	*/
3658	name = htmlParseName(ctxt);
3659	if (name == NULL) {
3660	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3661	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3662	NULL, NULL);
3663	}
3664	/*
3665	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3666	*/
3667
3668	SKIP_BLANKS;
3669
3670	/*
3671	* Check for SystemID and ExternalID
3672	*/
3673	URI = htmlParseExternalID(ctxt, &ExternalID);
3674	SKIP_BLANKS;
3675
3676	/*
3677	* We should be at the end of the DOCTYPE declaration.
3678	*/
3679	if (CUR != '>') {
3680	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3681	"DOCTYPE improperly terminated\n", NULL, NULL);
3682	/* Ignore bogus content */
3683	while ((CUR != 0) && (CUR != '>') &&
3684	(ctxt->instate != XML_PARSER_EOF))
3685	NEXT;
3686	}
3687	if (CUR == '>')
3688	NEXT;
3689
3690	/*
3691	* Create or update the document accordingly to the DOCTYPE
3692	*/
3693	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3694	(!ctxt->disableSAX))
3695	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3696
3697	/*
3698	* Cleanup, since we don't use all those identifiers
3699	*/
3700	if (URI != NULL) xmlFree(URI);
3701	if (ExternalID != NULL) xmlFree(ExternalID);
3702	}
3703
3704	/**
3705	* htmlParseAttribute:
3706	* @ctxt: an HTML parser context
3707	* @value: a xmlChar ** used to store the value of the attribute
3708	*
3709	* parse an attribute
3710	*
3711	* [41] Attribute ::= Name Eq AttValue
3712	*
3713	* [25] Eq ::= S? '=' S?
3714	*
3715	* With namespace:
3716	*
3717	* [NS 11] Attribute ::= QName Eq AttValue
3718	*
3719	* Also the case QName == xmlns:??? is handled independently as a namespace
3720	* definition.
3721	*
3722	* Returns the attribute name, and the value in *value.
3723	*/
3724
3725	static const xmlChar *
3726	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3727	const xmlChar *name;
3728	xmlChar *val = NULL;
3729
3730	*value = NULL;
3731	name = htmlParseHTMLName(ctxt);
3732	if (name == NULL) {
3733	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3734	"error parsing attribute name\n", NULL, NULL);
3735	return(NULL);
3736	}
3737
3738	/*
3739	* read the value
3740	*/
3741	SKIP_BLANKS;
3742	if (CUR == '=') {
3743	NEXT;
3744	SKIP_BLANKS;
3745	val = htmlParseAttValue(ctxt);
3746	}
3747
3748	*value = val;
3749	return(name);
3750	}
3751
3752	/**
3753	* htmlCheckEncoding:
3754	* @ctxt: an HTML parser context
3755	* @attvalue: the attribute value
3756	*
3757	* Checks an http-equiv attribute from a Meta tag to detect
3758	* the encoding
3759	* If a new encoding is detected the parser is switched to decode
3760	* it and pass UTF8
3761	*/
3762	static void
3763	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3764	const xmlChar *encoding;
3765
3766	if (!attvalue)
3767	return;
3768
3769	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3770	if (encoding != NULL) {
3771	encoding += 7;
3772	}
3773	/*
3774	* skip blank
3775	*/
3776	if (encoding && IS_BLANK_CH(*encoding))
3777	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3778	if (encoding && *encoding == '=') {
3779	encoding ++;
3780	xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
3781	}
3782	}
3783
3784	/**
3785	* htmlCheckMeta:
3786	* @ctxt: an HTML parser context
3787	* @atts: the attributes values
3788	*
3789	* Checks an attributes from a Meta tag
3790	*/
3791	static void
3792	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3793	int i;
3794	const xmlChar att, value;
3795	int http = 0;
3796	const xmlChar *content = NULL;
3797
3798	if ((ctxt == NULL) \|\| (atts == NULL))
3799	return;
3800
3801	i = 0;
3802	att = atts[i++];
3803	while (att != NULL) {
3804	value = atts[i++];
3805	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3806	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3807	http = 1;
3808	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3809	xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
3810	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3811	content = value;
3812	att = atts[i++];
3813	}
3814	if ((http) && (content != NULL))
3815	htmlCheckEncoding(ctxt, content);
3816
3817	}
3818
3819	/**
3820	* htmlParseStartTag:
3821	* @ctxt: an HTML parser context
3822	*
3823	* parse a start of tag either for rule element or
3824	* EmptyElement. In both case we don't parse the tag closing chars.
3825	*
3826	* [40] STag ::= '<' Name (S Attribute)* S? '>'
3827	*
3828	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3829	*
3830	* With namespace:
3831	*
3832	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3833	*
3834	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3835	*
3836	* Returns 0 in case of success, -1 in case of error and 1 if discarded
3837	*/
3838
3839	static int
3840	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3841	const xmlChar *name;
3842	const xmlChar *attname;
3843	xmlChar *attvalue;
3844	const xmlChar **atts;
3845	int nbatts = 0;
3846	int maxatts;
3847	int meta = 0;
3848	int i;
3849	int discardtag = 0;
3850
3851	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3852	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3853	"htmlParseStartTag: context error\n", NULL, NULL);
3854	return -1;
3855	}
3856	if (ctxt->instate == XML_PARSER_EOF)
3857	return(-1);
3858	if (CUR != '<') return -1;
3859	NEXT;
3860
3861	atts = ctxt->atts;
3862	maxatts = ctxt->maxatts;
3863
3864	GROW;
3865	name = htmlParseHTMLName(ctxt);
3866	if (name == NULL) {
3867	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3868	"htmlParseStartTag: invalid element name\n",
3869	NULL, NULL);
3870	/* Dump the bogus tag like browsers do */
3871	while ((CUR != 0) && (CUR != '>') &&
3872	(ctxt->instate != XML_PARSER_EOF))
3873	NEXT;
3874	return -1;
3875	}
3876	if (xmlStrEqual(name, BAD_CAST"meta"))
3877	meta = 1;
3878
3879	/*
3880	* Check for auto-closure of HTML elements.
3881	*/
3882	htmlAutoClose(ctxt, name);
3883
3884	/*
3885	* Check for implied HTML elements.
3886	*/
3887	htmlCheckImplied(ctxt, name);
3888
3889	/*
3890	* Avoid html at any level > 0, head at any level != 1
3891	* or any attempt to recurse body
3892	*/
3893	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3894	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3895	"htmlParseStartTag: misplaced <html> tag\n",
3896	name, NULL);
3897	discardtag = 1;
3898	ctxt->depth++;
3899	}
3900	if ((ctxt->nameNr != 1) &&
3901	(xmlStrEqual(name, BAD_CAST"head"))) {
3902	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3903	"htmlParseStartTag: misplaced <head> tag\n",
3904	name, NULL);
3905	discardtag = 1;
3906	ctxt->depth++;
3907	}
3908	if (xmlStrEqual(name, BAD_CAST"body")) {
3909	int indx;
3910	for (indx = 0;indx < ctxt->nameNr;indx++) {
3911	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3912	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3913	"htmlParseStartTag: misplaced <body> tag\n",
3914	name, NULL);
3915	discardtag = 1;
3916	ctxt->depth++;
3917	}
3918	}
3919	}
3920
3921	/*
3922	* Now parse the attributes, it ends up with the ending
3923	*
3924	* (S Attribute)* S?
3925	*/
3926	SKIP_BLANKS;
3927	while ((CUR != 0) &&
3928	(CUR != '>') &&
3929	((CUR != '/') \|\| (NXT(1) != '>')) &&
3930	(ctxt->instate != XML_PARSER_EOF)) {
3931	GROW;
3932	attname = htmlParseAttribute(ctxt, &attvalue);
3933	if (attname != NULL) {
3934
3935	/*
3936	* Well formedness requires at most one declaration of an attribute
3937	*/
3938	for (i = 0; i < nbatts;i += 2) {
3939	if (xmlStrEqual(atts[i], attname)) {
3940	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3941	"Attribute %s redefined\n", attname, NULL);
3942	if (attvalue != NULL)
3943	xmlFree(attvalue);
3944	goto failed;
3945	}
3946	}
3947
3948	/*
3949	* Add the pair to atts
3950	*/
3951	if (atts == NULL) {
3952	maxatts = 22; /* allow for 10 attrs by default */
3953	atts = (const xmlChar **)
3954	xmlMalloc(maxatts * sizeof(xmlChar *));
3955	if (atts == NULL) {
3956	htmlErrMemory(ctxt, NULL);
3957	if (attvalue != NULL)
3958	xmlFree(attvalue);
3959	goto failed;
3960	}
3961	ctxt->atts = atts;
3962	ctxt->maxatts = maxatts;
3963	} else if (nbatts + 4 > maxatts) {
3964	const xmlChar **n;
3965
3966	maxatts *= 2;
3967	n = (const xmlChar *) xmlRealloc((void ) atts,
3968	maxatts * sizeof(const xmlChar *));
3969	if (n == NULL) {
3970	htmlErrMemory(ctxt, NULL);
3971	if (attvalue != NULL)
3972	xmlFree(attvalue);
3973	goto failed;
3974	}
3975	atts = n;
3976	ctxt->atts = atts;
3977	ctxt->maxatts = maxatts;
3978	}
3979	atts[nbatts++] = attname;
3980	atts[nbatts++] = attvalue;
3981	atts[nbatts] = NULL;
3982	atts[nbatts + 1] = NULL;
3983	}
3984	else {
3985	if (attvalue != NULL)
3986	xmlFree(attvalue);
3987	/* Dump the bogus attribute string up to the next blank or
3988	* the end of the tag. */
3989	while ((CUR != 0) &&
3990	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3991	((CUR != '/') \|\| (NXT(1) != '>')) &&
3992	(ctxt->instate != XML_PARSER_EOF))
3993	NEXT;
3994	}
3995
3996	failed:
3997	SKIP_BLANKS;
3998	}
3999
4000	/*
4001	* Handle specific association to the META tag
4002	*/
4003	if (meta && (nbatts != 0))
4004	htmlCheckMeta(ctxt, atts);
4005
4006	/*
4007	* SAX: Start of Element !
4008	*/
4009	if (!discardtag) {
4010	htmlnamePush(ctxt, name);
4011	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4012	if (nbatts != 0)
4013	ctxt->sax->startElement(ctxt->userData, name, atts);
4014	else
4015	ctxt->sax->startElement(ctxt->userData, name, NULL);
4016	}
4017	}
4018
4019	if (atts != NULL) {
4020	for (i = 1;i < nbatts;i += 2) {
4021	if (atts[i] != NULL)
4022	xmlFree((xmlChar *) atts[i]);
4023	}
4024	}
4025
4026	return(discardtag);
4027	}
4028
4029	/**
4030	* htmlParseEndTag:
4031	* @ctxt: an HTML parser context
4032	*
4033	* parse an end of tag
4034	*
4035	* [42] ETag ::= '</' Name S? '>'
4036	*
4037	* With namespace
4038	*
4039	* [NS 9] ETag ::= '</' QName S? '>'
4040	*
4041	* Returns 1 if the current level should be closed.
4042	*/
4043
4044	static int
4045	htmlParseEndTag(htmlParserCtxtPtr ctxt)
4046	{
4047	const xmlChar *name;
4048	const xmlChar *oldname;
4049	int i, ret;
4050
4051	if ((CUR != '<') \|\| (NXT(1) != '/')) {
4052	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4053	"htmlParseEndTag: '</' not found\n", NULL, NULL);
4054	return (0);
4055	}
4056	SKIP(2);
4057
4058	name = htmlParseHTMLName(ctxt);
4059	if (name == NULL)
4060	return (0);
4061	/*
4062	* We should definitely be at the ending "S? '>'" part
4063	*/
4064	SKIP_BLANKS;
4065	if (CUR != '>') {
4066	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4067	"End tag : expected '>'\n", NULL, NULL);
4068	/* Skip to next '>' */
4069	while ((CUR != 0) && (CUR != '>'))
4070	NEXT;
4071	}
4072	if (CUR == '>')
4073	NEXT;
4074
4075	/*
4076	* if we ignored misplaced tags in htmlParseStartTag don't pop them
4077	* out now.
4078	*/
4079	if ((ctxt->depth > 0) &&
4080	(xmlStrEqual(name, BAD_CAST "html") \|\|
4081	xmlStrEqual(name, BAD_CAST "body") \|\|
4082	xmlStrEqual(name, BAD_CAST "head"))) {
4083	ctxt->depth--;
4084	return (0);
4085	}
4086
4087	/*
4088	* If the name read is not one of the element in the parsing stack
4089	* then return, it's just an error.
4090	*/
4091	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4092	if (xmlStrEqual(name, ctxt->nameTab[i]))
4093	break;
4094	}
4095	if (i < 0) {
4096	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4097	"Unexpected end tag : %s\n", name, NULL);
4098	return (0);
4099	}
4100
4101
4102	/*
4103	* Check for auto-closure of HTML elements.
4104	*/
4105
4106	htmlAutoCloseOnClose(ctxt, name);
4107
4108	/*
4109	* Well formedness constraints, opening and closing must match.
4110	* With the exception that the autoclose may have popped stuff out
4111	* of the stack.
4112	*/
4113	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4114	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4115	"Opening and ending tag mismatch: %s and %s\n",
4116	name, ctxt->name);
4117	}
4118
4119	/*
4120	* SAX: End of Tag
4121	*/
4122	oldname = ctxt->name;
4123	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4124	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4125	ctxt->sax->endElement(ctxt->userData, name);
4126	htmlNodeInfoPop(ctxt);
4127	htmlnamePop(ctxt);
4128	ret = 1;
4129	} else {
4130	ret = 0;
4131	}
4132
4133	return (ret);
4134	}
4135
4136
4137	/**
4138	* htmlParseReference:
4139	* @ctxt: an HTML parser context
4140	*
4141	* parse and handle entity references in content,
4142	* this will end-up in a call to character() since this is either a
4143	* CharRef, or a predefined entity.
4144	*/
4145	static void
4146	htmlParseReference(htmlParserCtxtPtr ctxt) {
4147	const htmlEntityDesc * ent;
4148	xmlChar out[6];
4149	const xmlChar *name;
4150	if (CUR != '&') return;
4151
4152	if (NXT(1) == '#') {
4153	unsigned int c;
4154	int bits, i = 0;
4155
4156	c = htmlParseCharRef(ctxt);
4157	if (c == 0)
4158	return;
4159
4160	if (c < 0x80) { out[i++]= c; bits= -6; }
4161	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4162	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4163	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4164
4165	for ( ; bits >= 0; bits-= 6) {
4166	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4167	}
4168	out[i] = 0;
4169
4170	htmlCheckParagraph(ctxt);
4171	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4172	ctxt->sax->characters(ctxt->userData, out, i);
4173	} else {
4174	ent = htmlParseEntityRef(ctxt, &name);
4175	if (name == NULL) {
4176	htmlCheckParagraph(ctxt);
4177	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4178	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4179	return;
4180	}
4181	if ((ent == NULL) \|\| !(ent->value > 0)) {
4182	htmlCheckParagraph(ctxt);
4183	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4184	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4185	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4186	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4187	}
4188	} else {
4189	unsigned int c;
4190	int bits, i = 0;
4191
4192	c = ent->value;
4193	if (c < 0x80)
4194	{ out[i++]= c; bits= -6; }
4195	else if (c < 0x800)
4196	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4197	else if (c < 0x10000)
4198	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4199	else
4200	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4201
4202	for ( ; bits >= 0; bits-= 6) {
4203	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4204	}
4205	out[i] = 0;
4206
4207	htmlCheckParagraph(ctxt);
4208	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4209	ctxt->sax->characters(ctxt->userData, out, i);
4210	}
4211	}
4212	}
4213
4214	/**
4215	* htmlParseContent:
4216	* @ctxt: an HTML parser context
4217	*
4218	* Parse a content: comment, sub-element, reference or text.
4219	* Kept for compatibility with old code
4220	*/
4221
4222	static void
4223	htmlParseContent(htmlParserCtxtPtr ctxt) {
4224	xmlChar *currentNode;
4225	int depth;
4226	const xmlChar *name;
4227
4228	currentNode = xmlStrdup(ctxt->name);
4229	depth = ctxt->nameNr;
4230	while (1) {
4231	GROW;
4232
4233	if (ctxt->instate == XML_PARSER_EOF)
4234	break;
4235
4236	/*
4237	* Our tag or one of it's parent or children is ending.
4238	*/
4239	if ((CUR == '<') && (NXT(1) == '/')) {
4240	if (htmlParseEndTag(ctxt) &&
4241	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4242	if (currentNode != NULL)
4243	xmlFree(currentNode);
4244	return;
4245	}
4246	continue; /* while */
4247	}
4248
4249	else if ((CUR == '<') &&
4250	((IS_ASCII_LETTER(NXT(1))) \|\|
4251	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4252	name = htmlParseHTMLName_nonInvasive(ctxt);
4253	if (name == NULL) {
4254	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4255	"htmlParseStartTag: invalid element name\n",
4256	NULL, NULL);
4257	/* Dump the bogus tag like browsers do */
4258	while ((CUR != 0) && (CUR != '>'))
4259	NEXT;
4260
4261	if (currentNode != NULL)
4262	xmlFree(currentNode);
4263	return;
4264	}
4265
4266	if (ctxt->name != NULL) {
4267	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4268	htmlAutoClose(ctxt, name);
4269	continue;
4270	}
4271	}
4272	}
4273
4274	/*
4275	* Has this node been popped out during parsing of
4276	* the next element
4277	*/
4278	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4279	(!xmlStrEqual(currentNode, ctxt->name)))
4280	{
4281	if (currentNode != NULL) xmlFree(currentNode);
4282	return;
4283	}
4284
4285	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4286	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4287	/*
4288	* Handle SCRIPT/STYLE separately
4289	*/
4290	htmlParseScript(ctxt);
4291	}
4292
4293	else if ((CUR == '<') && (NXT(1) == '!')) {
4294	/*
4295	* Sometimes DOCTYPE arrives in the middle of the document
4296	*/
4297	if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4298	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4299	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4300	(UPP(8) == 'E')) {
4301	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4302	"Misplaced DOCTYPE declaration\n",
4303	BAD_CAST "DOCTYPE" , NULL);
4304	htmlParseDocTypeDecl(ctxt);
4305	}
4306	/*
4307	* First case : a comment
4308	*/
4309	else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4310	htmlParseComment(ctxt);
4311	}
4312	else {
4313	htmlSkipBogusComment(ctxt);
4314	}
4315	}
4316
4317	/*
4318	* Second case : a Processing Instruction.
4319	*/
4320	else if ((CUR == '<') && (NXT(1) == '?')) {
4321	htmlParsePI(ctxt);
4322	}
4323
4324	/*
4325	* Third case : a sub-element.
4326	*/
4327	else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4328	htmlParseElement(ctxt);
4329	}
4330	else if (CUR == '<') {
4331	if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4332	(ctxt->sax->characters != NULL))
4333	ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4334	NEXT;
4335	}
4336
4337	/*
4338	* Fourth case : a reference. If if has not been resolved,
4339	* parsing returns it's Name, create the node
4340	*/
4341	else if (CUR == '&') {
4342	htmlParseReference(ctxt);
4343	}
4344
4345	/*
4346	* Fifth case : end of the resource
4347	*/
4348	else if (CUR == 0) {
4349	htmlAutoCloseOnEnd(ctxt);
4350	break;
4351	}
4352
4353	/*
4354	* Last case, text. Note that References are handled directly.
4355	*/
4356	else {
4357	htmlParseCharData(ctxt);
4358	}
4359
4360	SHRINK;
4361	GROW;
4362	}
4363	if (currentNode != NULL) xmlFree(currentNode);
4364	}
4365
4366	/**
4367	* htmlParseElement:
4368	* @ctxt: an HTML parser context
4369	*
4370	* DEPRECATED: Internal function, don't use.
4371	*
4372	* parse an HTML element, this is highly recursive
4373	* this is kept for compatibility with previous code versions
4374	*
4375	* [39] element ::= EmptyElemTag \| STag content ETag
4376	*
4377	* [41] Attribute ::= Name Eq AttValue
4378	*/
4379
4380	void
4381	htmlParseElement(htmlParserCtxtPtr ctxt) {
4382	const xmlChar *name;
4383	xmlChar *currentNode = NULL;
4384	const htmlElemDesc * info;
4385	htmlParserNodeInfo node_info;
4386	int failed;
4387	int depth;
4388	const xmlChar *oldptr;
4389
4390	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4391	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4392	"htmlParseElement: context error\n", NULL, NULL);
4393	return;
4394	}
4395
4396	if (ctxt->instate == XML_PARSER_EOF)
4397	return;
4398
4399	/* Capture start position */
4400	if (ctxt->record_info) {
4401	node_info.begin_pos = ctxt->input->consumed +
4402	(CUR_PTR - ctxt->input->base);
4403	node_info.begin_line = ctxt->input->line;
4404	}
4405
4406	failed = htmlParseStartTag(ctxt);
4407	name = ctxt->name;
4408	if ((failed == -1) \|\| (name == NULL)) {
4409	if (CUR == '>')
4410	NEXT;
4411	return;
4412	}
4413
4414	/*
4415	* Lookup the info for that element.
4416	*/
4417	info = htmlTagLookup(name);
4418	if (info == NULL) {
4419	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4420	"Tag %s invalid\n", name, NULL);
4421	}
4422
4423	/*
4424	* Check for an Empty Element labeled the XML/SGML way
4425	*/
4426	if ((CUR == '/') && (NXT(1) == '>')) {
4427	SKIP(2);
4428	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4429	ctxt->sax->endElement(ctxt->userData, name);
4430	htmlnamePop(ctxt);
4431	return;
4432	}
4433
4434	if (CUR == '>') {
4435	NEXT;
4436	} else {
4437	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4438	"Couldn't find end of Start Tag %s\n", name, NULL);
4439
4440	/*
4441	* end of parsing of this node.
4442	*/
4443	if (xmlStrEqual(name, ctxt->name)) {
4444	nodePop(ctxt);
4445	htmlnamePop(ctxt);
4446	}
4447
4448	/*
4449	* Capture end position and add node
4450	*/
4451	if (ctxt->record_info) {
4452	node_info.end_pos = ctxt->input->consumed +
4453	(CUR_PTR - ctxt->input->base);
4454	node_info.end_line = ctxt->input->line;
4455	node_info.node = ctxt->node;
4456	xmlParserAddNodeInfo(ctxt, &node_info);
4457	}
4458	return;
4459	}
4460
4461	/*
4462	* Check for an Empty Element from DTD definition
4463	*/
4464	if ((info != NULL) && (info->empty)) {
4465	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4466	ctxt->sax->endElement(ctxt->userData, name);
4467	htmlnamePop(ctxt);
4468	return;
4469	}
4470
4471	/*
4472	* Parse the content of the element:
4473	*/
4474	currentNode = xmlStrdup(ctxt->name);
4475	depth = ctxt->nameNr;
4476	while (CUR != 0) {
4477	oldptr = ctxt->input->cur;
4478	htmlParseContent(ctxt);
4479	if (oldptr==ctxt->input->cur) break;
4480	if (ctxt->nameNr < depth) break;
4481	}
4482
4483	/*
4484	* Capture end position and add node
4485	*/
4486	if ( currentNode != NULL && ctxt->record_info ) {
4487	node_info.end_pos = ctxt->input->consumed +
4488	(CUR_PTR - ctxt->input->base);
4489	node_info.end_line = ctxt->input->line;
4490	node_info.node = ctxt->node;
4491	xmlParserAddNodeInfo(ctxt, &node_info);
4492	}
4493	if (CUR == 0) {
4494	htmlAutoCloseOnEnd(ctxt);
4495	}
4496
4497	if (currentNode != NULL)
4498	xmlFree(currentNode);
4499	}
4500
4501	static void
4502	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4503	/*
4504	* Capture end position and add node
4505	*/
4506	if ( ctxt->node != NULL && ctxt->record_info ) {
4507	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4508	(CUR_PTR - ctxt->input->base);
4509	ctxt->nodeInfo->end_line = ctxt->input->line;
4510	ctxt->nodeInfo->node = ctxt->node;
4511	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4512	htmlNodeInfoPop(ctxt);
4513	}
4514	if (CUR == 0) {
4515	htmlAutoCloseOnEnd(ctxt);
4516	}
4517	}
4518
4519	/**
4520	* htmlParseElementInternal:
4521	* @ctxt: an HTML parser context
4522	*
4523	* parse an HTML element, new version, non recursive
4524	*
4525	* [39] element ::= EmptyElemTag \| STag content ETag
4526	*
4527	* [41] Attribute ::= Name Eq AttValue
4528	*/
4529
4530	static void
4531	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4532	const xmlChar *name;
4533	const htmlElemDesc * info;
4534	htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4535	int failed;
4536
4537	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4538	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4539	"htmlParseElementInternal: context error\n", NULL, NULL);
4540	return;
4541	}
4542
4543	if (ctxt->instate == XML_PARSER_EOF)
4544	return;
4545
4546	/* Capture start position */
4547	if (ctxt->record_info) {
4548	node_info.begin_pos = ctxt->input->consumed +
4549	(CUR_PTR - ctxt->input->base);
4550	node_info.begin_line = ctxt->input->line;
4551	}
4552
4553	failed = htmlParseStartTag(ctxt);
4554	name = ctxt->name;
4555	if ((failed == -1) \|\| (name == NULL)) {
4556	if (CUR == '>')
4557	NEXT;
4558	return;
4559	}
4560
4561	/*
4562	* Lookup the info for that element.
4563	*/
4564	info = htmlTagLookup(name);
4565	if (info == NULL) {
4566	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4567	"Tag %s invalid\n", name, NULL);
4568	}
4569
4570	/*
4571	* Check for an Empty Element labeled the XML/SGML way
4572	*/
4573	if ((CUR == '/') && (NXT(1) == '>')) {
4574	SKIP(2);
4575	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4576	ctxt->sax->endElement(ctxt->userData, name);
4577	htmlnamePop(ctxt);
4578	return;
4579	}
4580
4581	if (CUR == '>') {
4582	NEXT;
4583	} else {
4584	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4585	"Couldn't find end of Start Tag %s\n", name, NULL);
4586
4587	/*
4588	* end of parsing of this node.
4589	*/
4590	if (xmlStrEqual(name, ctxt->name)) {
4591	nodePop(ctxt);
4592	htmlnamePop(ctxt);
4593	}
4594
4595	if (ctxt->record_info)
4596	htmlNodeInfoPush(ctxt, &node_info);
4597	htmlParserFinishElementParsing(ctxt);
4598	return;
4599	}
4600
4601	/*
4602	* Check for an Empty Element from DTD definition
4603	*/
4604	if ((info != NULL) && (info->empty)) {
4605	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4606	ctxt->sax->endElement(ctxt->userData, name);
4607	htmlnamePop(ctxt);
4608	return;
4609	}
4610
4611	if (ctxt->record_info)
4612	htmlNodeInfoPush(ctxt, &node_info);
4613	}
4614
4615	/**
4616	* htmlParseContentInternal:
4617	* @ctxt: an HTML parser context
4618	*
4619	* Parse a content: comment, sub-element, reference or text.
4620	* New version for non recursive htmlParseElementInternal
4621	*/
4622
4623	static void
4624	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4625	xmlChar *currentNode;
4626	int depth;
4627	const xmlChar *name;
4628
4629	depth = ctxt->nameNr;
4630	if (depth <= 0) {
4631	currentNode = NULL;
4632	} else {
4633	currentNode = xmlStrdup(ctxt->name);
4634	if (currentNode == NULL) {
4635	htmlErrMemory(ctxt, NULL);
4636	return;
4637	}
4638	}
4639	while (1) {
4640	GROW;
4641
4642	if (ctxt->instate == XML_PARSER_EOF)
4643	break;
4644
4645	/*
4646	* Our tag or one of it's parent or children is ending.
4647	*/
4648	if ((CUR == '<') && (NXT(1) == '/')) {
4649	if (htmlParseEndTag(ctxt) &&
4650	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4651	if (currentNode != NULL)
4652	xmlFree(currentNode);
4653
4654	depth = ctxt->nameNr;
4655	if (depth <= 0) {
4656	currentNode = NULL;
4657	} else {
4658	currentNode = xmlStrdup(ctxt->name);
4659	if (currentNode == NULL) {
4660	htmlErrMemory(ctxt, NULL);
4661	break;
4662	}
4663	}
4664	}
4665	continue; /* while */
4666	}
4667
4668	else if ((CUR == '<') &&
4669	((IS_ASCII_LETTER(NXT(1))) \|\|
4670	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4671	name = htmlParseHTMLName_nonInvasive(ctxt);
4672	if (name == NULL) {
4673	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4674	"htmlParseStartTag: invalid element name\n",
4675	NULL, NULL);
4676	/* Dump the bogus tag like browsers do */
4677	while ((CUR == 0) && (CUR != '>'))
4678	NEXT;
4679
4680	htmlParserFinishElementParsing(ctxt);
4681	if (currentNode != NULL)
4682	xmlFree(currentNode);
4683
4684	currentNode = xmlStrdup(ctxt->name);
4685	if (currentNode == NULL) {
4686	htmlErrMemory(ctxt, NULL);
4687	break;
4688	}
4689	depth = ctxt->nameNr;
4690	continue;
4691	}
4692
4693	if (ctxt->name != NULL) {
4694	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4695	htmlAutoClose(ctxt, name);
4696	continue;
4697	}
4698	}
4699	}
4700
4701	/*
4702	* Has this node been popped out during parsing of
4703	* the next element
4704	*/
4705	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4706	(!xmlStrEqual(currentNode, ctxt->name)))
4707	{
4708	htmlParserFinishElementParsing(ctxt);
4709	if (currentNode != NULL) xmlFree(currentNode);
4710
4711	currentNode = xmlStrdup(ctxt->name);
4712	if (currentNode == NULL) {
4713	htmlErrMemory(ctxt, NULL);
4714	break;
4715	}
4716	depth = ctxt->nameNr;
4717	continue;
4718	}
4719
4720	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4721	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4722	/*
4723	* Handle SCRIPT/STYLE separately
4724	*/
4725	htmlParseScript(ctxt);
4726	}
4727
4728	else if ((CUR == '<') && (NXT(1) == '!')) {
4729	/*
4730	* Sometimes DOCTYPE arrives in the middle of the document
4731	*/
4732	if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4733	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4734	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4735	(UPP(8) == 'E')) {
4736	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4737	"Misplaced DOCTYPE declaration\n",
4738	BAD_CAST "DOCTYPE" , NULL);
4739	htmlParseDocTypeDecl(ctxt);
4740	}
4741	/*
4742	* First case : a comment
4743	*/
4744	else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4745	htmlParseComment(ctxt);
4746	}
4747	else {
4748	htmlSkipBogusComment(ctxt);
4749	}
4750	}
4751
4752	/*
4753	* Second case : a Processing Instruction.
4754	*/
4755	else if ((CUR == '<') && (NXT(1) == '?')) {
4756	htmlParsePI(ctxt);
4757	}
4758
4759	/*
4760	* Third case : a sub-element.
4761	*/
4762	else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4763	htmlParseElementInternal(ctxt);
4764	if (currentNode != NULL) xmlFree(currentNode);
4765
4766	currentNode = xmlStrdup(ctxt->name);
4767	if (currentNode == NULL) {
4768	htmlErrMemory(ctxt, NULL);
4769	break;
4770	}
4771	depth = ctxt->nameNr;
4772	}
4773	else if (CUR == '<') {
4774	if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4775	(ctxt->sax->characters != NULL))
4776	ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4777	NEXT;
4778	}
4779
4780	/*
4781	* Fourth case : a reference. If if has not been resolved,
4782	* parsing returns it's Name, create the node
4783	*/
4784	else if (CUR == '&') {
4785	htmlParseReference(ctxt);
4786	}
4787
4788	/*
4789	* Fifth case : end of the resource
4790	*/
4791	else if (CUR == 0) {
4792	htmlAutoCloseOnEnd(ctxt);
4793	break;
4794	}
4795
4796	/*
4797	* Last case, text. Note that References are handled directly.
4798	*/
4799	else {
4800	htmlParseCharData(ctxt);
4801	}
4802
4803	SHRINK;
4804	GROW;
4805	}
4806	if (currentNode != NULL) xmlFree(currentNode);
4807	}
4808
4809	/**
4810	* htmlParseContent:
4811	* @ctxt: an HTML parser context
4812	*
4813	* Parse a content: comment, sub-element, reference or text.
4814	* This is the entry point when called from parser.c
4815	*/
4816
4817	void
4818	__htmlParseContent(void *ctxt) {
4819	if (ctxt != NULL)
4820	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4821	}
4822
4823	/**
4824	* htmlParseDocument:
4825	* @ctxt: an HTML parser context
4826	*
4827	* parse an HTML document (and build a tree if using the standard SAX
4828	* interface).
4829	*
4830	* Returns 0, -1 in case of error. the parser context is augmented
4831	* as a result of the parsing.
4832	*/
4833
4834	int
4835	htmlParseDocument(htmlParserCtxtPtr ctxt) {
4836	xmlDtdPtr dtd;
4837
4838	xmlInitParser();
4839
4840	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4841	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842	"htmlParseDocument: context error\n", NULL, NULL);
4843	return(XML_ERR_INTERNAL_ERROR);
4844	}
4845
4846	/*
4847	* SAX: beginning of the document processing.
4848	*/
4849	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4850	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4851
4852	xmlDetectEncoding(ctxt);
4853
4854	/*
4855	* This is wrong but matches long-standing behavior. In most cases,
4856	* a document starting with an XML declaration will specify UTF-8.
4857	*/
4858	if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4859	(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4860	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4861
4862	/*
4863	* Wipe out everything which is before the first '<'
4864	*/
4865	SKIP_BLANKS;
4866	if (CUR == 0) {
4867	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4868	"Document is empty\n", NULL, NULL);
4869	}
4870
4871	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4872	ctxt->sax->startDocument(ctxt->userData);
4873
4874
4875	/*
4876	* Parse possible comments and PIs before any content
4877	*/
4878	while (((CUR == '<') && (NXT(1) == '!') &&
4879	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4880	((CUR == '<') && (NXT(1) == '?'))) {
4881	htmlParseComment(ctxt);
4882	htmlParsePI(ctxt);
4883	SKIP_BLANKS;
4884	}
4885
4886
4887	/*
4888	* Then possibly doc type declaration(s) and more Misc
4889	* (doctypedecl Misc*)?
4890	*/
4891	if ((CUR == '<') && (NXT(1) == '!') &&
4892	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4893	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4894	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4895	(UPP(8) == 'E')) {
4896	htmlParseDocTypeDecl(ctxt);
4897	}
4898	SKIP_BLANKS;
4899
4900	/*
4901	* Parse possible comments and PIs before any content
4902	*/
4903	while (((CUR == '<') && (NXT(1) == '!') &&
4904	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4905	((CUR == '<') && (NXT(1) == '?'))) {
4906	htmlParseComment(ctxt);
4907	htmlParsePI(ctxt);
4908	SKIP_BLANKS;
4909	}
4910
4911	/*
4912	* Time to start parsing the tree itself
4913	*/
4914	htmlParseContentInternal(ctxt);
4915
4916	/*
4917	* autoclose
4918	*/
4919	if (CUR == 0)
4920	htmlAutoCloseOnEnd(ctxt);
4921
4922
4923	/*
4924	* SAX: end of the document processing.
4925	*/
4926	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4927	ctxt->sax->endDocument(ctxt->userData);
4928
4929	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4930	dtd = xmlGetIntSubset(ctxt->myDoc);
4931	if (dtd == NULL)
4932	ctxt->myDoc->intSubset =
4933	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4934	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4935	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4936	}
4937	if (! ctxt->wellFormed) return(-1);
4938	return(0);
4939	}
4940
4941
4942	/************************************************************************
4943	* *
4944	* Parser contexts handling *
4945	* *
4946	************************************************************************/
4947
4948	/**
4949	* htmlInitParserCtxt:
4950	* @ctxt: an HTML parser context
4951	* @sax: SAX handler
4952	* @userData: user data
4953	*
4954	* Initialize a parser context
4955	*
4956	* Returns 0 in case of success and -1 in case of error
4957	*/
4958
4959	static int
4960	htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4961	void *userData)
4962	{
4963	if (ctxt == NULL) return(-1);
4964	memset(ctxt, 0, sizeof(htmlParserCtxt));
4965
4966	ctxt->dict = xmlDictCreate();
4967	if (ctxt->dict == NULL) {
4968	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4969	return(-1);
4970	}
4971
4972	if (ctxt->sax == NULL)
4973	ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4974	if (ctxt->sax == NULL) {
4975	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4976	return(-1);
4977	}
4978	if (sax == NULL) {
4979	memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4980	xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4981	ctxt->userData = ctxt;
4982	} else {
4983	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4984	ctxt->userData = userData ? userData : ctxt;
4985	}
4986
4987	/* Allocate the Input stack */
4988	ctxt->inputTab = (htmlParserInputPtr *)
4989	xmlMalloc(5 * sizeof(htmlParserInputPtr));
4990	if (ctxt->inputTab == NULL) {
4991	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4992	ctxt->inputNr = 0;
4993	ctxt->inputMax = 0;
4994	ctxt->input = NULL;
4995	return(-1);
4996	}
4997	ctxt->inputNr = 0;
4998	ctxt->inputMax = 5;
4999	ctxt->input = NULL;
5000	ctxt->version = NULL;
5001	ctxt->encoding = NULL;
5002	ctxt->standalone = -1;
5003	ctxt->instate = XML_PARSER_START;
5004
5005	/* Allocate the Node stack */
5006	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
5007	if (ctxt->nodeTab == NULL) {
5008	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5009	ctxt->nodeNr = 0;
5010	ctxt->nodeMax = 0;
5011	ctxt->node = NULL;
5012	ctxt->inputNr = 0;
5013	ctxt->inputMax = 0;
5014	ctxt->input = NULL;
5015	return(-1);
5016	}
5017	ctxt->nodeNr = 0;
5018	ctxt->nodeMax = 10;
5019	ctxt->node = NULL;
5020
5021	/* Allocate the Name stack */
5022	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
5023	if (ctxt->nameTab == NULL) {
5024	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5025	ctxt->nameNr = 0;
5026	ctxt->nameMax = 0;
5027	ctxt->name = NULL;
5028	ctxt->nodeNr = 0;
5029	ctxt->nodeMax = 0;
5030	ctxt->node = NULL;
5031	ctxt->inputNr = 0;
5032	ctxt->inputMax = 0;
5033	ctxt->input = NULL;
5034	return(-1);
5035	}
5036	ctxt->nameNr = 0;
5037	ctxt->nameMax = 10;
5038	ctxt->name = NULL;
5039
5040	ctxt->nodeInfoTab = NULL;
5041	ctxt->nodeInfoNr = 0;
5042	ctxt->nodeInfoMax = 0;
5043
5044	ctxt->myDoc = NULL;
5045	ctxt->wellFormed = 1;
5046	ctxt->replaceEntities = 0;
5047	ctxt->linenumbers = xmlLineNumbersDefaultValue;
5048	ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5049	ctxt->html = 1;
5050	ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5051	ctxt->vctxt.userData = ctxt;
5052	ctxt->vctxt.error = xmlParserValidityError;
5053	ctxt->vctxt.warning = xmlParserValidityWarning;
5054	ctxt->record_info = 0;
5055	ctxt->validate = 0;
5056	ctxt->checkIndex = 0;
5057	ctxt->catalogs = NULL;
5058	xmlInitNodeInfoSeq(&ctxt->node_seq);
5059	return(0);
5060	}
5061
5062	/**
5063	* htmlFreeParserCtxt:
5064	* @ctxt: an HTML parser context
5065	*
5066	* Free all the memory used by a parser context. However the parsed
5067	* document in ctxt->myDoc is not freed.
5068	*/
5069
5070	void
5071	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5072	{
5073	xmlFreeParserCtxt(ctxt);
5074	}
5075
5076	/**
5077	* htmlNewParserCtxt:
5078	*
5079	* Allocate and initialize a new parser context.
5080	*
5081	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
5082	*/
5083
5084	htmlParserCtxtPtr
5085	htmlNewParserCtxt(void)
5086	{
5087	return(htmlNewSAXParserCtxt(NULL, NULL));
5088	}
5089
5090	/**
5091	* htmlNewSAXParserCtxt:
5092	* @sax: SAX handler
5093	* @userData: user data
5094	*
5095	* Allocate and initialize a new SAX parser context. If userData is NULL,
5096	* the parser context will be passed as user data.
5097	*
5098	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
5099	*/
5100
5101	htmlParserCtxtPtr
5102	htmlNewSAXParserCtxt(const htmlSAXHandler sax, void userData)
5103	{
5104	xmlParserCtxtPtr ctxt;
5105
5106	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5107	if (ctxt == NULL) {
5108	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5109	return(NULL);
5110	}
5111	memset(ctxt, 0, sizeof(xmlParserCtxt));
5112	if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5113	htmlFreeParserCtxt(ctxt);
5114	return(NULL);
5115	}
5116	return(ctxt);
5117	}
5118
5119	/**
5120	* htmlCreateMemoryParserCtxt:
5121	* @buffer: a pointer to a char array
5122	* @size: the size of the array
5123	*
5124	* Create a parser context for an HTML in-memory document.
5125	*
5126	* Returns the new parser context or NULL
5127	*/
5128	htmlParserCtxtPtr
5129	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5130	xmlParserCtxtPtr ctxt;
5131	xmlParserInputPtr input;
5132	xmlParserInputBufferPtr buf;
5133
5134	if (buffer == NULL)
5135	return(NULL);
5136	if (size <= 0)
5137	return(NULL);
5138
5139	ctxt = htmlNewParserCtxt();
5140	if (ctxt == NULL)
5141	return(NULL);
5142
5143	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5144	if (buf == NULL) {
5145	xmlFreeParserCtxt(ctxt);
5146	return(NULL);
5147	}
5148
5149	input = xmlNewInputStream(ctxt);
5150	if (input == NULL) {
5151	xmlFreeParserInputBuffer(buf);
5152	xmlFreeParserCtxt(ctxt);
5153	return(NULL);
5154	}
5155
5156	input->filename = NULL;
5157	input->buf = buf;
5158	xmlBufResetInput(buf->buffer, input);
5159
5160	inputPush(ctxt, input);
5161	return(ctxt);
5162	}
5163
5164	/**
5165	* htmlCreateDocParserCtxt:
5166	* @str: a pointer to an array of xmlChar
5167	* @encoding: a free form C string describing the HTML document encoding, or NULL
5168	*
5169	* Create a parser context for an HTML document.
5170	*
5171	* TODO: check the need to add encoding handling there
5172	*
5173	* Returns the new parser context or NULL
5174	*/
5175	static htmlParserCtxtPtr
5176	htmlCreateDocParserCtxt(const xmlChar str, const char encoding) {
5177	xmlParserCtxtPtr ctxt;
5178	xmlParserInputPtr input;
5179	xmlParserInputBufferPtr buf;
5180
5181	if (str == NULL)
5182	return(NULL);
5183
5184	ctxt = htmlNewParserCtxt();
5185	if (ctxt == NULL)
5186	return(NULL);
5187
5188	buf = xmlParserInputBufferCreateString(str);
5189	if (buf == NULL) {
5190	xmlFreeParserCtxt(ctxt);
5191	return(NULL);
5192	}
5193
5194	input = xmlNewInputStream(ctxt);
5195	if (input == NULL) {
5196	xmlFreeParserInputBuffer(buf);
5197	xmlFreeParserCtxt(ctxt);
5198	return(NULL);
5199	}
5200
5201	input->filename = NULL;
5202	input->buf = buf;
5203	xmlBufResetInput(buf->buffer, input);
5204
5205	inputPush(ctxt, input);
5206
5207	if (encoding != NULL) {
5208	xmlCharEncoding enc;
5209	xmlCharEncodingHandlerPtr handler;
5210
5211	enc = xmlParseCharEncoding(encoding);
5212	/*
5213	* registered set of known encodings
5214	*/
5215	if (enc != XML_CHAR_ENCODING_ERROR) {
5216	xmlSwitchEncoding(ctxt, enc);
5217	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5218	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5219	"Unsupported encoding %s\n",
5220	(const xmlChar *) encoding, NULL);
5221	}
5222	} else {
5223	/*
5224	* fallback for unknown encodings
5225	*/
5226	handler = xmlFindCharEncodingHandler((const char *) encoding);
5227	if (handler != NULL) {
5228	xmlSwitchToEncoding(ctxt, handler);
5229	} else {
5230	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5231	"Unsupported encoding %s\n",
5232	(const xmlChar *) encoding, NULL);
5233	}
5234	}
5235	}
5236
5237	return(ctxt);
5238	}
5239
5240	#ifdef LIBXML_PUSH_ENABLED
5241	/************************************************************************
5242	* *
5243	* Progressive parsing interfaces *
5244	* *
5245	************************************************************************/
5246
5247	/**
5248	* htmlParseLookupSequence:
5249	* @ctxt: an HTML parser context
5250	* @first: the first char to lookup
5251	* @next: the next char to lookup or zero
5252	* @third: the next char to lookup or zero
5253	* @ignoreattrval: skip over attribute values
5254	*
5255	* Try to find if a sequence (first, next, third) or just (first next) or
5256	* (first) is available in the input stream.
5257	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5258	* to avoid rescanning sequences of bytes, it DOES change the state of the
5259	* parser, do not use liberally.
5260	* This is basically similar to xmlParseLookupSequence()
5261	*
5262	* Returns the index to the current parsing point if the full sequence
5263	* is available, -1 otherwise.
5264	*/
5265	static int
5266	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5267	xmlChar next, xmlChar third, int ignoreattrval)
5268	{
5269	size_t base, len;
5270	htmlParserInputPtr in;
5271	const xmlChar *buf;
5272	int quote;
5273
5274	in = ctxt->input;
5275	if (in == NULL)
5276	return (-1);
5277
5278	base = ctxt->checkIndex;
5279	quote = ctxt->endCheckState;
5280
5281	buf = in->cur;
5282	len = in->end - in->cur;
5283
5284	/* take into account the sequence length */
5285	if (third)
5286	len -= 2;
5287	else if (next)
5288	len--;
5289	for (; base < len; base++) {
5290	if (base >= INT_MAX / 2) {
5291	ctxt->checkIndex = 0;
5292	ctxt->endCheckState = 0;
5293	return (base - 2);
5294	}
5295	if (ignoreattrval) {
5296	if (quote) {
5297	if (buf[base] == quote)
5298	quote = 0;
5299	continue;
5300	}
5301	if (buf[base] == '"' \|\| buf[base] == '\'') {
5302	quote = buf[base];
5303	continue;
5304	}
5305	}
5306	if (buf[base] == first) {
5307	if (third != 0) {
5308	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
5309	continue;
5310	} else if (next != 0) {
5311	if (buf[base + 1] != next)
5312	continue;
5313	}
5314	ctxt->checkIndex = 0;
5315	ctxt->endCheckState = 0;
5316	return (base);
5317	}
5318	}
5319	ctxt->checkIndex = base;
5320	ctxt->endCheckState = quote;
5321	return (-1);
5322	}
5323
5324	/**
5325	* htmlParseLookupCommentEnd:
5326	* @ctxt: an HTML parser context
5327	*
5328	* Try to find a comment end tag in the input stream
5329	* The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5330	* (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5331	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5332	* to avoid rescanning sequences of bytes, it DOES change the state of the
5333	* parser, do not use liberally.
5334	* This wraps to htmlParseLookupSequence()
5335	*
5336	* Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5337	*/
5338	static int
5339	htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5340	{
5341	int mark = 0;
5342	int offset;
5343
5344	while (1) {
5345	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5346	if (mark < 0)
5347	break;
5348	if ((NXT(mark+2) == '>') \|\|
5349	((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5350	ctxt->checkIndex = 0;
5351	break;
5352	}
5353	offset = (NXT(mark+2) == '!') ? 3 : 2;
5354	if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5355	ctxt->checkIndex = mark;
5356	return(-1);
5357	}
5358	ctxt->checkIndex = mark + 1;
5359	}
5360	return mark;
5361	}
5362
5363
5364	/**
5365	* htmlParseTryOrFinish:
5366	* @ctxt: an HTML parser context
5367	* @terminate: last chunk indicator
5368	*
5369	* Try to progress on parsing
5370	*
5371	* Returns zero if no parsing was possible
5372	*/
5373	static int
5374	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5375	int ret = 0;
5376	htmlParserInputPtr in;
5377	ptrdiff_t avail = 0;
5378	xmlChar cur, next;
5379
5380	htmlParserNodeInfo node_info;
5381
5382	while (1) {
5383
5384	in = ctxt->input;
5385	if (in == NULL) break;
5386	avail = in->end - in->cur;
5387	if ((avail == 0) && (terminate)) {
5388	htmlAutoCloseOnEnd(ctxt);
5389	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5390	/*
5391	* SAX: end of the document processing.
5392	*/
5393	ctxt->instate = XML_PARSER_EOF;
5394	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5395	ctxt->sax->endDocument(ctxt->userData);
5396	}
5397	}
5398	if (avail < 1)
5399	goto done;
5400	/*
5401	* This is done to make progress and avoid an infinite loop
5402	* if a parsing attempt was aborted by hitting a NUL byte. After
5403	* changing htmlCurrentChar, this probably isn't necessary anymore.
5404	* We should consider removing this check.
5405	*/
5406	cur = in->cur[0];
5407	if (cur == 0) {
5408	SKIP(1);
5409	continue;
5410	}
5411
5412	switch (ctxt->instate) {
5413	case XML_PARSER_EOF:
5414	/*
5415	* Document parsing is done !
5416	*/
5417	goto done;
5418	case XML_PARSER_START:
5419	/*
5420	* This is wrong but matches long-standing behavior. In most
5421	* cases, a document starting with an XML declaration will
5422	* specify UTF-8.
5423	*/
5424	if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5425	(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5426	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5427	}
5428
5429	/*
5430	* Very first chars read from the document flow.
5431	*/
5432	cur = in->cur[0];
5433	if (IS_BLANK_CH(cur)) {
5434	SKIP_BLANKS;
5435	avail = in->end - in->cur;
5436	}
5437	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5438	ctxt->sax->setDocumentLocator(ctxt->userData,
5439	&xmlDefaultSAXLocator);
5440	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5441	(!ctxt->disableSAX))
5442	ctxt->sax->startDocument(ctxt->userData);
5443	if (ctxt->instate == XML_PARSER_EOF)
5444	goto done;
5445
5446	cur = in->cur[0];
5447	next = in->cur[1];
5448	if ((cur == '<') && (next == '!') &&
5449	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5450	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5451	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5452	(UPP(8) == 'E')) {
5453	if ((!terminate) &&
5454	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5455	goto done;
5456	htmlParseDocTypeDecl(ctxt);
5457	if (ctxt->instate == XML_PARSER_EOF)
5458	goto done;
5459	ctxt->instate = XML_PARSER_PROLOG;
5460	} else {
5461	ctxt->instate = XML_PARSER_MISC;
5462	}
5463	break;
5464	case XML_PARSER_MISC:
5465	SKIP_BLANKS;
5466	avail = in->end - in->cur;
5467	/*
5468	* no chars in buffer
5469	*/
5470	if (avail < 1)
5471	goto done;
5472	/*
5473	* not enough chars in buffer
5474	*/
5475	if (avail < 2) {
5476	if (!terminate)
5477	goto done;
5478	else
5479	next = ' ';
5480	} else {
5481	next = in->cur[1];
5482	}
5483	cur = in->cur[0];
5484	if ((cur == '<') && (next == '!') &&
5485	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5486	if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5487	goto done;
5488	htmlParseComment(ctxt);
5489	if (ctxt->instate == XML_PARSER_EOF)
5490	goto done;
5491	ctxt->instate = XML_PARSER_MISC;
5492	} else if ((cur == '<') && (next == '?')) {
5493	if ((!terminate) &&
5494	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5495	goto done;
5496	htmlParsePI(ctxt);
5497	if (ctxt->instate == XML_PARSER_EOF)
5498	goto done;
5499	ctxt->instate = XML_PARSER_MISC;
5500	} else if ((cur == '<') && (next == '!') &&
5501	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5502	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5503	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5504	(UPP(8) == 'E')) {
5505	if ((!terminate) &&
5506	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5507	goto done;
5508	htmlParseDocTypeDecl(ctxt);
5509	if (ctxt->instate == XML_PARSER_EOF)
5510	goto done;
5511	ctxt->instate = XML_PARSER_PROLOG;
5512	} else if ((cur == '<') && (next == '!') &&
5513	(avail < 9)) {
5514	goto done;
5515	} else {
5516	ctxt->instate = XML_PARSER_CONTENT;
5517	}
5518	break;
5519	case XML_PARSER_PROLOG:
5520	SKIP_BLANKS;
5521	avail = in->end - in->cur;
5522	if (avail < 2)
5523	goto done;
5524	cur = in->cur[0];
5525	next = in->cur[1];
5526	if ((cur == '<') && (next == '!') &&
5527	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5528	if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5529	goto done;
5530	htmlParseComment(ctxt);
5531	if (ctxt->instate == XML_PARSER_EOF)
5532	goto done;
5533	ctxt->instate = XML_PARSER_PROLOG;
5534	} else if ((cur == '<') && (next == '?')) {
5535	if ((!terminate) &&
5536	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5537	goto done;
5538	htmlParsePI(ctxt);
5539	if (ctxt->instate == XML_PARSER_EOF)
5540	goto done;
5541	ctxt->instate = XML_PARSER_PROLOG;
5542	} else if ((cur == '<') && (next == '!') &&
5543	(avail < 4)) {
5544	goto done;
5545	} else {
5546	ctxt->instate = XML_PARSER_CONTENT;
5547	}
5548	break;
5549	case XML_PARSER_EPILOG:
5550	avail = in->end - in->cur;
5551	if (avail < 1)
5552	goto done;
5553	cur = in->cur[0];
5554	if (IS_BLANK_CH(cur)) {
5555	htmlParseCharData(ctxt);
5556	goto done;
5557	}
5558	if (avail < 2)
5559	goto done;
5560	next = in->cur[1];
5561	if ((cur == '<') && (next == '!') &&
5562	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5563	if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5564	goto done;
5565	htmlParseComment(ctxt);
5566	if (ctxt->instate == XML_PARSER_EOF)
5567	goto done;
5568	ctxt->instate = XML_PARSER_EPILOG;
5569	} else if ((cur == '<') && (next == '?')) {
5570	if ((!terminate) &&
5571	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5572	goto done;
5573	htmlParsePI(ctxt);
5574	if (ctxt->instate == XML_PARSER_EOF)
5575	goto done;
5576	ctxt->instate = XML_PARSER_EPILOG;
5577	} else if ((cur == '<') && (next == '!') &&
5578	(avail < 4)) {
5579	goto done;
5580	} else {
5581	ctxt->errNo = XML_ERR_DOCUMENT_END;
5582	ctxt->wellFormed = 0;
5583	ctxt->instate = XML_PARSER_EOF;
5584	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5585	ctxt->sax->endDocument(ctxt->userData);
5586	goto done;
5587	}
5588	break;
5589	case XML_PARSER_START_TAG: {
5590	const xmlChar *name;
5591	int failed;
5592	const htmlElemDesc * info;
5593
5594	/*
5595	* no chars in buffer
5596	*/
5597	if (avail < 1)
5598	goto done;
5599	/*
5600	* not enough chars in buffer
5601	*/
5602	if (avail < 2) {
5603	if (!terminate)
5604	goto done;
5605	else
5606	next = ' ';
5607	} else {
5608	next = in->cur[1];
5609	}
5610	cur = in->cur[0];
5611	if (cur != '<') {
5612	ctxt->instate = XML_PARSER_CONTENT;
5613	break;
5614	}
5615	if (next == '/') {
5616	ctxt->instate = XML_PARSER_END_TAG;
5617	ctxt->checkIndex = 0;
5618	break;
5619	}
5620	if ((!terminate) &&
5621	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5622	goto done;
5623
5624	/* Capture start position */
5625	if (ctxt->record_info) {
5626	node_info.begin_pos = ctxt->input->consumed +
5627	(CUR_PTR - ctxt->input->base);
5628	node_info.begin_line = ctxt->input->line;
5629	}
5630
5631
5632	failed = htmlParseStartTag(ctxt);
5633	name = ctxt->name;
5634	if ((failed == -1) \|\|
5635	(name == NULL)) {
5636	if (CUR == '>')
5637	NEXT;
5638	break;
5639	}
5640
5641	/*
5642	* Lookup the info for that element.
5643	*/
5644	info = htmlTagLookup(name);
5645	if (info == NULL) {
5646	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5647	"Tag %s invalid\n", name, NULL);
5648	}
5649
5650	/*
5651	* Check for an Empty Element labeled the XML/SGML way
5652	*/
5653	if ((CUR == '/') && (NXT(1) == '>')) {
5654	SKIP(2);
5655	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5656	ctxt->sax->endElement(ctxt->userData, name);
5657	htmlnamePop(ctxt);
5658	if (ctxt->instate == XML_PARSER_EOF)
5659	goto done;
5660	ctxt->instate = XML_PARSER_CONTENT;
5661	break;
5662	}
5663
5664	if (CUR == '>') {
5665	NEXT;
5666	} else {
5667	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5668	"Couldn't find end of Start Tag %s\n",
5669	name, NULL);
5670
5671	/*
5672	* end of parsing of this node.
5673	*/
5674	if (xmlStrEqual(name, ctxt->name)) {
5675	nodePop(ctxt);
5676	htmlnamePop(ctxt);
5677	}
5678
5679	if (ctxt->record_info)
5680	htmlNodeInfoPush(ctxt, &node_info);
5681
5682	if (ctxt->instate == XML_PARSER_EOF)
5683	goto done;
5684	ctxt->instate = XML_PARSER_CONTENT;
5685	break;
5686	}
5687
5688	/*
5689	* Check for an Empty Element from DTD definition
5690	*/
5691	if ((info != NULL) && (info->empty)) {
5692	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5693	ctxt->sax->endElement(ctxt->userData, name);
5694	htmlnamePop(ctxt);
5695	}
5696
5697	if (ctxt->record_info)
5698	htmlNodeInfoPush(ctxt, &node_info);
5699
5700	if (ctxt->instate == XML_PARSER_EOF)
5701	goto done;
5702	ctxt->instate = XML_PARSER_CONTENT;
5703	break;
5704	}
5705	case XML_PARSER_CONTENT: {
5706	xmlChar chr[2] = { 0, 0 };
5707
5708	/*
5709	* Handle preparsed entities and charRef
5710	*/
5711	if (ctxt->token != 0) {
5712	chr[0] = ctxt->token;
5713	htmlCheckParagraph(ctxt);
5714	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5715	ctxt->sax->characters(ctxt->userData, chr, 1);
5716	ctxt->token = 0;
5717	ctxt->checkIndex = 0;
5718	}
5719	if ((avail == 1) && (terminate)) {
5720	cur = in->cur[0];
5721	if ((cur != '<') && (cur != '&')) {
5722	if (ctxt->sax != NULL) {
5723	chr[0] = cur;
5724	if (IS_BLANK_CH(cur)) {
5725	if (ctxt->keepBlanks) {
5726	if (ctxt->sax->characters != NULL)
5727	ctxt->sax->characters(
5728	ctxt->userData, chr, 1);
5729	} else {
5730	if (ctxt->sax->ignorableWhitespace != NULL)
5731	ctxt->sax->ignorableWhitespace(
5732	ctxt->userData, chr, 1);
5733	}
5734	} else {
5735	htmlCheckParagraph(ctxt);
5736	if (ctxt->sax->characters != NULL)
5737	ctxt->sax->characters(
5738	ctxt->userData, chr, 1);
5739	}
5740	}
5741	ctxt->token = 0;
5742	ctxt->checkIndex = 0;
5743	in->cur++;
5744	break;
5745	}
5746	}
5747	if (avail < 2)
5748	goto done;
5749	cur = in->cur[0];
5750	next = in->cur[1];
5751	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
5752	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5753	/*
5754	* Handle SCRIPT/STYLE separately
5755	*/
5756	if (!terminate) {
5757	int idx;
5758	xmlChar val;
5759
5760	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5761	if (idx < 0)
5762	goto done;
5763	val = in->cur[idx + 2];
5764	if (val == 0) { /* bad cut of input */
5765	/*
5766	* FIXME: htmlParseScript checks for additional
5767	* characters after '</'.
5768	*/
5769	ctxt->checkIndex = idx;
5770	goto done;
5771	}
5772	}
5773	htmlParseScript(ctxt);
5774	if (ctxt->instate == XML_PARSER_EOF)
5775	goto done;
5776	if ((cur == '<') && (next == '/')) {
5777	ctxt->instate = XML_PARSER_END_TAG;
5778	ctxt->checkIndex = 0;
5779	break;
5780	}
5781	} else if ((cur == '<') && (next == '!')) {
5782	if (avail < 4)
5783	goto done;
5784	/*
5785	* Sometimes DOCTYPE arrives in the middle of the document
5786	*/
5787	if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5788	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5789	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5790	(UPP(8) == 'E')) {
5791	if ((!terminate) &&
5792	(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5793	goto done;
5794	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5795	"Misplaced DOCTYPE declaration\n",
5796	BAD_CAST "DOCTYPE" , NULL);
5797	htmlParseDocTypeDecl(ctxt);
5798	} else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5799	if ((!terminate) &&
5800	(htmlParseLookupCommentEnd(ctxt) < 0))
5801	goto done;
5802	htmlParseComment(ctxt);
5803	if (ctxt->instate == XML_PARSER_EOF)
5804	goto done;
5805	ctxt->instate = XML_PARSER_CONTENT;
5806	} else {
5807	if ((!terminate) &&
5808	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5809	goto done;
5810	htmlSkipBogusComment(ctxt);
5811	}
5812	} else if ((cur == '<') && (next == '?')) {
5813	if ((!terminate) &&
5814	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5815	goto done;
5816	htmlParsePI(ctxt);
5817	if (ctxt->instate == XML_PARSER_EOF)
5818	goto done;
5819	ctxt->instate = XML_PARSER_CONTENT;
5820	} else if ((cur == '<') && (next == '/')) {
5821	ctxt->instate = XML_PARSER_END_TAG;
5822	ctxt->checkIndex = 0;
5823	break;
5824	} else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5825	if ((!terminate) && (next == 0))
5826	goto done;
5827	ctxt->instate = XML_PARSER_START_TAG;
5828	ctxt->checkIndex = 0;
5829	break;
5830	} else if (cur == '<') {
5831	if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5832	(ctxt->sax->characters != NULL))
5833	ctxt->sax->characters(ctxt->userData,
5834	BAD_CAST "<", 1);
5835	NEXT;
5836	} else {
5837	/*
5838	* check that the text sequence is complete
5839	* before handing out the data to the parser
5840	* to avoid problems with erroneous end of
5841	* data detection.
5842	*/
5843	if ((!terminate) &&
5844	(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5845	goto done;
5846	ctxt->checkIndex = 0;
5847	while ((ctxt->instate != XML_PARSER_EOF) &&
5848	(cur != '<') && (in->cur < in->end)) {
5849	if (cur == '&') {
5850	htmlParseReference(ctxt);
5851	} else {
5852	htmlParseCharData(ctxt);
5853	}
5854	cur = in->cur[0];
5855	}
5856	}
5857
5858	break;
5859	}
5860	case XML_PARSER_END_TAG:
5861	if (avail < 2)
5862	goto done;
5863	if ((!terminate) &&
5864	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5865	goto done;
5866	htmlParseEndTag(ctxt);
5867	if (ctxt->instate == XML_PARSER_EOF)
5868	goto done;
5869	if (ctxt->nameNr == 0) {
5870	ctxt->instate = XML_PARSER_EPILOG;
5871	} else {
5872	ctxt->instate = XML_PARSER_CONTENT;
5873	}
5874	ctxt->checkIndex = 0;
5875	break;
5876	default:
5877	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5878	"HPP: internal error\n", NULL, NULL);
5879	ctxt->instate = XML_PARSER_EOF;
5880	break;
5881	}
5882	}
5883	done:
5884	if ((avail == 0) && (terminate)) {
5885	htmlAutoCloseOnEnd(ctxt);
5886	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5887	/*
5888	* SAX: end of the document processing.
5889	*/
5890	ctxt->instate = XML_PARSER_EOF;
5891	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5892	ctxt->sax->endDocument(ctxt->userData);
5893	}
5894	}
5895	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5896	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
5897	(ctxt->instate == XML_PARSER_EPILOG))) {
5898	xmlDtdPtr dtd;
5899	dtd = xmlGetIntSubset(ctxt->myDoc);
5900	if (dtd == NULL)
5901	ctxt->myDoc->intSubset =
5902	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5903	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5904	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5905	}
5906	return(ret);
5907	}
5908
5909	/**
5910	* htmlParseChunk:
5911	* @ctxt: an HTML parser context
5912	* @chunk: an char array
5913	* @size: the size in byte of the chunk
5914	* @terminate: last chunk indicator
5915	*
5916	* Parse a Chunk of memory
5917	*
5918	* Returns zero if no error, the xmlParserErrors otherwise.
5919	*/
5920	int
5921	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5922	int terminate) {
5923	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
5924	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5925	"htmlParseChunk: context error\n", NULL, NULL);
5926	return(XML_ERR_INTERNAL_ERROR);
5927	}
5928	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5929	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5930	size_t pos = ctxt->input->cur - ctxt->input->base;
5931	int res;
5932
5933	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5934	xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5935	if (res < 0) {
5936	htmlParseErr(ctxt, ctxt->input->buf->error,
5937	"xmlParserInputBufferPush failed", NULL, NULL);
5938	xmlHaltParser(ctxt);
5939	return (ctxt->errNo);
5940	}
5941	}
5942	htmlParseTryOrFinish(ctxt, terminate);
5943	if (terminate) {
5944	if ((ctxt->instate != XML_PARSER_EOF) &&
5945	(ctxt->instate != XML_PARSER_EPILOG) &&
5946	(ctxt->instate != XML_PARSER_MISC)) {
5947	ctxt->errNo = XML_ERR_DOCUMENT_END;
5948	ctxt->wellFormed = 0;
5949	}
5950	if (ctxt->instate != XML_PARSER_EOF) {
5951	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5952	ctxt->sax->endDocument(ctxt->userData);
5953	}
5954	ctxt->instate = XML_PARSER_EOF;
5955	}
5956	return((xmlParserErrors) ctxt->errNo);
5957	}
5958
5959	/************************************************************************
5960	* *
5961	* User entry points *
5962	* *
5963	************************************************************************/
5964
5965	/**
5966	* htmlCreatePushParserCtxt:
5967	* @sax: a SAX handler
5968	* @user_data: The user data returned on SAX callbacks
5969	* @chunk: a pointer to an array of chars
5970	* @size: number of chars in the array
5971	* @filename: an optional file name or URI
5972	* @enc: an optional encoding
5973	*
5974	* Create a parser context for using the HTML parser in push mode
5975	* The value of @filename is used for fetching external entities
5976	* and error/warning reports.
5977	*
5978	* Returns the new parser context or NULL
5979	*/
5980	htmlParserCtxtPtr
5981	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5982	const char chunk, int size, const char filename,
5983	xmlCharEncoding enc) {
5984	htmlParserCtxtPtr ctxt;
5985	htmlParserInputPtr inputStream;
5986	xmlParserInputBufferPtr buf;
5987
5988	xmlInitParser();
5989
5990	buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
5991	if (buf == NULL) return(NULL);
5992
5993	ctxt = htmlNewSAXParserCtxt(sax, user_data);
5994	if (ctxt == NULL) {
5995	xmlFreeParserInputBuffer(buf);
5996	return(NULL);
5997	}
5998	if (filename == NULL) {
5999	ctxt->directory = NULL;
6000	} else {
6001	ctxt->directory = xmlParserGetDirectory(filename);
6002	}
6003
6004	inputStream = htmlNewInputStream(ctxt);
6005	if (inputStream == NULL) {
6006	xmlFreeParserCtxt(ctxt);
6007	xmlFreeParserInputBuffer(buf);
6008	return(NULL);
6009	}
6010
6011	if (filename == NULL)
6012	inputStream->filename = NULL;
6013	else
6014	inputStream->filename = (char *)
6015	xmlCanonicPath((const xmlChar *) filename);
6016	inputStream->buf = buf;
6017	xmlBufResetInput(buf->buffer, inputStream);
6018
6019	inputPush(ctxt, inputStream);
6020
6021	if (enc != XML_CHAR_ENCODING_NONE)
6022	xmlSwitchEncoding(ctxt, enc);
6023
6024	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6025	(ctxt->input->buf != NULL)) {
6026	size_t pos = ctxt->input->cur - ctxt->input->base;
6027	int res;
6028
6029	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6030	xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
6031	if (res < 0) {
6032	htmlParseErr(ctxt, ctxt->input->buf->error,
6033	"xmlParserInputBufferPush failed\n", NULL, NULL);
6034	xmlHaltParser(ctxt);
6035	}
6036	}
6037	ctxt->progressive = 1;
6038
6039	return(ctxt);
6040	}
6041	#endif /* LIBXML_PUSH_ENABLED */
6042
6043	/**
6044	* htmlSAXParseDoc:
6045	* @cur: a pointer to an array of xmlChar
6046	* @encoding: a free form C string describing the HTML document encoding, or NULL
6047	* @sax: the SAX handler block
6048	* @userData: if using SAX, this pointer will be provided on callbacks.
6049	*
6050	* DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6051	*
6052	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6053	* to handle parse events. If sax is NULL, fallback to the default DOM
6054	* behavior and return a tree.
6055	*
6056	* Returns the resulting document tree unless SAX is NULL or the document is
6057	* not well formed.
6058	*/
6059
6060	htmlDocPtr
6061	htmlSAXParseDoc(const xmlChar cur, const char encoding,
6062	htmlSAXHandlerPtr sax, void *userData) {
6063	htmlDocPtr ret;
6064	htmlParserCtxtPtr ctxt;
6065
6066	xmlInitParser();
6067
6068	if (cur == NULL) return(NULL);
6069
6070
6071	ctxt = htmlCreateDocParserCtxt(cur, encoding);
6072	if (ctxt == NULL) return(NULL);
6073	if (sax != NULL) {
6074	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6075	ctxt->sax = sax;
6076	ctxt->userData = userData;
6077	}
6078
6079	htmlParseDocument(ctxt);
6080	ret = ctxt->myDoc;
6081	if (sax != NULL) {
6082	ctxt->sax = NULL;
6083	ctxt->userData = NULL;
6084	}
6085	htmlFreeParserCtxt(ctxt);
6086
6087	return(ret);
6088	}
6089
6090	/**
6091	* htmlParseDoc:
6092	* @cur: a pointer to an array of xmlChar
6093	* @encoding: a free form C string describing the HTML document encoding, or NULL
6094	*
6095	* parse an HTML in-memory document and build a tree.
6096	*
6097	* Returns the resulting document tree
6098	*/
6099
6100	htmlDocPtr
6101	htmlParseDoc(const xmlChar cur, const char encoding) {
6102	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6103	}
6104
6105
6106	/**
6107	* htmlCreateFileParserCtxt:
6108	* @filename: the filename
6109	* @encoding: a free form C string describing the HTML document encoding, or NULL
6110	*
6111	* Create a parser context for a file content.
6112	* Automatic support for ZLIB/Compress compressed document is provided
6113	* by default if found at compile-time.
6114	*
6115	* Returns the new parser context or NULL
6116	*/
6117	htmlParserCtxtPtr
6118	htmlCreateFileParserCtxt(const char filename, const char encoding)
6119	{
6120	htmlParserCtxtPtr ctxt;
6121	htmlParserInputPtr inputStream;
6122	char *canonicFilename;
6123
6124	if (filename == NULL)
6125	return(NULL);
6126
6127	ctxt = htmlNewParserCtxt();
6128	if (ctxt == NULL) {
6129	return(NULL);
6130	}
6131	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
6132	if (canonicFilename == NULL) {
6133	xmlFreeParserCtxt(ctxt);
6134	return(NULL);
6135	}
6136
6137	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6138	xmlFree(canonicFilename);
6139	if (inputStream == NULL) {
6140	xmlFreeParserCtxt(ctxt);
6141	return(NULL);
6142	}
6143
6144	inputPush(ctxt, inputStream);
6145
6146	/* set encoding */
6147	if (encoding) {
6148	xmlCharEncodingHandlerPtr hdlr;
6149
6150	hdlr = xmlFindCharEncodingHandler(encoding);
6151	if (hdlr != NULL) {
6152	xmlSwitchToEncoding(ctxt, hdlr);
6153	}
6154	}
6155
6156	return(ctxt);
6157	}
6158
6159	/**
6160	* htmlSAXParseFile:
6161	* @filename: the filename
6162	* @encoding: a free form C string describing the HTML document encoding, or NULL
6163	* @sax: the SAX handler block
6164	* @userData: if using SAX, this pointer will be provided on callbacks.
6165	*
6166	* DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6167	*
6168	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6169	* compressed document is provided by default if found at compile-time.
6170	* It use the given SAX function block to handle the parsing callback.
6171	* If sax is NULL, fallback to the default DOM tree building routines.
6172	*
6173	* Returns the resulting document tree unless SAX is NULL or the document is
6174	* not well formed.
6175	*/
6176
6177	htmlDocPtr
6178	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
6179	void *userData) {
6180	htmlDocPtr ret;
6181	htmlParserCtxtPtr ctxt;
6182	htmlSAXHandlerPtr oldsax = NULL;
6183
6184	xmlInitParser();
6185
6186	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6187	if (ctxt == NULL) return(NULL);
6188	if (sax != NULL) {
6189	oldsax = ctxt->sax;
6190	ctxt->sax = sax;
6191	ctxt->userData = userData;
6192	}
6193
6194	htmlParseDocument(ctxt);
6195
6196	ret = ctxt->myDoc;
6197	if (sax != NULL) {
6198	ctxt->sax = oldsax;
6199	ctxt->userData = NULL;
6200	}
6201	htmlFreeParserCtxt(ctxt);
6202
6203	return(ret);
6204	}
6205
6206	/**
6207	* htmlParseFile:
6208	* @filename: the filename
6209	* @encoding: a free form C string describing the HTML document encoding, or NULL
6210	*
6211	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6212	* compressed document is provided by default if found at compile-time.
6213	*
6214	* Returns the resulting document tree
6215	*/
6216
6217	htmlDocPtr
6218	htmlParseFile(const char filename, const char encoding) {
6219	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6220	}
6221
6222	/**
6223	* htmlHandleOmittedElem:
6224	* @val: int 0 or 1
6225	*
6226	* Set and return the previous value for handling HTML omitted tags.
6227	*
6228	* Returns the last value for 0 for no handling, 1 for auto insertion.
6229	*/
6230
6231	int
6232	htmlHandleOmittedElem(int val) {
6233	int old = htmlOmittedDefaultValue;
6234
6235	htmlOmittedDefaultValue = val;
6236	return(old);
6237	}
6238
6239	/**
6240	* htmlElementAllowedHere:
6241	* @parent: HTML parent element
6242	* @elt: HTML element
6243	*
6244	* Checks whether an HTML element may be a direct child of a parent element.
6245	* Note - doesn't check for deprecated elements
6246	*
6247	* Returns 1 if allowed; 0 otherwise.
6248	*/
6249	int
6250	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6251	const char** p ;
6252
6253	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
6254	return 0 ;
6255
6256	for ( p = parent->subelts; *p; ++p )
6257	if ( !xmlStrcmp((const xmlChar )p, elt) )
6258	return 1 ;
6259
6260	return 0 ;
6261	}
6262	/**
6263	* htmlElementStatusHere:
6264	* @parent: HTML parent element
6265	* @elt: HTML element
6266	*
6267	* Checks whether an HTML element may be a direct child of a parent element.
6268	* and if so whether it is valid or deprecated.
6269	*
6270	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6271	*/
6272	htmlStatus
6273	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6274	if ( ! parent \|\| ! elt )
6275	return HTML_INVALID ;
6276	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6277	return HTML_INVALID ;
6278
6279	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6280	}
6281	/**
6282	* htmlAttrAllowed:
6283	* @elt: HTML element
6284	* @attr: HTML attribute
6285	* @legacy: whether to allow deprecated attributes
6286	*
6287	* Checks whether an attribute is valid for an element
6288	* Has full knowledge of Required and Deprecated attributes
6289	*
6290	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6291	*/
6292	htmlStatus
6293	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6294	const char** p ;
6295
6296	if ( !elt \|\| ! attr )
6297	return HTML_INVALID ;
6298
6299	if ( elt->attrs_req )
6300	for ( p = elt->attrs_req; *p; ++p)
6301	if ( !xmlStrcmp((const xmlChar)p, attr) )
6302	return HTML_REQUIRED ;
6303
6304	if ( elt->attrs_opt )
6305	for ( p = elt->attrs_opt; *p; ++p)
6306	if ( !xmlStrcmp((const xmlChar)p, attr) )
6307	return HTML_VALID ;
6308
6309	if ( legacy && elt->attrs_depr )
6310	for ( p = elt->attrs_depr; *p; ++p)
6311	if ( !xmlStrcmp((const xmlChar)p, attr) )
6312	return HTML_DEPRECATED ;
6313
6314	return HTML_INVALID ;
6315	}
6316	/**
6317	* htmlNodeStatus:
6318	* @node: an htmlNodePtr in a tree
6319	* @legacy: whether to allow deprecated elements (YES is faster here
6320	* for Element nodes)
6321	*
6322	* Checks whether the tree node is valid. Experimental (the author
6323	* only uses the HTML enhancements in a SAX parser)
6324	*
6325	* Return: for Element nodes, a return from htmlElementAllowedHere (if
6326	* legacy allowed) or htmlElementStatusHere (otherwise).
6327	* for Attribute nodes, a return from htmlAttrAllowed
6328	* for other nodes, HTML_NA (no checks performed)
6329	*/
6330	htmlStatus
6331	htmlNodeStatus(const htmlNodePtr node, int legacy) {
6332	if ( ! node )
6333	return HTML_INVALID ;
6334
6335	switch ( node->type ) {
6336	case XML_ELEMENT_NODE:
6337	return legacy
6338	? ( htmlElementAllowedHere (
6339	htmlTagLookup(node->parent->name) , node->name
6340	) ? HTML_VALID : HTML_INVALID )
6341	: htmlElementStatusHere(
6342	htmlTagLookup(node->parent->name) ,
6343	htmlTagLookup(node->name) )
6344	;
6345	case XML_ATTRIBUTE_NODE:
6346	return htmlAttrAllowed(
6347	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6348	default: return HTML_NA ;
6349	}
6350	}
6351	/************************************************************************
6352	* *
6353	* New set (2.6.0) of simpler and more flexible APIs *
6354	* *
6355	************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible at the point of
 * use; a NULL @str is ignored.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon, so "DICT_FREE(x);" is exactly one statement and can be used
 * safely as the branch of an if/else.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6367
6368	/**
6369	* htmlCtxtReset:
6370	* @ctxt: an HTML parser context
6371	*
6372	* Reset a parser context
6373	*/
6374	void
6375	htmlCtxtReset(htmlParserCtxtPtr ctxt)
6376	{
6377	xmlParserInputPtr input;
6378	xmlDictPtr dict;
6379
6380	if (ctxt == NULL)
6381	return;
6382
6383	xmlInitParser();
6384	dict = ctxt->dict;
6385
6386	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6387	xmlFreeInputStream(input);
6388	}
6389	ctxt->inputNr = 0;
6390	ctxt->input = NULL;
6391
6392	ctxt->spaceNr = 0;
6393	if (ctxt->spaceTab != NULL) {
6394	ctxt->spaceTab[0] = -1;
6395	ctxt->space = &ctxt->spaceTab[0];
6396	} else {
6397	ctxt->space = NULL;
6398	}
6399
6400
6401	ctxt->nodeNr = 0;
6402	ctxt->node = NULL;
6403
6404	ctxt->nameNr = 0;
6405	ctxt->name = NULL;
6406
6407	ctxt->nsNr = 0;
6408
6409	DICT_FREE(ctxt->version);
6410	ctxt->version = NULL;
6411	DICT_FREE(ctxt->encoding);
6412	ctxt->encoding = NULL;
6413	DICT_FREE(ctxt->directory);
6414	ctxt->directory = NULL;
6415	DICT_FREE(ctxt->extSubURI);
6416	ctxt->extSubURI = NULL;
6417	DICT_FREE(ctxt->extSubSystem);
6418	ctxt->extSubSystem = NULL;
6419	if (ctxt->myDoc != NULL)
6420	xmlFreeDoc(ctxt->myDoc);
6421	ctxt->myDoc = NULL;
6422
6423	ctxt->standalone = -1;
6424	ctxt->hasExternalSubset = 0;
6425	ctxt->hasPErefs = 0;
6426	ctxt->html = 1;
6427	ctxt->external = 0;
6428	ctxt->instate = XML_PARSER_START;
6429	ctxt->token = 0;
6430
6431	ctxt->wellFormed = 1;
6432	ctxt->nsWellFormed = 1;
6433	ctxt->disableSAX = 0;
6434	ctxt->valid = 1;
6435	ctxt->vctxt.userData = ctxt;
6436	ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6437	ctxt->vctxt.error = xmlParserValidityError;
6438	ctxt->vctxt.warning = xmlParserValidityWarning;
6439	ctxt->record_info = 0;
6440	ctxt->checkIndex = 0;
6441	ctxt->endCheckState = 0;
6442	ctxt->inSubset = 0;
6443	ctxt->errNo = XML_ERR_OK;
6444	ctxt->depth = 0;
6445	ctxt->catalogs = NULL;
6446	xmlInitNodeInfoSeq(&ctxt->node_seq);
6447
6448	if (ctxt->attsDefault != NULL) {
6449	xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6450	ctxt->attsDefault = NULL;
6451	}
6452	if (ctxt->attsSpecial != NULL) {
6453	xmlHashFree(ctxt->attsSpecial, NULL);
6454	ctxt->attsSpecial = NULL;
6455	}
6456
6457	ctxt->nbErrors = 0;
6458	ctxt->nbWarnings = 0;
6459	if (ctxt->lastError.code != XML_ERR_OK)
6460	xmlResetError(&ctxt->lastError);
6461	}
6462
6463	/**
6464	* htmlCtxtUseOptions:
6465	* @ctxt: an HTML parser context
6466	* @options: a combination of htmlParserOption(s)
6467	*
6468	* Applies the options to the parser context
6469	*
6470	* Returns 0 in case of success, the set of unknown or unimplemented options
6471	* in case of error.
6472	*/
6473	int
6474	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6475	{
6476	if (ctxt == NULL)
6477	return(-1);
6478
6479	if (options & HTML_PARSE_NOWARNING) {
6480	ctxt->sax->warning = NULL;
6481	ctxt->vctxt.warning = NULL;
6482	options -= XML_PARSE_NOWARNING;
6483	ctxt->options \|= XML_PARSE_NOWARNING;
6484	}
6485	if (options & HTML_PARSE_NOERROR) {
6486	ctxt->sax->error = NULL;
6487	ctxt->vctxt.error = NULL;
6488	ctxt->sax->fatalError = NULL;
6489	options -= XML_PARSE_NOERROR;
6490	ctxt->options \|= XML_PARSE_NOERROR;
6491	}
6492	if (options & HTML_PARSE_PEDANTIC) {
6493	ctxt->pedantic = 1;
6494	options -= XML_PARSE_PEDANTIC;
6495	ctxt->options \|= XML_PARSE_PEDANTIC;
6496	} else
6497	ctxt->pedantic = 0;
6498	if (options & XML_PARSE_NOBLANKS) {
6499	ctxt->keepBlanks = 0;
6500	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6501	options -= XML_PARSE_NOBLANKS;
6502	ctxt->options \|= XML_PARSE_NOBLANKS;
6503	} else
6504	ctxt->keepBlanks = 1;
6505	if (options & HTML_PARSE_RECOVER) {
6506	ctxt->recovery = 1;
6507	options -= HTML_PARSE_RECOVER;
6508	} else
6509	ctxt->recovery = 0;
6510	if (options & HTML_PARSE_COMPACT) {
6511	ctxt->options \|= HTML_PARSE_COMPACT;
6512	options -= HTML_PARSE_COMPACT;
6513	}
6514	if (options & XML_PARSE_HUGE) {
6515	ctxt->options \|= XML_PARSE_HUGE;
6516	options -= XML_PARSE_HUGE;
6517	}
6518	if (options & HTML_PARSE_NODEFDTD) {
6519	ctxt->options \|= HTML_PARSE_NODEFDTD;
6520	options -= HTML_PARSE_NODEFDTD;
6521	}
6522	if (options & HTML_PARSE_IGNORE_ENC) {
6523	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
6524	options -= HTML_PARSE_IGNORE_ENC;
6525	}
6526	if (options & HTML_PARSE_NOIMPLIED) {
6527	ctxt->options \|= HTML_PARSE_NOIMPLIED;
6528	options -= HTML_PARSE_NOIMPLIED;
6529	}
6530	ctxt->dictNames = 0;
6531	ctxt->linenumbers = 1;
6532	return (options);
6533	}
6534
6535	/**
6536	* htmlDoRead:
6537	* @ctxt: an HTML parser context
6538	* @URL: the base URL to use for the document
6539	* @encoding: the document encoding, or NULL
6540	* @options: a combination of htmlParserOption(s)
6541	* @reuse: keep the context for reuse
6542	*
6543	* Common front-end for the htmlRead functions
6544	*
6545	* Returns the resulting document tree or NULL
6546	*/
6547	static htmlDocPtr
6548	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
6549	int options, int reuse)
6550	{
6551	htmlDocPtr ret;
6552
6553	htmlCtxtUseOptions(ctxt, options);
6554	ctxt->html = 1;
6555	if (encoding != NULL) {
6556	xmlCharEncodingHandlerPtr hdlr;
6557
6558	hdlr = xmlFindCharEncodingHandler(encoding);
6559	if (hdlr != NULL) {
6560	xmlSwitchToEncoding(ctxt, hdlr);
6561	}
6562	}
6563	if ((URL != NULL) && (ctxt->input != NULL) &&
6564	(ctxt->input->filename == NULL))
6565	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
6566	htmlParseDocument(ctxt);
6567	ret = ctxt->myDoc;
6568	ctxt->myDoc = NULL;
6569	if (!reuse) {
6570	if ((ctxt->dictNames) &&
6571	(ret != NULL) &&
6572	(ret->dict == ctxt->dict))
6573	ctxt->dict = NULL;
6574	xmlFreeParserCtxt(ctxt);
6575	}
6576	return (ret);
6577	}
6578
6579	/**
6580	* htmlReadDoc:
6581	* @cur: a pointer to a zero terminated string
6582	* @URL: the base URL to use for the document
6583	* @encoding: the document encoding, or NULL
6584	* @options: a combination of htmlParserOption(s)
6585	*
6586	* parse an XML in-memory document and build a tree.
6587	*
6588	* Returns the resulting document tree
6589	*/
6590	htmlDocPtr
6591	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
6592	{
6593	htmlParserCtxtPtr ctxt;
6594
6595	if (cur == NULL)
6596	return (NULL);
6597
6598	xmlInitParser();
6599	ctxt = htmlCreateDocParserCtxt(cur, NULL);
6600	if (ctxt == NULL)
6601	return (NULL);
6602	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6603	}
6604
6605	/**
6606	* htmlReadFile:
6607	* @filename: a file or URL
6608	* @encoding: the document encoding, or NULL
6609	* @options: a combination of htmlParserOption(s)
6610	*
6611	* parse an XML file from the filesystem or the network.
6612	*
6613	* Returns the resulting document tree
6614	*/
6615	htmlDocPtr
6616	htmlReadFile(const char filename, const char encoding, int options)
6617	{
6618	htmlParserCtxtPtr ctxt;
6619
6620	xmlInitParser();
6621	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6622	if (ctxt == NULL)
6623	return (NULL);
6624	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6625	}
6626
6627	/**
6628	* htmlReadMemory:
6629	* @buffer: a pointer to a char array
6630	* @size: the size of the array
6631	* @URL: the base URL to use for the document
6632	* @encoding: the document encoding, or NULL
6633	* @options: a combination of htmlParserOption(s)
6634	*
6635	* parse an XML in-memory document and build a tree.
6636	*
6637	* Returns the resulting document tree
6638	*/
6639	htmlDocPtr
6640	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
6641	{
6642	htmlParserCtxtPtr ctxt;
6643
6644	xmlInitParser();
6645	ctxt = htmlCreateMemoryParserCtxt(buffer, size);
6646	if (ctxt == NULL)
6647	return (NULL);
6648	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6649	}
6650
6651	/**
6652	* htmlReadFd:
6653	* @fd: an open file descriptor
6654	* @URL: the base URL to use for the document
6655	* @encoding: the document encoding, or NULL
6656	* @options: a combination of htmlParserOption(s)
6657	*
6658	* parse an HTML from a file descriptor and build a tree.
6659	* NOTE that the file descriptor will not be closed when the
6660	* reader is closed or reset.
6661	*
6662	* Returns the resulting document tree
6663	*/
6664	htmlDocPtr
6665	htmlReadFd(int fd, const char URL, const char encoding, int options)
6666	{
6667	htmlParserCtxtPtr ctxt;
6668	xmlParserInputBufferPtr input;
6669	htmlParserInputPtr stream;
6670
6671	if (fd < 0)
6672	return (NULL);
6673
6674	xmlInitParser();
6675	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6676	if (input == NULL)
6677	return (NULL);
6678	input->closecallback = NULL;
6679	ctxt = htmlNewParserCtxt();
6680	if (ctxt == NULL) {
6681	xmlFreeParserInputBuffer(input);
6682	return (NULL);
6683	}
6684	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6685	if (stream == NULL) {
6686	xmlFreeParserInputBuffer(input);
6687	htmlFreeParserCtxt(ctxt);
6688	return (NULL);
6689	}
6690	inputPush(ctxt, stream);
6691	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6692	}
6693
6694	/**
6695	* htmlReadIO:
6696	* @ioread: an I/O read function
6697	* @ioclose: an I/O close function
6698	* @ioctx: an I/O handler
6699	* @URL: the base URL to use for the document
6700	* @encoding: the document encoding, or NULL
6701	* @options: a combination of htmlParserOption(s)
6702	*
6703	* parse an HTML document from I/O functions and source and build a tree.
6704	*
6705	* Returns the resulting document tree
6706	*/
6707	htmlDocPtr
6708	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6709	void ioctx, const char URL, const char *encoding, int options)
6710	{
6711	htmlParserCtxtPtr ctxt;
6712	xmlParserInputBufferPtr input;
6713	xmlParserInputPtr stream;
6714
6715	if (ioread == NULL)
6716	return (NULL);
6717	xmlInitParser();
6718
6719	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6720	XML_CHAR_ENCODING_NONE);
6721	if (input == NULL) {
6722	if (ioclose != NULL)
6723	ioclose(ioctx);
6724	return (NULL);
6725	}
6726	ctxt = htmlNewParserCtxt();
6727	if (ctxt == NULL) {
6728	xmlFreeParserInputBuffer(input);
6729	return (NULL);
6730	}
6731	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6732	if (stream == NULL) {
6733	xmlFreeParserInputBuffer(input);
6734	xmlFreeParserCtxt(ctxt);
6735	return (NULL);
6736	}
6737	inputPush(ctxt, stream);
6738	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6739	}
6740
6741	/**
6742	* htmlCtxtReadDoc:
6743	* @ctxt: an HTML parser context
6744	* @str: a pointer to a zero terminated string
6745	* @URL: the base URL to use for the document
6746	* @encoding: the document encoding, or NULL
6747	* @options: a combination of htmlParserOption(s)
6748	*
6749	* parse an XML in-memory document and build a tree.
6750	* This reuses the existing @ctxt parser context
6751	*
6752	* Returns the resulting document tree
6753	*/
6754	htmlDocPtr
6755	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6756	const char URL, const char encoding, int options)
6757	{
6758	xmlParserInputBufferPtr input;
6759	xmlParserInputPtr stream;
6760
6761	if (ctxt == NULL)
6762	return (NULL);
6763	if (str == NULL)
6764	return (NULL);
6765	xmlInitParser();
6766
6767	htmlCtxtReset(ctxt);
6768
6769	input = xmlParserInputBufferCreateString(str);
6770	if (input == NULL) {
6771	return(NULL);
6772	}
6773
6774	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6775	if (stream == NULL) {
6776	xmlFreeParserInputBuffer(input);
6777	return(NULL);
6778	}
6779
6780	inputPush(ctxt, stream);
6781	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6782	}
6783
6784	/**
6785	* htmlCtxtReadFile:
6786	* @ctxt: an HTML parser context
6787	* @filename: a file or URL
6788	* @encoding: the document encoding, or NULL
6789	* @options: a combination of htmlParserOption(s)
6790	*
6791	* parse an XML file from the filesystem or the network.
6792	* This reuses the existing @ctxt parser context
6793	*
6794	* Returns the resulting document tree
6795	*/
6796	htmlDocPtr
6797	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6798	const char *encoding, int options)
6799	{
6800	xmlParserInputPtr stream;
6801
6802	if (filename == NULL)
6803	return (NULL);
6804	if (ctxt == NULL)
6805	return (NULL);
6806	xmlInitParser();
6807
6808	htmlCtxtReset(ctxt);
6809
6810	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6811	if (stream == NULL) {
6812	return (NULL);
6813	}
6814	inputPush(ctxt, stream);
6815	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6816	}
6817
6818	/**
6819	* htmlCtxtReadMemory:
6820	* @ctxt: an HTML parser context
6821	* @buffer: a pointer to a char array
6822	* @size: the size of the array
6823	* @URL: the base URL to use for the document
6824	* @encoding: the document encoding, or NULL
6825	* @options: a combination of htmlParserOption(s)
6826	*
6827	* parse an XML in-memory document and build a tree.
6828	* This reuses the existing @ctxt parser context
6829	*
6830	* Returns the resulting document tree
6831	*/
6832	htmlDocPtr
6833	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6834	const char URL, const char encoding, int options)
6835	{
6836	xmlParserInputBufferPtr input;
6837	xmlParserInputPtr stream;
6838
6839	if (ctxt == NULL)
6840	return (NULL);
6841	if (buffer == NULL)
6842	return (NULL);
6843	xmlInitParser();
6844
6845	htmlCtxtReset(ctxt);
6846
6847	input = xmlParserInputBufferCreateStatic(buffer, size,
6848	XML_CHAR_ENCODING_NONE);
6849	if (input == NULL) {
6850	return(NULL);
6851	}
6852
6853	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6854	if (stream == NULL) {
6855	xmlFreeParserInputBuffer(input);
6856	return(NULL);
6857	}
6858
6859	inputPush(ctxt, stream);
6860	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6861	}
6862
6863	/**
6864	* htmlCtxtReadFd:
6865	* @ctxt: an HTML parser context
6866	* @fd: an open file descriptor
6867	* @URL: the base URL to use for the document
6868	* @encoding: the document encoding, or NULL
6869	* @options: a combination of htmlParserOption(s)
6870	*
6871	* parse an XML from a file descriptor and build a tree.
6872	* This reuses the existing @ctxt parser context
6873	*
6874	* Returns the resulting document tree
6875	*/
6876	htmlDocPtr
6877	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6878	const char URL, const char encoding, int options)
6879	{
6880	xmlParserInputBufferPtr input;
6881	xmlParserInputPtr stream;
6882
6883	if (fd < 0)
6884	return (NULL);
6885	if (ctxt == NULL)
6886	return (NULL);
6887	xmlInitParser();
6888
6889	htmlCtxtReset(ctxt);
6890
6891
6892	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6893	if (input == NULL)
6894	return (NULL);
6895	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6896	if (stream == NULL) {
6897	xmlFreeParserInputBuffer(input);
6898	return (NULL);
6899	}
6900	inputPush(ctxt, stream);
6901	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6902	}
6903
6904	/**
6905	* htmlCtxtReadIO:
6906	* @ctxt: an HTML parser context
6907	* @ioread: an I/O read function
6908	* @ioclose: an I/O close function
6909	* @ioctx: an I/O handler
6910	* @URL: the base URL to use for the document
6911	* @encoding: the document encoding, or NULL
6912	* @options: a combination of htmlParserOption(s)
6913	*
6914	* parse an HTML document from I/O functions and source and build a tree.
6915	* This reuses the existing @ctxt parser context
6916	*
6917	* Returns the resulting document tree
6918	*/
6919	htmlDocPtr
6920	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6921	xmlInputCloseCallback ioclose, void *ioctx,
6922	const char *URL,
6923	const char *encoding, int options)
6924	{
6925	xmlParserInputBufferPtr input;
6926	xmlParserInputPtr stream;
6927
6928	if (ioread == NULL)
6929	return (NULL);
6930	if (ctxt == NULL)
6931	return (NULL);
6932	xmlInitParser();
6933
6934	htmlCtxtReset(ctxt);
6935
6936	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6937	XML_CHAR_ENCODING_NONE);
6938	if (input == NULL) {
6939	if (ioclose != NULL)
6940	ioclose(ioctx);
6941	return (NULL);
6942	}
6943	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6944	if (stream == NULL) {
6945	xmlFreeParserInputBuffer(input);
6946	return (NULL);
6947	}
6948	inputPush(ctxt, stream);
6949	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6950	}
6951
6952	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vbox/trunk/src/libs/libxml2-2.12.6/HTMLparser.c

Download in other formats: