VirtualBox

source: vbox/trunk/src/libs/libxml2-2.12.6/HTMLtree.c

Last change on this file was 104106, checked in by vboxsync, 8 weeks ago

libxml2-2.9.14: Applied and adjusted our libxml2 changes to 2.9.14. bugref:10640

  • Property svn:eol-style set to native
File size: 33.3 KB
Line 
1/*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10#define IN_LIBXML
11#include "libxml.h"
12#ifdef LIBXML_HTML_ENABLED
13
14#include <string.h> /* for memset() only ! */
15#include <ctype.h>
16#include <stdlib.h>
17
18#include <libxml/xmlmemory.h>
19#include <libxml/HTMLparser.h>
20#include <libxml/HTMLtree.h>
21#include <libxml/entities.h>
22#include <libxml/xmlerror.h>
23#include <libxml/parserInternals.h>
24#include <libxml/uri.h>
25
26#include "private/buf.h"
27#include "private/error.h"
28#include "private/io.h"
29#include "private/save.h"
30
31/************************************************************************
32 * *
33 * Getting/Setting encoding meta tags *
34 * *
35 ************************************************************************/
36
37/**
38 * htmlGetMetaEncoding:
39 * @doc: the document
40 *
41 * Encoding definition lookup in the Meta tags
42 *
43 * Returns the current encoding as flagged in the HTML source
44 */
45const xmlChar *
46htmlGetMetaEncoding(htmlDocPtr doc) {
47 htmlNodePtr cur;
48 const xmlChar *content;
49 const xmlChar *encoding;
50
51 if (doc == NULL)
52 return(NULL);
53 cur = doc->children;
54
55 /*
56 * Search the html
57 */
58 while (cur != NULL) {
59 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 break;
62 if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 goto found_head;
64 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 goto found_meta;
66 }
67 cur = cur->next;
68 }
69 if (cur == NULL)
70 return(NULL);
71 cur = cur->children;
72
73 /*
74 * Search the head
75 */
76 while (cur != NULL) {
77 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 break;
80 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 goto found_meta;
82 }
83 cur = cur->next;
84 }
85 if (cur == NULL)
86 return(NULL);
87found_head:
88 cur = cur->children;
89
90 /*
91 * Search the meta elements
92 */
93found_meta:
94 while (cur != NULL) {
95 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 xmlAttrPtr attr = cur->properties;
98 int http;
99 const xmlChar *value;
100
101 content = NULL;
102 http = 0;
103 while (attr != NULL) {
104 if ((attr->children != NULL) &&
105 (attr->children->type == XML_TEXT_NODE) &&
106 (attr->children->next == NULL)) {
107 value = attr->children->content;
108 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 http = 1;
111 else if ((value != NULL)
112 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 content = value;
114 if ((http != 0) && (content != NULL))
115 goto found_content;
116 }
117 attr = attr->next;
118 }
119 }
120 }
121 cur = cur->next;
122 }
123 return(NULL);
124
125found_content:
126 encoding = xmlStrstr(content, BAD_CAST"charset=");
127 if (encoding == NULL)
128 encoding = xmlStrstr(content, BAD_CAST"Charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131 if (encoding != NULL) {
132 encoding += 8;
133 } else {
134 encoding = xmlStrstr(content, BAD_CAST"charset =");
135 if (encoding == NULL)
136 encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 if (encoding != NULL)
140 encoding += 9;
141 }
142 if (encoding != NULL) {
143 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144 }
145 return(encoding);
146}
147
148/**
149 * htmlSetMetaEncoding:
150 * @doc: the document
151 * @encoding: the encoding string
152 *
153 * Sets the current encoding in the Meta tags
154 * NOTE: this will not change the document content encoding, just
155 * the META flag associated.
156 *
157 * Returns 0 in case of success and -1 in case of error
158 */
159int
160htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161 htmlNodePtr cur, meta = NULL, head = NULL;
162 const xmlChar *content = NULL;
163 char newcontent[100];
164
165 newcontent[0] = 0;
166
167 if (doc == NULL)
168 return(-1);
169
170 /* html isn't a real encoding it's just libxml2 way to get entities */
171 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172 return(-1);
173
174 if (encoding != NULL) {
175 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176 (char *)encoding);
177 newcontent[sizeof(newcontent) - 1] = 0;
178 }
179
180 cur = doc->children;
181
182 /*
183 * Search the html
184 */
185 while (cur != NULL) {
186 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 break;
189 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 goto found_head;
191 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 goto found_meta;
193 }
194 cur = cur->next;
195 }
196 if (cur == NULL)
197 return(-1);
198 cur = cur->children;
199
200 /*
201 * Search the head
202 */
203 while (cur != NULL) {
204 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208 head = cur->parent;
209 goto found_meta;
210 }
211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216found_head:
217 head = cur;
218 if (cur->children == NULL)
219 goto create;
220 cur = cur->children;
221
222found_meta:
223 /*
224 * Search and update all the remaining the meta elements carrying
225 * encoding information
226 */
227 while (cur != NULL) {
228 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 xmlAttrPtr attr = cur->properties;
231 int http;
232 const xmlChar *value;
233
234 content = NULL;
235 http = 0;
236 while (attr != NULL) {
237 if ((attr->children != NULL) &&
238 (attr->children->type == XML_TEXT_NODE) &&
239 (attr->children->next == NULL)) {
240 value = attr->children->content;
241 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 http = 1;
244 else
245 {
246 if ((value != NULL) &&
247 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 content = value;
249 }
250 if ((http != 0) && (content != NULL))
251 break;
252 }
253 attr = attr->next;
254 }
255 if ((http != 0) && (content != NULL)) {
256 meta = cur;
257 break;
258 }
259
260 }
261 }
262 cur = cur->next;
263 }
264create:
265 if (meta == NULL) {
266 if ((encoding != NULL) && (head != NULL)) {
267 /*
268 * Create a new Meta element with the right attributes
269 */
270
271 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272 if (head->children == NULL)
273 xmlAddChild(head, meta);
274 else
275 xmlAddPrevSibling(head->children, meta);
276 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278 }
279 } else {
280 /* remove the meta tag if NULL is passed */
281 if (encoding == NULL) {
282 xmlUnlinkNode(meta);
283 xmlFreeNode(meta);
284 }
285 /* change the document only if there is a real encoding change */
286 else if (xmlStrcasestr(content, encoding) == NULL) {
287 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288 }
289 }
290
291
292 return(0);
293}
294
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which are output in
 * minimized form, i.e. <option selected="selected"> will be output as
 * <option selected>, as per XSLT 1.0 16.2 "HTML Output Method".
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL    /* end of list marker */
};
308
309
310/**
311 * htmlIsBooleanAttr:
312 * @name: the name of the attribute to check
313 *
314 * Determine if a given attribute is a boolean attribute.
315 *
316 * returns: false if the attribute is not boolean, true otherwise.
317 */
318int
319htmlIsBooleanAttr(const xmlChar *name)
320{
321 int i = 0;
322
323 while (htmlBooleanAttrs[i] != NULL) {
324 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325 return 1;
326 i++;
327 }
328 return 0;
329}
330
331#ifdef LIBXML_OUTPUT_ENABLED
332/************************************************************************
333 * *
334 * Output error handlers *
335 * *
336 ************************************************************************/
337/**
338 * htmlSaveErrMemory:
339 * @extra: extra information
340 *
341 * Handle an out of memory condition
342 */
343static void
344htmlSaveErrMemory(const char *extra)
345{
346 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
347}
348
349/**
350 * htmlSaveErr:
351 * @code: the error number
352 * @node: the location of the error.
353 * @extra: extra information
354 *
355 * Handle an out of memory condition
356 */
357static void
358htmlSaveErr(int code, xmlNodePtr node, const char *extra)
359{
360 const char *msg = NULL;
361
362 switch(code) {
363 case XML_SAVE_NOT_UTF8:
364 msg = "string is not in UTF-8\n";
365 break;
366 case XML_SAVE_CHAR_INVALID:
367 msg = "invalid character value\n";
368 break;
369 case XML_SAVE_UNKNOWN_ENCODING:
370 msg = "unknown encoding %s\n";
371 break;
372 case XML_SAVE_NO_DOCTYPE:
373 msg = "HTML has no DOCTYPE\n";
374 break;
375 default:
376 msg = "unexpected error number\n";
377 }
378 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
379}
380
381/************************************************************************
382 * *
383 * Dumping HTML tree content to a simple buffer *
384 * *
385 ************************************************************************/
386
387/**
388 * htmlBufNodeDumpFormat:
389 * @buf: the xmlBufPtr output
390 * @doc: the document
391 * @cur: the current node
392 * @format: should formatting spaces been added
393 *
394 * Dump an HTML node, recursive behaviour,children are printed too.
395 *
396 * Returns the number of byte written or -1 in case of error
397 */
398static size_t
399htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
400 int format) {
401 size_t use;
402 int ret;
403 xmlOutputBufferPtr outbuf;
404
405 if (cur == NULL) {
406 return (-1);
407 }
408 if (buf == NULL) {
409 return (-1);
410 }
411 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
412 if (outbuf == NULL) {
413 htmlSaveErrMemory("allocating HTML output buffer");
414 return (-1);
415 }
416 memset(outbuf, 0, sizeof(xmlOutputBuffer));
417 outbuf->buffer = buf;
418 outbuf->encoder = NULL;
419 outbuf->writecallback = NULL;
420 outbuf->closecallback = NULL;
421 outbuf->context = NULL;
422 outbuf->written = 0;
423
424 use = xmlBufUse(buf);
425 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
426 xmlFree(outbuf);
427 ret = xmlBufUse(buf) - use;
428 return (ret);
429}
430
431/**
432 * htmlNodeDump:
433 * @buf: the HTML buffer output
434 * @doc: the document
435 * @cur: the current node
436 *
437 * Dump an HTML node, recursive behaviour,children are printed too,
438 * and formatting returns are added.
439 *
440 * Returns the number of byte written or -1 in case of error
441 */
442int
443htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
444 xmlBufPtr buffer;
445 size_t ret;
446
447 if ((buf == NULL) || (cur == NULL))
448 return(-1);
449
450 xmlInitParser();
451 buffer = xmlBufFromBuffer(buf);
452 if (buffer == NULL)
453 return(-1);
454
455 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
456
457 xmlBufBackToBuffer(buffer);
458
459 if (ret > INT_MAX)
460 return(-1);
461 return((int) ret);
462}
463
464/**
465 * htmlNodeDumpFileFormat:
466 * @out: the FILE pointer
467 * @doc: the document
468 * @cur: the current node
469 * @encoding: the document encoding
470 * @format: should formatting spaces been added
471 *
472 * Dump an HTML node, recursive behaviour,children are printed too.
473 *
474 * TODO: if encoding == NULL try to save in the doc encoding
475 *
476 * returns: the number of byte written or -1 in case of failure.
477 */
478int
479htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
480 xmlNodePtr cur, const char *encoding, int format) {
481 xmlOutputBufferPtr buf;
482 xmlCharEncodingHandlerPtr handler = NULL;
483 int ret;
484
485 xmlInitParser();
486
487 if (encoding != NULL) {
488 xmlCharEncoding enc;
489
490 enc = xmlParseCharEncoding(encoding);
491 if (enc != XML_CHAR_ENCODING_UTF8) {
492 handler = xmlFindCharEncodingHandler(encoding);
493 if (handler == NULL)
494 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
495 }
496 } else {
497 /*
498 * Fallback to HTML or ASCII when the encoding is unspecified
499 */
500 if (handler == NULL)
501 handler = xmlFindCharEncodingHandler("HTML");
502 if (handler == NULL)
503 handler = xmlFindCharEncodingHandler("ascii");
504 }
505
506 /*
507 * save the content to a temp buffer.
508 */
509 buf = xmlOutputBufferCreateFile(out, handler);
510 if (buf == NULL) return(0);
511
512 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
513
514 ret = xmlOutputBufferClose(buf);
515 return(ret);
516}
517
518/**
519 * htmlNodeDumpFile:
520 * @out: the FILE pointer
521 * @doc: the document
522 * @cur: the current node
523 *
524 * Dump an HTML node, recursive behaviour,children are printed too,
525 * and formatting returns are added.
526 */
527void
528htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
529 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
530}
531
532/**
533 * htmlDocDumpMemoryFormat:
534 * @cur: the document
535 * @mem: OUT: the memory pointer
536 * @size: OUT: the memory length
537 * @format: should formatting spaces been added
538 *
539 * Dump an HTML document in memory and return the xmlChar * and it's size.
540 * It's up to the caller to free the memory.
541 */
542void
543htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
544 xmlOutputBufferPtr buf;
545 xmlCharEncodingHandlerPtr handler = NULL;
546 const char *encoding;
547
548 xmlInitParser();
549
550 if ((mem == NULL) || (size == NULL))
551 return;
552 if (cur == NULL) {
553 *mem = NULL;
554 *size = 0;
555 return;
556 }
557
558 encoding = (const char *) htmlGetMetaEncoding(cur);
559
560 if (encoding != NULL) {
561 xmlCharEncoding enc;
562
563 enc = xmlParseCharEncoding(encoding);
564 if (enc != XML_CHAR_ENCODING_UTF8) {
565 handler = xmlFindCharEncodingHandler(encoding);
566 if (handler == NULL)
567 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
568
569 }
570 } else {
571 /*
572 * Fallback to HTML or ASCII when the encoding is unspecified
573 */
574 if (handler == NULL)
575 handler = xmlFindCharEncodingHandler("HTML");
576 if (handler == NULL)
577 handler = xmlFindCharEncodingHandler("ascii");
578 }
579
580 buf = xmlAllocOutputBufferInternal(handler);
581 if (buf == NULL) {
582 *mem = NULL;
583 *size = 0;
584 return;
585 }
586
587 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
588
589 xmlOutputBufferFlush(buf);
590 if (buf->conv != NULL) {
591 *size = xmlBufUse(buf->conv);
592 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
593 } else {
594 *size = xmlBufUse(buf->buffer);
595 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
596 }
597 (void)xmlOutputBufferClose(buf);
598}
599
600/**
601 * htmlDocDumpMemory:
602 * @cur: the document
603 * @mem: OUT: the memory pointer
604 * @size: OUT: the memory length
605 *
606 * Dump an HTML document in memory and return the xmlChar * and it's size.
607 * It's up to the caller to free the memory.
608 */
609void
610htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
611 htmlDocDumpMemoryFormat(cur, mem, size, 1);
612}
613
614
615/************************************************************************
616 * *
617 * Dumping HTML tree content to an I/O output buffer *
618 * *
619 ************************************************************************/
620
621/**
622 * htmlDtdDumpOutput:
623 * @buf: the HTML buffer output
624 * @doc: the document
625 * @encoding: the encoding string
626 *
627 * TODO: check whether encoding is needed
628 *
629 * Dump the HTML document DTD, if any.
630 */
631static void
632htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
633 const char *encoding ATTRIBUTE_UNUSED) {
634 xmlDtdPtr cur = doc->intSubset;
635
636 if (cur == NULL) {
637 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
638 return;
639 }
640 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
641 xmlOutputBufferWriteString(buf, (const char *)cur->name);
642 if (cur->ExternalID != NULL) {
643 xmlOutputBufferWriteString(buf, " PUBLIC ");
644 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
645 if (cur->SystemID != NULL) {
646 xmlOutputBufferWriteString(buf, " ");
647 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
648 }
649 } else if (cur->SystemID != NULL &&
650 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
651 xmlOutputBufferWriteString(buf, " SYSTEM ");
652 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
653 }
654 xmlOutputBufferWriteString(buf, ">\n");
655}
656
657/**
658 * htmlAttrDumpOutput:
659 * @buf: the HTML buffer output
660 * @doc: the document
661 * @cur: the attribute pointer
662 *
663 * Dump an HTML attribute
664 */
665static void
666htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
667 xmlChar *value;
668
669 /*
670 * The html output method should not escape a & character
671 * occurring in an attribute value immediately followed by
672 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
673 * This is implemented in xmlEncodeEntitiesReentrant
674 */
675
676 if (cur == NULL) {
677 return;
678 }
679 xmlOutputBufferWriteString(buf, " ");
680 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
681 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
682 xmlOutputBufferWriteString(buf, ":");
683 }
684 xmlOutputBufferWriteString(buf, (const char *)cur->name);
685 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
686 value = xmlNodeListGetString(doc, cur->children, 0);
687 if (value) {
688 xmlOutputBufferWriteString(buf, "=");
689 if ((cur->ns == NULL) && (cur->parent != NULL) &&
690 (cur->parent->ns == NULL) &&
691 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
692 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
693 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
694 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
695 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
696 xmlChar *escaped;
697 xmlChar *tmp = value;
698
699 while (IS_BLANK_CH(*tmp)) tmp++;
700
701 /*
702 * Angle brackets are technically illegal in URIs, but they're
703 * used in server side includes, for example. Curly brackets
704 * are illegal as well and often used in templates.
705 * Don't escape non-whitespace, printable ASCII chars for
706 * improved interoperability. Only escape space, control
707 * and non-ASCII chars.
708 */
709 escaped = xmlURIEscapeStr(tmp,
710 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
711 if (escaped != NULL) {
712 xmlBufWriteQuotedString(buf->buffer, escaped);
713 xmlFree(escaped);
714 } else {
715 xmlBufWriteQuotedString(buf->buffer, value);
716 }
717 } else {
718 xmlBufWriteQuotedString(buf->buffer, value);
719 }
720 xmlFree(value);
721 } else {
722 xmlOutputBufferWriteString(buf, "=\"\"");
723 }
724 }
725}
726
727/**
728 * htmlNodeDumpFormatOutput:
729 * @buf: the HTML buffer output
730 * @doc: the document
731 * @cur: the current node
732 * @encoding: the encoding string (unused)
733 * @format: should formatting spaces been added
734 *
735 * Dump an HTML node, recursive behaviour,children are printed too.
736 */
737void
738htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
740 int format) {
741 xmlNodePtr root, parent;
742 xmlAttrPtr attr;
743 const htmlElemDesc * info;
744
745 xmlInitParser();
746
747 if ((cur == NULL) || (buf == NULL)) {
748 return;
749 }
750
751 root = cur;
752 parent = cur->parent;
753 while (1) {
754 switch (cur->type) {
755 case XML_HTML_DOCUMENT_NODE:
756 case XML_DOCUMENT_NODE:
757 if (((xmlDocPtr) cur)->intSubset != NULL) {
758 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
759 }
760 if (cur->children != NULL) {
761 /* Always validate cur->parent when descending. */
762 if (cur->parent == parent) {
763 parent = cur;
764 cur = cur->children;
765 continue;
766 }
767 } else {
768 xmlOutputBufferWriteString(buf, "\n");
769 }
770 break;
771
772 case XML_ELEMENT_NODE:
773 /*
774 * Some users like lxml are known to pass nodes with a corrupted
775 * tree structure. Fall back to a recursive call to handle this
776 * case.
777 */
778 if ((cur->parent != parent) && (cur->children != NULL)) {
779 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
780 break;
781 }
782
783 /*
784 * Get specific HTML info for that node.
785 */
786 if (cur->ns == NULL)
787 info = htmlTagLookup(cur->name);
788 else
789 info = NULL;
790
791 xmlOutputBufferWriteString(buf, "<");
792 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
793 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
794 xmlOutputBufferWriteString(buf, ":");
795 }
796 xmlOutputBufferWriteString(buf, (const char *)cur->name);
797 if (cur->nsDef)
798 xmlNsListDumpOutput(buf, cur->nsDef);
799 attr = cur->properties;
800 while (attr != NULL) {
801 htmlAttrDumpOutput(buf, doc, attr);
802 attr = attr->next;
803 }
804
805 if ((info != NULL) && (info->empty)) {
806 xmlOutputBufferWriteString(buf, ">");
807 } else if (cur->children == NULL) {
808 if ((info != NULL) && (info->saveEndTag != 0) &&
809 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
810 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
811 xmlOutputBufferWriteString(buf, ">");
812 } else {
813 xmlOutputBufferWriteString(buf, "></");
814 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
815 xmlOutputBufferWriteString(buf,
816 (const char *)cur->ns->prefix);
817 xmlOutputBufferWriteString(buf, ":");
818 }
819 xmlOutputBufferWriteString(buf, (const char *)cur->name);
820 xmlOutputBufferWriteString(buf, ">");
821 }
822 } else {
823 xmlOutputBufferWriteString(buf, ">");
824 if ((format) && (info != NULL) && (!info->isinline) &&
825 (cur->children->type != HTML_TEXT_NODE) &&
826 (cur->children->type != HTML_ENTITY_REF_NODE) &&
827 (cur->children != cur->last) &&
828 (cur->name != NULL) &&
829 (cur->name[0] != 'p')) /* p, pre, param */
830 xmlOutputBufferWriteString(buf, "\n");
831 parent = cur;
832 cur = cur->children;
833 continue;
834 }
835
836 if ((format) && (cur->next != NULL) &&
837 (info != NULL) && (!info->isinline)) {
838 if ((cur->next->type != HTML_TEXT_NODE) &&
839 (cur->next->type != HTML_ENTITY_REF_NODE) &&
840 (parent != NULL) &&
841 (parent->name != NULL) &&
842 (parent->name[0] != 'p')) /* p, pre, param */
843 xmlOutputBufferWriteString(buf, "\n");
844 }
845
846 break;
847
848 case XML_ATTRIBUTE_NODE:
849 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
850 break;
851
852 case HTML_TEXT_NODE:
853 if (cur->content == NULL)
854 break;
855 if (((cur->name == (const xmlChar *)xmlStringText) ||
856 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
857 ((parent == NULL) ||
858 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
859 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
860 xmlChar *buffer;
861
862 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863 if (buffer != NULL) {
864 xmlOutputBufferWriteString(buf, (const char *)buffer);
865 xmlFree(buffer);
866 }
867 } else {
868 xmlOutputBufferWriteString(buf, (const char *)cur->content);
869 }
870 break;
871
872 case HTML_COMMENT_NODE:
873 if (cur->content != NULL) {
874 xmlOutputBufferWriteString(buf, "<!--");
875 xmlOutputBufferWriteString(buf, (const char *)cur->content);
876 xmlOutputBufferWriteString(buf, "-->");
877 }
878 break;
879
880 case HTML_PI_NODE:
881 if (cur->name != NULL) {
882 xmlOutputBufferWriteString(buf, "<?");
883 xmlOutputBufferWriteString(buf, (const char *)cur->name);
884 if (cur->content != NULL) {
885 xmlOutputBufferWriteString(buf, " ");
886 xmlOutputBufferWriteString(buf,
887 (const char *)cur->content);
888 }
889 xmlOutputBufferWriteString(buf, ">");
890 }
891 break;
892
893 case HTML_ENTITY_REF_NODE:
894 xmlOutputBufferWriteString(buf, "&");
895 xmlOutputBufferWriteString(buf, (const char *)cur->name);
896 xmlOutputBufferWriteString(buf, ";");
897 break;
898
899 case HTML_PRESERVE_NODE:
900 if (cur->content != NULL) {
901 xmlOutputBufferWriteString(buf, (const char *)cur->content);
902 }
903 break;
904
905 default:
906 break;
907 }
908
909 while (1) {
910 if (cur == root)
911 return;
912 if (cur->next != NULL) {
913 cur = cur->next;
914 break;
915 }
916
917 cur = parent;
918 /* cur->parent was validated when descending. */
919 parent = cur->parent;
920
921 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
922 (cur->type == XML_DOCUMENT_NODE)) {
923 xmlOutputBufferWriteString(buf, "\n");
924 } else {
925 if ((format) && (cur->ns == NULL))
926 info = htmlTagLookup(cur->name);
927 else
928 info = NULL;
929
930 if ((format) && (info != NULL) && (!info->isinline) &&
931 (cur->last->type != HTML_TEXT_NODE) &&
932 (cur->last->type != HTML_ENTITY_REF_NODE) &&
933 (cur->children != cur->last) &&
934 (cur->name != NULL) &&
935 (cur->name[0] != 'p')) /* p, pre, param */
936 xmlOutputBufferWriteString(buf, "\n");
937
938 xmlOutputBufferWriteString(buf, "</");
939 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941 xmlOutputBufferWriteString(buf, ":");
942 }
943 xmlOutputBufferWriteString(buf, (const char *)cur->name);
944 xmlOutputBufferWriteString(buf, ">");
945
946 if ((format) && (info != NULL) && (!info->isinline) &&
947 (cur->next != NULL)) {
948 if ((cur->next->type != HTML_TEXT_NODE) &&
949 (cur->next->type != HTML_ENTITY_REF_NODE) &&
950 (parent != NULL) &&
951 (parent->name != NULL) &&
952 (parent->name[0] != 'p')) /* p, pre, param */
953 xmlOutputBufferWriteString(buf, "\n");
954 }
955 }
956 }
957 }
958}
959
960/**
961 * htmlNodeDumpOutput:
962 * @buf: the HTML buffer output
963 * @doc: the document
964 * @cur: the current node
965 * @encoding: the encoding string (unused)
966 *
967 * Dump an HTML node, recursive behaviour,children are printed too,
968 * and formatting returns/spaces are added.
969 */
970void
971htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
973 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
974}
975
976/**
977 * htmlDocContentDumpFormatOutput:
978 * @buf: the HTML buffer output
979 * @cur: the document
980 * @encoding: the encoding string (unused)
981 * @format: should formatting spaces been added
982 *
983 * Dump an HTML document.
984 */
985void
986htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987 const char *encoding ATTRIBUTE_UNUSED,
988 int format) {
989 int type = 0;
990 if (cur) {
991 type = cur->type;
992 cur->type = XML_HTML_DOCUMENT_NODE;
993 }
994 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
995 if (cur)
996 cur->type = (xmlElementType) type;
997}
998
999/**
1000 * htmlDocContentDumpOutput:
1001 * @buf: the HTML buffer output
1002 * @cur: the document
1003 * @encoding: the encoding string (unused)
1004 *
1005 * Dump an HTML document. Formatting return/spaces are added.
1006 */
1007void
1008htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1009 const char *encoding ATTRIBUTE_UNUSED) {
1010 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1011}
1012
1013/************************************************************************
1014 * *
1015 * Saving functions front-ends *
1016 * *
1017 ************************************************************************/
1018
1019/**
1020 * htmlDocDump:
1021 * @f: the FILE*
1022 * @cur: the document
1023 *
1024 * Dump an HTML document to an open FILE.
1025 *
1026 * returns: the number of byte written or -1 in case of failure.
1027 */
1028int
1029htmlDocDump(FILE *f, xmlDocPtr cur) {
1030 xmlOutputBufferPtr buf;
1031 xmlCharEncodingHandlerPtr handler = NULL;
1032 const char *encoding;
1033 int ret;
1034
1035 xmlInitParser();
1036
1037 if ((cur == NULL) || (f == NULL)) {
1038 return(-1);
1039 }
1040
1041 encoding = (const char *) htmlGetMetaEncoding(cur);
1042
1043 if (encoding != NULL) {
1044 xmlCharEncoding enc;
1045
1046 enc = xmlParseCharEncoding(encoding);
1047 if (enc != XML_CHAR_ENCODING_UTF8) {
1048 handler = xmlFindCharEncodingHandler(encoding);
1049 if (handler == NULL)
1050 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1051 }
1052 } else {
1053 /*
1054 * Fallback to HTML or ASCII when the encoding is unspecified
1055 */
1056 if (handler == NULL)
1057 handler = xmlFindCharEncodingHandler("HTML");
1058 if (handler == NULL)
1059 handler = xmlFindCharEncodingHandler("ascii");
1060 }
1061
1062 buf = xmlOutputBufferCreateFile(f, handler);
1063 if (buf == NULL) return(-1);
1064 htmlDocContentDumpOutput(buf, cur, NULL);
1065
1066 ret = xmlOutputBufferClose(buf);
1067 return(ret);
1068}
1069
1070/**
1071 * htmlSaveFile:
1072 * @filename: the filename (or URL)
1073 * @cur: the document
1074 *
1075 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1076 * used.
1077 * returns: the number of byte written or -1 in case of failure.
1078 */
1079int
1080htmlSaveFile(const char *filename, xmlDocPtr cur) {
1081 xmlOutputBufferPtr buf;
1082 xmlCharEncodingHandlerPtr handler = NULL;
1083 const char *encoding;
1084 int ret;
1085
1086 if ((cur == NULL) || (filename == NULL))
1087 return(-1);
1088
1089 xmlInitParser();
1090
1091 encoding = (const char *) htmlGetMetaEncoding(cur);
1092
1093 if (encoding != NULL) {
1094 xmlCharEncoding enc;
1095
1096 enc = xmlParseCharEncoding(encoding);
1097 if (enc != XML_CHAR_ENCODING_UTF8) {
1098 handler = xmlFindCharEncodingHandler(encoding);
1099 if (handler == NULL)
1100 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1101 }
1102 } else {
1103 /*
1104 * Fallback to HTML or ASCII when the encoding is unspecified
1105 */
1106 if (handler == NULL)
1107 handler = xmlFindCharEncodingHandler("HTML");
1108 if (handler == NULL)
1109 handler = xmlFindCharEncodingHandler("ascii");
1110 }
1111
1112 /*
1113 * save the content to a temp buffer.
1114 */
1115 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1116 if (buf == NULL) return(0);
1117
1118 htmlDocContentDumpOutput(buf, cur, NULL);
1119
1120 ret = xmlOutputBufferClose(buf);
1121 return(ret);
1122}
1123
1124/**
1125 * htmlSaveFileFormat:
1126 * @filename: the filename
1127 * @cur: the document
1128 * @format: should formatting spaces been added
1129 * @encoding: the document encoding
1130 *
1131 * Dump an HTML document to a file using a given encoding.
1132 *
1133 * returns: the number of byte written or -1 in case of failure.
1134 */
1135int
1136htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1137 const char *encoding, int format) {
1138 xmlOutputBufferPtr buf;
1139 xmlCharEncodingHandlerPtr handler = NULL;
1140 int ret;
1141
1142 if ((cur == NULL) || (filename == NULL))
1143 return(-1);
1144
1145 xmlInitParser();
1146
1147 if (encoding != NULL) {
1148 xmlCharEncoding enc;
1149
1150 enc = xmlParseCharEncoding(encoding);
1151 if (enc != XML_CHAR_ENCODING_UTF8) {
1152 handler = xmlFindCharEncodingHandler(encoding);
1153 if (handler == NULL)
1154 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1155 }
1156 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1157 } else {
1158 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1159
1160 /*
1161 * Fallback to HTML or ASCII when the encoding is unspecified
1162 */
1163 if (handler == NULL)
1164 handler = xmlFindCharEncodingHandler("HTML");
1165 if (handler == NULL)
1166 handler = xmlFindCharEncodingHandler("ascii");
1167 }
1168
1169 /*
1170 * save the content to a temp buffer.
1171 */
1172 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1173 if (buf == NULL) return(0);
1174
1175 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1176
1177 ret = xmlOutputBufferClose(buf);
1178 return(ret);
1179}
1180
1181/**
1182 * htmlSaveFileEnc:
1183 * @filename: the filename
1184 * @cur: the document
1185 * @encoding: the document encoding
1186 *
1187 * Dump an HTML document to a file using a given encoding
1188 * and formatting returns/spaces are added.
1189 *
1190 * returns: the number of byte written or -1 in case of failure.
1191 */
1192int
1193htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1194 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1195}
1196
1197#endif /* LIBXML_OUTPUT_ENABLED */
1198
1199#endif /* LIBXML_HTML_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
Contact | Privacy policy | Terms of Use