VirtualBox

source: vbox/trunk/src/libs/libxml2-2.6.30/HTMLtree.c@ 25275

Last change on this file since 25275 was 6076, checked in by vboxsync, 17 years ago

Merged dmik/s2 branch (r25959:26751) to the trunk.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Date Revision Author Id
File size: 31.1 KB
Line 
1/*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10#define IN_LIBXML
11#include "libxml.h"
12#ifdef LIBXML_HTML_ENABLED
13
14#include <string.h> /* for memset() only ! */
15
16#ifdef HAVE_CTYPE_H
17#include <ctype.h>
18#endif
19#ifdef HAVE_STDLIB_H
20#include <stdlib.h>
21#endif
22
23#include <libxml/xmlmemory.h>
24#include <libxml/HTMLparser.h>
25#include <libxml/HTMLtree.h>
26#include <libxml/entities.h>
27#include <libxml/valid.h>
28#include <libxml/xmlerror.h>
29#include <libxml/parserInternals.h>
30#include <libxml/globals.h>
31#include <libxml/uri.h>
32
33/************************************************************************
34 * *
35 * Getting/Setting encoding meta tags *
36 * *
37 ************************************************************************/
38
39/**
40 * htmlGetMetaEncoding:
41 * @doc: the document
42 *
43 * Encoding definition lookup in the Meta tags
44 *
45 * Returns the current encoding as flagged in the HTML source
46 */
47const xmlChar *
48htmlGetMetaEncoding(htmlDocPtr doc) {
49 htmlNodePtr cur;
50 const xmlChar *content;
51 const xmlChar *encoding;
52
53 if (doc == NULL)
54 return(NULL);
55 cur = doc->children;
56
57 /*
58 * Search the html
59 */
60 while (cur != NULL) {
61 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62 if (xmlStrEqual(cur->name, BAD_CAST"html"))
63 break;
64 if (xmlStrEqual(cur->name, BAD_CAST"head"))
65 goto found_head;
66 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67 goto found_meta;
68 }
69 cur = cur->next;
70 }
71 if (cur == NULL)
72 return(NULL);
73 cur = cur->children;
74
75 /*
76 * Search the head
77 */
78 while (cur != NULL) {
79 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80 if (xmlStrEqual(cur->name, BAD_CAST"head"))
81 break;
82 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83 goto found_meta;
84 }
85 cur = cur->next;
86 }
87 if (cur == NULL)
88 return(NULL);
89found_head:
90 cur = cur->children;
91
92 /*
93 * Search the meta elements
94 */
95found_meta:
96 while (cur != NULL) {
97 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99 xmlAttrPtr attr = cur->properties;
100 int http;
101 const xmlChar *value;
102
103 content = NULL;
104 http = 0;
105 while (attr != NULL) {
106 if ((attr->children != NULL) &&
107 (attr->children->type == XML_TEXT_NODE) &&
108 (attr->children->next == NULL)) {
109 value = attr->children->content;
110 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112 http = 1;
113 else if ((value != NULL)
114 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115 content = value;
116 if ((http != 0) && (content != NULL))
117 goto found_content;
118 }
119 attr = attr->next;
120 }
121 }
122 }
123 cur = cur->next;
124 }
125 return(NULL);
126
127found_content:
128 encoding = xmlStrstr(content, BAD_CAST"charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"Charset=");
131 if (encoding == NULL)
132 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133 if (encoding != NULL) {
134 encoding += 8;
135 } else {
136 encoding = xmlStrstr(content, BAD_CAST"charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"Charset =");
139 if (encoding == NULL)
140 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141 if (encoding != NULL)
142 encoding += 9;
143 }
144 if (encoding != NULL) {
145 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146 }
147 return(encoding);
148}
149
150/**
151 * htmlSetMetaEncoding:
152 * @doc: the document
153 * @encoding: the encoding string
154 *
155 * Sets the current encoding in the Meta tags
156 * NOTE: this will not change the document content encoding, just
157 * the META flag associated.
158 *
159 * Returns 0 in case of success and -1 in case of error
160 */
161int
162htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163 htmlNodePtr cur, meta;
164 const xmlChar *content;
165 char newcontent[100];
166
167
168 if (doc == NULL)
169 return(-1);
170
171 if (encoding != NULL) {
172 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173 (char *)encoding);
174 newcontent[sizeof(newcontent) - 1] = 0;
175 }
176
177 cur = doc->children;
178
179 /*
180 * Search the html
181 */
182 while (cur != NULL) {
183 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185 break;
186 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187 goto found_head;
188 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189 goto found_meta;
190 }
191 cur = cur->next;
192 }
193 if (cur == NULL)
194 return(-1);
195 cur = cur->children;
196
197 /*
198 * Search the head
199 */
200 while (cur != NULL) {
201 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203 break;
204 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205 goto found_meta;
206 }
207 cur = cur->next;
208 }
209 if (cur == NULL)
210 return(-1);
211found_head:
212 if (cur->children == NULL) {
213 if (encoding == NULL)
214 return(0);
215 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216 xmlAddChild(cur, meta);
217 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219 return(0);
220 }
221 cur = cur->children;
222
223found_meta:
224 if (encoding != NULL) {
225 /*
226 * Create a new Meta element with the right attributes
227 */
228
229 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230 xmlAddPrevSibling(cur, meta);
231 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233 }
234
235 /*
236 * Search and destroy all the remaining the meta elements carrying
237 * encoding informations
238 */
239 while (cur != NULL) {
240 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242 xmlAttrPtr attr = cur->properties;
243 int http;
244 const xmlChar *value;
245
246 content = NULL;
247 http = 0;
248 while (attr != NULL) {
249 if ((attr->children != NULL) &&
250 (attr->children->type == XML_TEXT_NODE) &&
251 (attr->children->next == NULL)) {
252 value = attr->children->content;
253 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255 http = 1;
256 else
257 {
258 if ((value != NULL) &&
259 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260 content = value;
261 }
262 if ((http != 0) && (content != NULL))
263 break;
264 }
265 attr = attr->next;
266 }
267 if ((http != 0) && (content != NULL)) {
268 meta = cur;
269 cur = cur->next;
270 xmlUnlinkNode(meta);
271 xmlFreeNode(meta);
272 continue;
273 }
274
275 }
276 }
277 cur = cur->next;
278 }
279 return(0);
280}
281
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
295
296
297/**
298 * htmlIsBooleanAttr:
299 * @name: the name of the attribute to check
300 *
301 * Determine if a given attribute is a boolean attribute.
302 *
303 * returns: false if the attribute is not boolean, true otherwise.
304 */
305int
306htmlIsBooleanAttr(const xmlChar *name)
307{
308 int i = 0;
309
310 while (htmlBooleanAttrs[i] != NULL) {
311 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312 return 1;
313 i++;
314 }
315 return 0;
316}
317
318#ifdef LIBXML_OUTPUT_ENABLED
319/************************************************************************
320 * *
321 * Output error handlers *
322 * *
323 ************************************************************************/
324/**
325 * htmlSaveErrMemory:
326 * @extra: extra informations
327 *
328 * Handle an out of memory condition
329 */
330static void
331htmlSaveErrMemory(const char *extra)
332{
333 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
334}
335
336/**
337 * htmlSaveErr:
338 * @code: the error number
339 * @node: the location of the error.
340 * @extra: extra informations
341 *
342 * Handle an out of memory condition
343 */
344static void
345htmlSaveErr(int code, xmlNodePtr node, const char *extra)
346{
347 const char *msg = NULL;
348
349 switch(code) {
350 case XML_SAVE_NOT_UTF8:
351 msg = "string is not in UTF-8\n";
352 break;
353 case XML_SAVE_CHAR_INVALID:
354 msg = "invalid character value\n";
355 break;
356 case XML_SAVE_UNKNOWN_ENCODING:
357 msg = "unknown encoding %s\n";
358 break;
359 case XML_SAVE_NO_DOCTYPE:
360 msg = "HTML has no DOCTYPE\n";
361 break;
362 default:
363 msg = "unexpected error number\n";
364 }
365 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
366}
367
368/************************************************************************
369 * *
370 * Dumping HTML tree content to a simple buffer *
371 * *
372 ************************************************************************/
373
374static int
375htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
376 int format);
377
378/**
379 * htmlNodeDumpFormat:
380 * @buf: the HTML buffer output
381 * @doc: the document
382 * @cur: the current node
383 * @format: should formatting spaces been added
384 *
385 * Dump an HTML node, recursive behaviour,children are printed too.
386 *
387 * Returns the number of byte written or -1 in case of error
388 */
389static int
390htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
391 int format) {
392 unsigned int use;
393 int ret;
394 xmlOutputBufferPtr outbuf;
395
396 if (cur == NULL) {
397 return (-1);
398 }
399 if (buf == NULL) {
400 return (-1);
401 }
402 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
403 if (outbuf == NULL) {
404 htmlSaveErrMemory("allocating HTML output buffer");
405 return (-1);
406 }
407 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
408 outbuf->buffer = buf;
409 outbuf->encoder = NULL;
410 outbuf->writecallback = NULL;
411 outbuf->closecallback = NULL;
412 outbuf->context = NULL;
413 outbuf->written = 0;
414
415 use = buf->use;
416 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
417 xmlFree(outbuf);
418 ret = buf->use - use;
419 return (ret);
420}
421
422/**
423 * htmlNodeDump:
424 * @buf: the HTML buffer output
425 * @doc: the document
426 * @cur: the current node
427 *
428 * Dump an HTML node, recursive behaviour,children are printed too,
429 * and formatting returns are added.
430 *
431 * Returns the number of byte written or -1 in case of error
432 */
433int
434htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
435 xmlInitParser();
436
437 return(htmlNodeDumpFormat(buf, doc, cur, 1));
438}
439
440/**
441 * htmlNodeDumpFileFormat:
442 * @out: the FILE pointer
443 * @doc: the document
444 * @cur: the current node
445 * @encoding: the document encoding
446 * @format: should formatting spaces been added
447 *
448 * Dump an HTML node, recursive behaviour,children are printed too.
449 *
450 * TODO: if encoding == NULL try to save in the doc encoding
451 *
452 * returns: the number of byte written or -1 in case of failure.
453 */
454int
455htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
456 xmlNodePtr cur, const char *encoding, int format) {
457 xmlOutputBufferPtr buf;
458 xmlCharEncodingHandlerPtr handler = NULL;
459 int ret;
460
461 xmlInitParser();
462
463 if (encoding != NULL) {
464 xmlCharEncoding enc;
465
466 enc = xmlParseCharEncoding(encoding);
467 if (enc != XML_CHAR_ENCODING_UTF8) {
468 handler = xmlFindCharEncodingHandler(encoding);
469 if (handler == NULL)
470 return(-1);
471 }
472 }
473
474 /*
475 * Fallback to HTML or ASCII when the encoding is unspecified
476 */
477 if (handler == NULL)
478 handler = xmlFindCharEncodingHandler("HTML");
479 if (handler == NULL)
480 handler = xmlFindCharEncodingHandler("ascii");
481
482 /*
483 * save the content to a temp buffer.
484 */
485 buf = xmlOutputBufferCreateFile(out, handler);
486 if (buf == NULL) return(0);
487
488 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
489
490 ret = xmlOutputBufferClose(buf);
491 return(ret);
492}
493
494/**
495 * htmlNodeDumpFile:
496 * @out: the FILE pointer
497 * @doc: the document
498 * @cur: the current node
499 *
500 * Dump an HTML node, recursive behaviour,children are printed too,
501 * and formatting returns are added.
502 */
503void
504htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
505 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
506}
507
508/**
509 * htmlDocDumpMemoryFormat:
510 * @cur: the document
511 * @mem: OUT: the memory pointer
512 * @size: OUT: the memory length
513 * @format: should formatting spaces been added
514 *
515 * Dump an HTML document in memory and return the xmlChar * and it's size.
516 * It's up to the caller to free the memory.
517 */
518void
519htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
520 xmlOutputBufferPtr buf;
521 xmlCharEncodingHandlerPtr handler = NULL;
522 const char *encoding;
523
524 xmlInitParser();
525
526 if ((mem == NULL) || (size == NULL))
527 return;
528 if (cur == NULL) {
529 *mem = NULL;
530 *size = 0;
531 return;
532 }
533
534 encoding = (const char *) htmlGetMetaEncoding(cur);
535
536 if (encoding != NULL) {
537 xmlCharEncoding enc;
538
539 enc = xmlParseCharEncoding(encoding);
540 if (enc != cur->charset) {
541 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
542 /*
543 * Not supported yet
544 */
545 *mem = NULL;
546 *size = 0;
547 return;
548 }
549
550 handler = xmlFindCharEncodingHandler(encoding);
551 if (handler == NULL) {
552 *mem = NULL;
553 *size = 0;
554 return;
555 }
556 } else {
557 handler = xmlFindCharEncodingHandler(encoding);
558 }
559 }
560
561 /*
562 * Fallback to HTML or ASCII when the encoding is unspecified
563 */
564 if (handler == NULL)
565 handler = xmlFindCharEncodingHandler("HTML");
566 if (handler == NULL)
567 handler = xmlFindCharEncodingHandler("ascii");
568
569 buf = xmlAllocOutputBuffer(handler);
570 if (buf == NULL) {
571 *mem = NULL;
572 *size = 0;
573 return;
574 }
575
576 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
577
578 xmlOutputBufferFlush(buf);
579 if (buf->conv != NULL) {
580 *size = buf->conv->use;
581 *mem = xmlStrndup(buf->conv->content, *size);
582 } else {
583 *size = buf->buffer->use;
584 *mem = xmlStrndup(buf->buffer->content, *size);
585 }
586 (void)xmlOutputBufferClose(buf);
587}
588
589/**
590 * htmlDocDumpMemory:
591 * @cur: the document
592 * @mem: OUT: the memory pointer
593 * @size: OUT: the memory length
594 *
595 * Dump an HTML document in memory and return the xmlChar * and it's size.
596 * It's up to the caller to free the memory.
597 */
598void
599htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
600 htmlDocDumpMemoryFormat(cur, mem, size, 1);
601}
602
603
604/************************************************************************
605 * *
606 * Dumping HTML tree content to an I/O output buffer *
607 * *
608 ************************************************************************/
609
610void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
611
612/**
613 * htmlDtdDumpOutput:
614 * @buf: the HTML buffer output
615 * @doc: the document
616 * @encoding: the encoding string
617 *
618 * TODO: check whether encoding is needed
619 *
620 * Dump the HTML document DTD, if any.
621 */
622static void
623htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
624 const char *encoding ATTRIBUTE_UNUSED) {
625 xmlDtdPtr cur = doc->intSubset;
626
627 if (cur == NULL) {
628 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
629 return;
630 }
631 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
632 xmlOutputBufferWriteString(buf, (const char *)cur->name);
633 if (cur->ExternalID != NULL) {
634 xmlOutputBufferWriteString(buf, " PUBLIC ");
635 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
636 if (cur->SystemID != NULL) {
637 xmlOutputBufferWriteString(buf, " ");
638 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
639 }
640 } else if (cur->SystemID != NULL) {
641 xmlOutputBufferWriteString(buf, " SYSTEM ");
642 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
643 }
644 xmlOutputBufferWriteString(buf, ">\n");
645}
646
647/**
648 * htmlAttrDumpOutput:
649 * @buf: the HTML buffer output
650 * @doc: the document
651 * @cur: the attribute pointer
652 * @encoding: the encoding string
653 *
654 * Dump an HTML attribute
655 */
656static void
657htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
658 const char *encoding ATTRIBUTE_UNUSED) {
659 xmlChar *value;
660
661 /*
662 * TODO: The html output method should not escape a & character
663 * occurring in an attribute value immediately followed by
664 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
665 */
666
667 if (cur == NULL) {
668 return;
669 }
670 xmlOutputBufferWriteString(buf, " ");
671 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
672 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
673 xmlOutputBufferWriteString(buf, ":");
674 }
675 xmlOutputBufferWriteString(buf, (const char *)cur->name);
676 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
677 value = xmlNodeListGetString(doc, cur->children, 0);
678 if (value) {
679 xmlOutputBufferWriteString(buf, "=");
680 if ((cur->ns == NULL) && (cur->parent != NULL) &&
681 (cur->parent->ns == NULL) &&
682 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
683 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
684 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
685 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
686 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
687 xmlChar *escaped;
688 xmlChar *tmp = value;
689
690 while (IS_BLANK_CH(*tmp)) tmp++;
691
692 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
693 if (escaped != NULL) {
694 xmlBufferWriteQuotedString(buf->buffer, escaped);
695 xmlFree(escaped);
696 } else {
697 xmlBufferWriteQuotedString(buf->buffer, value);
698 }
699 } else {
700 xmlBufferWriteQuotedString(buf->buffer, value);
701 }
702 xmlFree(value);
703 } else {
704 xmlOutputBufferWriteString(buf, "=\"\"");
705 }
706 }
707}
708
709/**
710 * htmlAttrListDumpOutput:
711 * @buf: the HTML buffer output
712 * @doc: the document
713 * @cur: the first attribute pointer
714 * @encoding: the encoding string
715 *
716 * Dump a list of HTML attributes
717 */
718static void
719htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
720 if (cur == NULL) {
721 return;
722 }
723 while (cur != NULL) {
724 htmlAttrDumpOutput(buf, doc, cur, encoding);
725 cur = cur->next;
726 }
727}
728
729
730
731/**
732 * htmlNodeListDumpOutput:
733 * @buf: the HTML buffer output
734 * @doc: the document
735 * @cur: the first node
736 * @encoding: the encoding string
737 * @format: should formatting spaces been added
738 *
739 * Dump an HTML node list, recursive behaviour,children are printed too.
740 */
741static void
742htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
743 xmlNodePtr cur, const char *encoding, int format) {
744 if (cur == NULL) {
745 return;
746 }
747 while (cur != NULL) {
748 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
749 cur = cur->next;
750 }
751}
752
753/**
754 * htmlNodeDumpFormatOutput:
755 * @buf: the HTML buffer output
756 * @doc: the document
757 * @cur: the current node
758 * @encoding: the encoding string
759 * @format: should formatting spaces been added
760 *
761 * Dump an HTML node, recursive behaviour,children are printed too.
762 */
763void
764htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
765 xmlNodePtr cur, const char *encoding, int format) {
766 const htmlElemDesc * info;
767
768 xmlInitParser();
769
770 if ((cur == NULL) || (buf == NULL)) {
771 return;
772 }
773 /*
774 * Special cases.
775 */
776 if (cur->type == XML_DTD_NODE)
777 return;
778 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
779 (cur->type == XML_DOCUMENT_NODE)){
780 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
781 return;
782 }
783 if (cur->type == XML_ATTRIBUTE_NODE) {
784 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
785 return;
786 }
787 if (cur->type == HTML_TEXT_NODE) {
788 if (cur->content != NULL) {
789 if (((cur->name == (const xmlChar *)xmlStringText) ||
790 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
791 ((cur->parent == NULL) ||
792 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
793 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
794 xmlChar *buffer;
795
796 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
797 if (buffer != NULL) {
798 xmlOutputBufferWriteString(buf, (const char *)buffer);
799 xmlFree(buffer);
800 }
801 } else {
802 xmlOutputBufferWriteString(buf, (const char *)cur->content);
803 }
804 }
805 return;
806 }
807 if (cur->type == HTML_COMMENT_NODE) {
808 if (cur->content != NULL) {
809 xmlOutputBufferWriteString(buf, "<!--");
810 xmlOutputBufferWriteString(buf, (const char *)cur->content);
811 xmlOutputBufferWriteString(buf, "-->");
812 }
813 return;
814 }
815 if (cur->type == HTML_PI_NODE) {
816 if (cur->name == NULL)
817 return;
818 xmlOutputBufferWriteString(buf, "<?");
819 xmlOutputBufferWriteString(buf, (const char *)cur->name);
820 if (cur->content != NULL) {
821 xmlOutputBufferWriteString(buf, " ");
822 xmlOutputBufferWriteString(buf, (const char *)cur->content);
823 }
824 xmlOutputBufferWriteString(buf, ">");
825 return;
826 }
827 if (cur->type == HTML_ENTITY_REF_NODE) {
828 xmlOutputBufferWriteString(buf, "&");
829 xmlOutputBufferWriteString(buf, (const char *)cur->name);
830 xmlOutputBufferWriteString(buf, ";");
831 return;
832 }
833 if (cur->type == HTML_PRESERVE_NODE) {
834 if (cur->content != NULL) {
835 xmlOutputBufferWriteString(buf, (const char *)cur->content);
836 }
837 return;
838 }
839
840 /*
841 * Get specific HTML info for that node.
842 */
843 if (cur->ns == NULL)
844 info = htmlTagLookup(cur->name);
845 else
846 info = NULL;
847
848 xmlOutputBufferWriteString(buf, "<");
849 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
850 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
851 xmlOutputBufferWriteString(buf, ":");
852 }
853 xmlOutputBufferWriteString(buf, (const char *)cur->name);
854 if (cur->nsDef)
855 xmlNsListDumpOutput(buf, cur->nsDef);
856 if (cur->properties != NULL)
857 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
858
859 if ((info != NULL) && (info->empty)) {
860 xmlOutputBufferWriteString(buf, ">");
861 if ((format) && (!info->isinline) && (cur->next != NULL)) {
862 if ((cur->next->type != HTML_TEXT_NODE) &&
863 (cur->next->type != HTML_ENTITY_REF_NODE) &&
864 (cur->parent != NULL) &&
865 (cur->parent->name != NULL) &&
866 (cur->parent->name[0] != 'p')) /* p, pre, param */
867 xmlOutputBufferWriteString(buf, "\n");
868 }
869 return;
870 }
871 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
872 (cur->children == NULL)) {
873 if ((info != NULL) && (info->saveEndTag != 0) &&
874 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
875 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
876 xmlOutputBufferWriteString(buf, ">");
877 } else {
878 xmlOutputBufferWriteString(buf, "></");
879 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
880 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
881 xmlOutputBufferWriteString(buf, ":");
882 }
883 xmlOutputBufferWriteString(buf, (const char *)cur->name);
884 xmlOutputBufferWriteString(buf, ">");
885 }
886 if ((format) && (cur->next != NULL) &&
887 (info != NULL) && (!info->isinline)) {
888 if ((cur->next->type != HTML_TEXT_NODE) &&
889 (cur->next->type != HTML_ENTITY_REF_NODE) &&
890 (cur->parent != NULL) &&
891 (cur->parent->name != NULL) &&
892 (cur->parent->name[0] != 'p')) /* p, pre, param */
893 xmlOutputBufferWriteString(buf, "\n");
894 }
895 return;
896 }
897 xmlOutputBufferWriteString(buf, ">");
898 if ((cur->type != XML_ELEMENT_NODE) &&
899 (cur->content != NULL)) {
900 /*
901 * Uses the OutputBuffer property to automatically convert
902 * invalids to charrefs
903 */
904
905 xmlOutputBufferWriteString(buf, (const char *) cur->content);
906 }
907 if (cur->children != NULL) {
908 if ((format) && (info != NULL) && (!info->isinline) &&
909 (cur->children->type != HTML_TEXT_NODE) &&
910 (cur->children->type != HTML_ENTITY_REF_NODE) &&
911 (cur->children != cur->last) &&
912 (cur->name != NULL) &&
913 (cur->name[0] != 'p')) /* p, pre, param */
914 xmlOutputBufferWriteString(buf, "\n");
915 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
916 if ((format) && (info != NULL) && (!info->isinline) &&
917 (cur->last->type != HTML_TEXT_NODE) &&
918 (cur->last->type != HTML_ENTITY_REF_NODE) &&
919 (cur->children != cur->last) &&
920 (cur->name != NULL) &&
921 (cur->name[0] != 'p')) /* p, pre, param */
922 xmlOutputBufferWriteString(buf, "\n");
923 }
924 xmlOutputBufferWriteString(buf, "</");
925 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
926 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
927 xmlOutputBufferWriteString(buf, ":");
928 }
929 xmlOutputBufferWriteString(buf, (const char *)cur->name);
930 xmlOutputBufferWriteString(buf, ">");
931 if ((format) && (info != NULL) && (!info->isinline) &&
932 (cur->next != NULL)) {
933 if ((cur->next->type != HTML_TEXT_NODE) &&
934 (cur->next->type != HTML_ENTITY_REF_NODE) &&
935 (cur->parent != NULL) &&
936 (cur->parent->name != NULL) &&
937 (cur->parent->name[0] != 'p')) /* p, pre, param */
938 xmlOutputBufferWriteString(buf, "\n");
939 }
940}
941
942/**
943 * htmlNodeDumpOutput:
944 * @buf: the HTML buffer output
945 * @doc: the document
946 * @cur: the current node
947 * @encoding: the encoding string
948 *
949 * Dump an HTML node, recursive behaviour,children are printed too,
950 * and formatting returns/spaces are added.
951 */
952void
953htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
954 xmlNodePtr cur, const char *encoding) {
955 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
956}
957
958/**
959 * htmlDocContentDumpFormatOutput:
960 * @buf: the HTML buffer output
961 * @cur: the document
962 * @encoding: the encoding string
963 * @format: should formatting spaces been added
964 *
965 * Dump an HTML document.
966 */
967void
968htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
969 const char *encoding, int format) {
970 int type;
971
972 xmlInitParser();
973
974 if ((buf == NULL) || (cur == NULL))
975 return;
976
977 /*
978 * force to output the stuff as HTML, especially for entities
979 */
980 type = cur->type;
981 cur->type = XML_HTML_DOCUMENT_NODE;
982 if (cur->intSubset != NULL) {
983 htmlDtdDumpOutput(buf, cur, NULL);
984 }
985 if (cur->children != NULL) {
986 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
987 }
988 xmlOutputBufferWriteString(buf, "\n");
989 cur->type = (xmlElementType) type;
990}
991
992/**
993 * htmlDocContentDumpOutput:
994 * @buf: the HTML buffer output
995 * @cur: the document
996 * @encoding: the encoding string
997 *
998 * Dump an HTML document. Formating return/spaces are added.
999 */
1000void
1001htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1002 const char *encoding) {
1003 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1004}
1005
1006/************************************************************************
1007 * *
1008 * Saving functions front-ends *
1009 * *
1010 ************************************************************************/
1011
1012/**
1013 * htmlDocDump:
1014 * @f: the FILE*
1015 * @cur: the document
1016 *
1017 * Dump an HTML document to an open FILE.
1018 *
1019 * returns: the number of byte written or -1 in case of failure.
1020 */
1021int
1022htmlDocDump(FILE *f, xmlDocPtr cur) {
1023 xmlOutputBufferPtr buf;
1024 xmlCharEncodingHandlerPtr handler = NULL;
1025 const char *encoding;
1026 int ret;
1027
1028 xmlInitParser();
1029
1030 if ((cur == NULL) || (f == NULL)) {
1031 return(-1);
1032 }
1033
1034 encoding = (const char *) htmlGetMetaEncoding(cur);
1035
1036 if (encoding != NULL) {
1037 xmlCharEncoding enc;
1038
1039 enc = xmlParseCharEncoding(encoding);
1040 if (enc != cur->charset) {
1041 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1042 /*
1043 * Not supported yet
1044 */
1045 return(-1);
1046 }
1047
1048 handler = xmlFindCharEncodingHandler(encoding);
1049 if (handler == NULL)
1050 return(-1);
1051 } else {
1052 handler = xmlFindCharEncodingHandler(encoding);
1053 }
1054 }
1055
1056 /*
1057 * Fallback to HTML or ASCII when the encoding is unspecified
1058 */
1059 if (handler == NULL)
1060 handler = xmlFindCharEncodingHandler("HTML");
1061 if (handler == NULL)
1062 handler = xmlFindCharEncodingHandler("ascii");
1063
1064 buf = xmlOutputBufferCreateFile(f, handler);
1065 if (buf == NULL) return(-1);
1066 htmlDocContentDumpOutput(buf, cur, NULL);
1067
1068 ret = xmlOutputBufferClose(buf);
1069 return(ret);
1070}
1071
1072/**
1073 * htmlSaveFile:
1074 * @filename: the filename (or URL)
1075 * @cur: the document
1076 *
1077 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1078 * used.
1079 * returns: the number of byte written or -1 in case of failure.
1080 */
1081int
1082htmlSaveFile(const char *filename, xmlDocPtr cur) {
1083 xmlOutputBufferPtr buf;
1084 xmlCharEncodingHandlerPtr handler = NULL;
1085 const char *encoding;
1086 int ret;
1087
1088 if ((cur == NULL) || (filename == NULL))
1089 return(-1);
1090
1091 xmlInitParser();
1092
1093 encoding = (const char *) htmlGetMetaEncoding(cur);
1094
1095 if (encoding != NULL) {
1096 xmlCharEncoding enc;
1097
1098 enc = xmlParseCharEncoding(encoding);
1099 if (enc != cur->charset) {
1100 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1101 /*
1102 * Not supported yet
1103 */
1104 return(-1);
1105 }
1106
1107 handler = xmlFindCharEncodingHandler(encoding);
1108 if (handler == NULL)
1109 return(-1);
1110 }
1111 }
1112
1113 /*
1114 * Fallback to HTML or ASCII when the encoding is unspecified
1115 */
1116 if (handler == NULL)
1117 handler = xmlFindCharEncodingHandler("HTML");
1118 if (handler == NULL)
1119 handler = xmlFindCharEncodingHandler("ascii");
1120
1121 /*
1122 * save the content to a temp buffer.
1123 */
1124 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1125 if (buf == NULL) return(0);
1126
1127 htmlDocContentDumpOutput(buf, cur, NULL);
1128
1129 ret = xmlOutputBufferClose(buf);
1130 return(ret);
1131}
1132
1133/**
1134 * htmlSaveFileFormat:
1135 * @filename: the filename
1136 * @cur: the document
1137 * @format: should formatting spaces been added
1138 * @encoding: the document encoding
1139 *
1140 * Dump an HTML document to a file using a given encoding.
1141 *
1142 * returns: the number of byte written or -1 in case of failure.
1143 */
1144int
1145htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1146 const char *encoding, int format) {
1147 xmlOutputBufferPtr buf;
1148 xmlCharEncodingHandlerPtr handler = NULL;
1149 int ret;
1150
1151 if ((cur == NULL) || (filename == NULL))
1152 return(-1);
1153
1154 xmlInitParser();
1155
1156 if (encoding != NULL) {
1157 xmlCharEncoding enc;
1158
1159 enc = xmlParseCharEncoding(encoding);
1160 if (enc != cur->charset) {
1161 if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1162 /*
1163 * Not supported yet
1164 */
1165 return(-1);
1166 }
1167
1168 handler = xmlFindCharEncodingHandler(encoding);
1169 if (handler == NULL)
1170 return(-1);
1171 htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1172 }
1173 } else {
1174 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1175 }
1176
1177 /*
1178 * Fallback to HTML or ASCII when the encoding is unspecified
1179 */
1180 if (handler == NULL)
1181 handler = xmlFindCharEncodingHandler("HTML");
1182 if (handler == NULL)
1183 handler = xmlFindCharEncodingHandler("ascii");
1184
1185 /*
1186 * save the content to a temp buffer.
1187 */
1188 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1189 if (buf == NULL) return(0);
1190
1191 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1192
1193 ret = xmlOutputBufferClose(buf);
1194 return(ret);
1195}
1196
1197/**
1198 * htmlSaveFileEnc:
1199 * @filename: the filename
1200 * @cur: the document
1201 * @encoding: the document encoding
1202 *
1203 * Dump an HTML document to a file using a given encoding
1204 * and formatting returns/spaces are added.
1205 *
1206 * returns: the number of byte written or -1 in case of failure.
1207 */
1208int
1209htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1210 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1211}
1212
1213#endif /* LIBXML_OUTPUT_ENABLED */
1214
1215#define bottom_HTMLtree
1216#include "elfgcchack.h"
1217#endif /* LIBXML_HTML_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2023 Oracle
Contact · Privacy policy · Terms of Use