diff -r 000000000000 -r e35f40988205 xml/libxml2libs/src/libxml2/libxml2_htmltree.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xml/libxml2libs/src/libxml2/libxml2_htmltree.c Thu Dec 17 09:29:21 2009 +0200 @@ -0,0 +1,1222 @@ +/* + * libxml2_htmltree.c : implementation of access function for an HTML tree. + * + * See Copyright for the status of this software. + * + * daniel@veillard.com + * Portion Copyright © 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. + */ + +#define IN_LIBXML +#include "xmlenglibxml.h" + +#include /* for memset() only ! */ + +#ifdef HAVE_CTYPE_H +#include +#endif +#ifdef HAVE_STDLIB_H +#include +#endif + +#include +#include "libxml2_htmlparser.h" +#include "libxml2_htmltree.h" +#include +#include +#include +#include "libxml2_xmlerror2.h" +#include +#include +#include +#include + +#ifdef LIBXML_HTML_ENABLED + +/************************************************************************ + * * + * Getting/Setting encoding meta tags * + * * + ************************************************************************/ + +/** + * htmlGetMetaEncoding: + * @param doc the document + * + * Encoding definition lookup in the Meta tags + * + * Returns the current encoding as flagged in the HTML source + */ +const xmlChar * +htmlGetMetaEncoding(htmlDocPtr doc) { + htmlNodePtr cur; + const xmlChar *content; + const xmlChar *encoding; + + if (doc == NULL) + return(NULL); + cur = doc->children; + + /* + * Search the html + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrEqual(cur->name, BAD_CAST"html")) + break; + if (xmlStrEqual(cur->name, BAD_CAST"head")) + goto found_head; + if (xmlStrEqual(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(NULL); + cur = cur->children; + + /* + * Search the head + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrEqual(cur->name, BAD_CAST"head")) + break; + if (xmlStrEqual(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(NULL); +found_head: + cur = cur->children; + + /* + * Search the meta elements + */ +found_meta: + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrEqual(cur->name, BAD_CAST"meta")) { + xmlAttrPtr attr = cur->properties; + int http; + const xmlChar *value; + + content = NULL; + http = 0; + while (attr != NULL) { + if ((attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL)) { + value = attr->children->content; + if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) + && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) + http = 1; + else if ((value != NULL) + && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) + content = value; + if ((http != 0) && (content != NULL)) + goto found_content; + } + attr = attr->next; + } + } + } + cur = cur->next; + } + return(NULL); + +found_content: + encoding = xmlStrstr(content, BAD_CAST"charset="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"Charset="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"CHARSET="); + if (encoding != NULL) { + encoding += 8; + } else { + encoding = xmlStrstr(content, BAD_CAST"charset ="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"Charset ="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); + if (encoding != NULL) + encoding += 9; + } + if (encoding != NULL) { + while ((*encoding == ' ') || (*encoding == '\t')) encoding++; + } + return(encoding); +} + + +#endif /* LIBXML_HTML_ENABLED */ + +#if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) +/** + * htmlSetMetaEncoding: + * @param doc the document + * @param encoding the encoding string + * + * Sets the current encoding in the Meta tags + * NOTE: this will not change the document content encoding, just + * the META flag associated. + * + * Returns 0 in case of success and -1 in case of error + */ +XMLPUBFUNEXPORT int +htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { + htmlNodePtr cur, meta; + const xmlChar *content; + char newcontent[100]; + LOAD_GS_SAFE_DOC(doc) + + if (doc == NULL) + return(-1); + + if (encoding != NULL) { + snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", + encoding); + newcontent[sizeof(newcontent) - 1] = 0; + } + + cur = doc->children; + + /* + * Search the html + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) + break; + if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) + goto found_head; + if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(-1); + cur = cur->children; + + /* + * Search the head + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) + break; + if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(-1); +found_head: + if (cur->children == NULL) { + if (encoding == NULL) + return(0); + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddChild(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + if ( OOM_FLAG ) //oom set in xmlNewProp + return(-1); + return(0); + } + cur = cur->children; + +found_meta: + if (encoding != NULL) { + /* + * Create a new Meta element with the right attributes + */ + + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddPrevSibling(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + } + + /* + * Search and destroy all the remaining the meta elements carrying + * encoding informations + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { + xmlAttrPtr attr = cur->properties; + int http; + const xmlChar *value; + + content = NULL; + http = 0; + while (attr != NULL) { + if ((attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL)) { + value = attr->children->content; + if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) + && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) + http = 1; + else + { + if ((value != NULL) && + (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) + content = value; + } + if ((http != 0) && (content != NULL)) + break; + } + attr = attr->next; + } + if ((http != 0) && (content != NULL)) { + meta = cur; + cur = cur->next; + xmlUnlinkNode(meta); + xmlFreeNode(meta); + continue; + } + + } + } + cur = cur->next; + } + return(0); +} + +/** + * booleanHTMLAttrs: + * + * These are the HTML attributes which will be output + * in minimized form, i.e.