|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define IN_LIBXML |
|
|
#include "libxml.h" |
|
|
#ifdef LIBXML_HTML_ENABLED |
|
|
|
|
|
#include <string.h> |
|
|
#include <ctype.h> |
|
|
#include <stdlib.h> |
|
|
|
|
|
#include <libxml/xmlmemory.h> |
|
|
#include <libxml/HTMLparser.h> |
|
|
#include <libxml/HTMLtree.h> |
|
|
#include <libxml/entities.h> |
|
|
#include <libxml/xmlerror.h> |
|
|
#include <libxml/parserInternals.h> |
|
|
#include <libxml/uri.h> |
|
|
|
|
|
#include "private/buf.h" |
|
|
#include "private/html.h" |
|
|
#include "private/error.h" |
|
|
#include "private/html.h" |
|
|
#include "private/io.h" |
|
|
#include "private/save.h" |
|
|
#include "private/tree.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Result of locating the encoding declaration inside a <meta> element,
 * as filled in by htmlParseMetaEncoding().
 */
typedef struct {
    /* Attribute that carries the encoding ("charset" or "content"). */
    xmlAttrPtr attr;
    /* Text value of that attribute; an empty string if it has no
     * single text child. */
    const xmlChar *attrValue;
    /* Offsets into attrValue: start/end delimit the encoding name,
     * size is the total length of the value. */
    htmlMetaEncodingOffsets off;
} htmlMetaEncoding;
|
|
|
|
|
static htmlNodePtr |
|
|
htmlFindFirstChild(htmlNodePtr parent, const char *name) { |
|
|
htmlNodePtr child; |
|
|
|
|
|
for (child = parent->children; child != NULL; child = child->next) { |
|
|
if ((child->type == XML_ELEMENT_NODE) && |
|
|
(xmlStrcasecmp(child->name, BAD_CAST name) == 0)) |
|
|
return(child); |
|
|
} |
|
|
|
|
|
return(NULL); |
|
|
} |
|
|
|
|
|
static htmlNodePtr |
|
|
htmlFindHead(htmlDocPtr doc) { |
|
|
htmlNodePtr html; |
|
|
|
|
|
if (doc == NULL) |
|
|
return(NULL); |
|
|
|
|
|
html = htmlFindFirstChild((htmlNodePtr) doc, "html"); |
|
|
if (html == NULL) |
|
|
return(NULL); |
|
|
|
|
|
return(htmlFindFirstChild(html, "head")); |
|
|
} |
|
|
|
|
|
int |
|
|
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) { |
|
|
const xmlChar *p = val; |
|
|
|
|
|
while (1) { |
|
|
size_t start, end; |
|
|
|
|
|
while ((*p != 'c') && (*p != 'C')) { |
|
|
if (*p == 0) |
|
|
return(0); |
|
|
p += 1; |
|
|
} |
|
|
p += 1; |
|
|
|
|
|
if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0) |
|
|
continue; |
|
|
|
|
|
p += 6; |
|
|
while (IS_WS_HTML(*p)) p += 1; |
|
|
|
|
|
if (*p != '=') |
|
|
continue; |
|
|
|
|
|
p += 1; |
|
|
while (IS_WS_HTML(*p)) p += 1; |
|
|
|
|
|
if (*p == 0) |
|
|
return(0); |
|
|
|
|
|
if ((*p == '"') || (*p == '\'')) { |
|
|
int quote = *p; |
|
|
|
|
|
p += 1; |
|
|
while (IS_WS_HTML(*p)) p += 1; |
|
|
|
|
|
start = p - val; |
|
|
end = start; |
|
|
|
|
|
while (*p != quote) { |
|
|
if (*p == 0) |
|
|
return(0); |
|
|
if (!IS_WS_HTML(*p)) |
|
|
end = p + 1 - val; |
|
|
p += 1; |
|
|
} |
|
|
} else { |
|
|
start = p - val; |
|
|
|
|
|
while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p))) |
|
|
p += 1; |
|
|
|
|
|
end = p - val; |
|
|
} |
|
|
|
|
|
off->start = start; |
|
|
off->end = end; |
|
|
off->size = p - val + strlen((char *) p); |
|
|
|
|
|
return(1); |
|
|
} |
|
|
|
|
|
return(0); |
|
|
} |
|
|
|
|
|
static xmlAttrPtr |
|
|
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) { |
|
|
xmlAttrPtr attr, contentAttr = NULL; |
|
|
int isContentType = 0; |
|
|
|
|
|
if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0) |
|
|
return(NULL); |
|
|
|
|
|
for (attr = elem->properties; attr != NULL; attr = attr->next) { |
|
|
if (attr->ns != NULL) |
|
|
continue; |
|
|
if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) { |
|
|
*outIsContentType = 0; |
|
|
return(attr); |
|
|
} |
|
|
if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0) |
|
|
contentAttr = attr; |
|
|
if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) && |
|
|
(attr->children != NULL) && |
|
|
(attr->children->type == XML_TEXT_NODE) && |
|
|
(attr->children->next == NULL) && |
|
|
(xmlStrcasecmp(attr->children->content, |
|
|
BAD_CAST "Content-Type") == 0)) |
|
|
isContentType = 1; |
|
|
} |
|
|
|
|
|
if ((isContentType) && (contentAttr != NULL)) { |
|
|
*outIsContentType = 1; |
|
|
return(contentAttr); |
|
|
} |
|
|
|
|
|
return(NULL); |
|
|
} |
|
|
|
|
|
static int |
|
|
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) { |
|
|
xmlAttrPtr attr; |
|
|
const xmlChar *val = NULL; |
|
|
int isContentType; |
|
|
|
|
|
if ((elem->type != XML_ELEMENT_NODE) || |
|
|
(xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)) |
|
|
return(0); |
|
|
|
|
|
attr = htmlFindMetaEncodingAttr(elem, &isContentType); |
|
|
if (attr == NULL) |
|
|
return(0); |
|
|
|
|
|
if ((attr->children != NULL) && |
|
|
(attr->children->type == XML_TEXT_NODE) && |
|
|
(attr->children->next == NULL) && |
|
|
(attr->children->content != NULL)) |
|
|
val = attr->children->content; |
|
|
else |
|
|
val = BAD_CAST ""; |
|
|
|
|
|
|
|
|
if (!isContentType) { |
|
|
size_t size = strlen((char *) val); |
|
|
size_t start = 0; |
|
|
size_t end = size; |
|
|
|
|
|
while ((start < size) && (IS_WS_HTML(val[start]))) |
|
|
start += 1; |
|
|
|
|
|
while ((end > 0) && (IS_WS_HTML(val[end-1]))) |
|
|
end -= 1; |
|
|
|
|
|
menc->attr = attr; |
|
|
menc->attrValue = val; |
|
|
menc->off.start = start; |
|
|
menc->off.end = end; |
|
|
menc->off.size = size; |
|
|
|
|
|
return(1); |
|
|
} else { |
|
|
if (htmlParseContentType(val, &menc->off)) { |
|
|
menc->attr = attr; |
|
|
menc->attrValue = val; |
|
|
|
|
|
return(1); |
|
|
} |
|
|
} |
|
|
|
|
|
return(0); |
|
|
} |
|
|
|
|
|
static xmlChar * |
|
|
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) { |
|
|
xmlChar *newVal, *p; |
|
|
size_t size, oldEncSize, newEncSize; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0) |
|
|
encoding = "ASCII"; |
|
|
|
|
|
oldEncSize = menc->off.end - menc->off.start; |
|
|
newEncSize = strlen((char *) encoding); |
|
|
size = menc->off.size - oldEncSize + newEncSize; |
|
|
newVal = xmlMalloc(size + 1); |
|
|
if (newVal == NULL) |
|
|
return(NULL); |
|
|
|
|
|
p = newVal; |
|
|
memcpy(p, menc->attrValue, menc->off.start); |
|
|
p += menc->off.start; |
|
|
memcpy(p, encoding, newEncSize); |
|
|
p += newEncSize; |
|
|
memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end); |
|
|
newVal[size] = 0; |
|
|
|
|
|
return(newVal); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const xmlChar * |
|
|
htmlGetMetaEncoding(xmlDoc *doc) { |
|
|
htmlNodePtr head, node; |
|
|
|
|
|
head = htmlFindHead(doc); |
|
|
if (head == NULL) |
|
|
return(NULL); |
|
|
|
|
|
for (node = head->children; node != NULL; node = node->next) { |
|
|
htmlMetaEncoding menc; |
|
|
|
|
|
if (htmlParseMetaEncoding(node, &menc)) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return(menc.attrValue + menc.off.start); |
|
|
} |
|
|
} |
|
|
|
|
|
return(NULL); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) { |
|
|
htmlNodePtr head, meta; |
|
|
int found = 0; |
|
|
|
|
|
if (encoding == NULL) |
|
|
return(1); |
|
|
|
|
|
head = htmlFindHead(doc); |
|
|
if (head == NULL) |
|
|
return(1); |
|
|
|
|
|
for (meta = head->children; meta != NULL; meta = meta->next) { |
|
|
htmlMetaEncoding menc; |
|
|
|
|
|
if (htmlParseMetaEncoding(meta, &menc)) { |
|
|
xmlChar *newVal; |
|
|
int ret; |
|
|
|
|
|
found = 1; |
|
|
|
|
|
newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding); |
|
|
if (newVal == NULL) |
|
|
return(-1); |
|
|
xmlNodeSetContent((xmlNodePtr) menc.attr, NULL); |
|
|
ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal); |
|
|
xmlFree(newVal); |
|
|
|
|
|
if (ret < 0) |
|
|
return(-1); |
|
|
} |
|
|
} |
|
|
|
|
|
if (found) |
|
|
return(0); |
|
|
|
|
|
meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL); |
|
|
if (meta == NULL) |
|
|
return(-1); |
|
|
|
|
|
if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) { |
|
|
xmlFreeNode(meta); |
|
|
return(-1); |
|
|
} |
|
|
|
|
|
if (head->children == NULL) |
|
|
xmlAddChild(head, meta); |
|
|
else |
|
|
xmlAddPrevSibling(head->children, meta); |
|
|
|
|
|
return(0); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlIsBooleanAttr(const xmlChar *name) |
|
|
{ |
|
|
const char *str = NULL; |
|
|
|
|
|
if (name == NULL) |
|
|
return(0); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
switch (name[0] | 0x20) { |
|
|
case 'c': |
|
|
name += 1; |
|
|
switch (name[0] | 0x20) { |
|
|
case 'h': str = "ecked"; break; |
|
|
case 'o': str = "mpact"; break; |
|
|
} |
|
|
break; |
|
|
case 'd': |
|
|
name += 1; |
|
|
switch (name[0] | 0x20) { |
|
|
case 'e': |
|
|
name += 1; |
|
|
switch (name[0] | 0x20) { |
|
|
case 'c': str = "lare"; break; |
|
|
case 'f': str = "er"; break; |
|
|
} |
|
|
break; |
|
|
case 'i': str = "sabled"; break; |
|
|
} |
|
|
break; |
|
|
case 'i': |
|
|
str = "smap"; |
|
|
break; |
|
|
case 'm': |
|
|
str = "ultiple"; |
|
|
break; |
|
|
case 'n': |
|
|
name += 1; |
|
|
if ((name[0] | 0x20) != 'o') |
|
|
break; |
|
|
name += 1; |
|
|
switch (name[0] | 0x20) { |
|
|
case 'h': str = "ref"; break; |
|
|
case 'r': str = "esize"; break; |
|
|
case 's': str = "hade"; break; |
|
|
case 'w': str = "rap"; break; |
|
|
} |
|
|
break; |
|
|
case 'r': |
|
|
str = "eadonly"; |
|
|
break; |
|
|
case 's': |
|
|
str = "elected"; |
|
|
break; |
|
|
} |
|
|
|
|
|
if (str == NULL) |
|
|
return(0); |
|
|
|
|
|
return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0); |
|
|
} |
|
|
|
|
|
#ifdef LIBXML_OUTPUT_ENABLED |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static xmlParserErrors |
|
|
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) { |
|
|
|
|
|
|
|
|
|
|
|
if (encoding == NULL) |
|
|
encoding = "HTML"; |
|
|
|
|
|
return(xmlOpenCharEncodingHandler(encoding, 1, out)); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static size_t |
|
|
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED, |
|
|
xmlNodePtr cur, int format) { |
|
|
size_t use; |
|
|
size_t ret; |
|
|
xmlOutputBufferPtr outbuf; |
|
|
|
|
|
if (cur == NULL) { |
|
|
return ((size_t) -1); |
|
|
} |
|
|
if (buf == NULL) { |
|
|
return ((size_t) -1); |
|
|
} |
|
|
outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); |
|
|
if (outbuf == NULL) |
|
|
return ((size_t) -1); |
|
|
memset(outbuf, 0, sizeof(xmlOutputBuffer)); |
|
|
outbuf->buffer = buf; |
|
|
outbuf->encoder = NULL; |
|
|
outbuf->writecallback = NULL; |
|
|
outbuf->closecallback = NULL; |
|
|
outbuf->context = NULL; |
|
|
outbuf->written = 0; |
|
|
|
|
|
use = xmlBufUse(buf); |
|
|
htmlNodeDumpInternal(outbuf, cur, NULL, format); |
|
|
if (outbuf->error) |
|
|
ret = (size_t) -1; |
|
|
else |
|
|
ret = xmlBufUse(buf) - use; |
|
|
xmlFree(outbuf); |
|
|
return (ret); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) { |
|
|
xmlBufPtr buffer; |
|
|
size_t ret1; |
|
|
int ret2; |
|
|
|
|
|
if ((buf == NULL) || (cur == NULL)) |
|
|
return(-1); |
|
|
|
|
|
xmlInitParser(); |
|
|
buffer = xmlBufFromBuffer(buf); |
|
|
if (buffer == NULL) |
|
|
return(-1); |
|
|
|
|
|
ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1); |
|
|
|
|
|
ret2 = xmlBufBackToBuffer(buffer, buf); |
|
|
|
|
|
if ((ret1 == (size_t) -1) || (ret2 < 0)) |
|
|
return(-1); |
|
|
return(ret1 > INT_MAX ? INT_MAX : ret1); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED, |
|
|
xmlNode *cur, const char *encoding, int format) { |
|
|
xmlOutputBufferPtr buf; |
|
|
xmlCharEncodingHandlerPtr handler; |
|
|
int ret; |
|
|
|
|
|
xmlInitParser(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) |
|
|
return(-1); |
|
|
buf = xmlOutputBufferCreateFile(out, handler); |
|
|
if (buf == NULL) { |
|
|
xmlCharEncCloseFunc(handler); |
|
|
return(-1); |
|
|
} |
|
|
|
|
|
htmlNodeDumpInternal(buf, cur, NULL, format); |
|
|
|
|
|
ret = xmlOutputBufferClose(buf); |
|
|
return(ret); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) { |
|
|
htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) { |
|
|
xmlOutputBufferPtr buf; |
|
|
xmlCharEncodingHandlerPtr handler = NULL; |
|
|
|
|
|
xmlInitParser(); |
|
|
|
|
|
if ((mem == NULL) || (size == NULL)) |
|
|
return; |
|
|
*mem = NULL; |
|
|
*size = 0; |
|
|
if (cur == NULL) |
|
|
return; |
|
|
|
|
|
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) |
|
|
return; |
|
|
buf = xmlAllocOutputBuffer(handler); |
|
|
if (buf == NULL) { |
|
|
xmlCharEncCloseFunc(handler); |
|
|
return; |
|
|
} |
|
|
|
|
|
htmlDocContentDumpFormatOutput(buf, cur, NULL, format); |
|
|
|
|
|
xmlOutputBufferFlush(buf); |
|
|
|
|
|
if (!buf->error) { |
|
|
if (buf->conv != NULL) { |
|
|
*size = xmlBufUse(buf->conv); |
|
|
*mem = xmlStrndup(xmlBufContent(buf->conv), *size); |
|
|
} else { |
|
|
*size = xmlBufUse(buf->buffer); |
|
|
*mem = xmlStrndup(xmlBufContent(buf->buffer), *size); |
|
|
} |
|
|
} |
|
|
|
|
|
xmlOutputBufferClose(buf); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) { |
|
|
htmlDocDumpMemoryFormat(cur, mem, size, 1); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void |
|
|
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
|
|
const char *encoding ATTRIBUTE_UNUSED) { |
|
|
xmlDtdPtr cur = doc->intSubset; |
|
|
|
|
|
if (cur == NULL) |
|
|
return; |
|
|
xmlOutputBufferWrite(buf, 10, "<!DOCTYPE "); |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
if (cur->ExternalID != NULL) { |
|
|
xmlOutputBufferWrite(buf, 8, " PUBLIC "); |
|
|
xmlOutputBufferWriteQuotedString(buf, cur->ExternalID); |
|
|
if (cur->SystemID != NULL) { |
|
|
xmlOutputBufferWrite(buf, 1, " "); |
|
|
xmlOutputBufferWriteQuotedString(buf, cur->SystemID); |
|
|
} |
|
|
} else if (cur->SystemID != NULL && |
|
|
xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { |
|
|
xmlOutputBufferWrite(buf, 8, " SYSTEM "); |
|
|
xmlOutputBufferWriteQuotedString(buf, cur->SystemID); |
|
|
} |
|
|
xmlOutputBufferWrite(buf, 2, ">\n"); |
|
|
} |
|
|
|
|
|
static void |
|
|
htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) { |
|
|
const xmlChar *tmp = content; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while (IS_WS_HTML(*tmp)) tmp++; |
|
|
if (tmp > content) { |
|
|
xmlOutputBufferWrite(buf, tmp - content, (char *) content); |
|
|
content = tmp; |
|
|
} |
|
|
|
|
|
while (1) { |
|
|
char escbuf[3]; |
|
|
const char *repl; |
|
|
int replSize; |
|
|
int c = *tmp; |
|
|
|
|
|
while ((c > 0x20) && (c < 0x7F) && (c != '"') && (c != '&')) { |
|
|
tmp += 1; |
|
|
c = *tmp; |
|
|
} |
|
|
|
|
|
if (tmp > content) |
|
|
xmlOutputBufferWrite(buf, tmp - content, (char *) content); |
|
|
|
|
|
if ((c <= 0x20) || (c >= 0x7F)) { |
|
|
static const char hex[16] = { |
|
|
'0', '1', '2', '3', '4', '5', '6', '7', |
|
|
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' |
|
|
}; |
|
|
|
|
|
if (c == 0) |
|
|
break; |
|
|
|
|
|
escbuf[0] = '%'; |
|
|
escbuf[1] = hex[(c >> 4) & 0x0F]; |
|
|
escbuf[2] = hex[c & 0x0F]; |
|
|
repl = escbuf; |
|
|
replSize = 3; |
|
|
} else if (c == '"') { |
|
|
repl = """; |
|
|
replSize = 6; |
|
|
} else { |
|
|
repl = "&"; |
|
|
replSize = 5; |
|
|
} |
|
|
|
|
|
xmlOutputBufferWrite(buf, replSize, repl); |
|
|
tmp += 1; |
|
|
content = tmp; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void |
|
|
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) { |
|
|
xmlOutputBufferWrite(buf, 1, " "); |
|
|
|
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
|
|
xmlOutputBufferWrite(buf, 1, ":"); |
|
|
} |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { |
|
|
xmlNodePtr child; |
|
|
int isUri; |
|
|
|
|
|
xmlOutputBufferWrite(buf, 2, "=\""); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
isUri = (cur->ns == NULL) && (cur->parent != NULL) && |
|
|
(cur->parent->ns == NULL) && |
|
|
((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || |
|
|
(!xmlStrcasecmp(cur->name, BAD_CAST "action")) || |
|
|
(!xmlStrcasecmp(cur->name, BAD_CAST "src")) || |
|
|
((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && |
|
|
(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a")))); |
|
|
|
|
|
for (child = cur->children; child != NULL; child = child->next) { |
|
|
if (child->type == XML_TEXT_NODE) { |
|
|
const xmlChar *content = child->content; |
|
|
|
|
|
if (content == NULL) |
|
|
continue; |
|
|
|
|
|
if (isUri) { |
|
|
htmlSerializeUri(buf, content); |
|
|
} else { |
|
|
xmlSerializeText(buf, content, SIZE_MAX, |
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
|
|
} |
|
|
} else if (child->type == XML_ENTITY_REF_NODE) { |
|
|
|
|
|
xmlOutputBufferWrite(buf, 1, "&"); |
|
|
xmlOutputBufferWriteString(buf, (char *) child->name); |
|
|
xmlOutputBufferWrite(buf, 1, ";"); |
|
|
} |
|
|
} |
|
|
|
|
|
xmlOutputBufferWrite(buf, 1, "\""); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur, |
|
|
const char *encoding, int format) { |
|
|
xmlNodePtr root, parent, metaHead = NULL; |
|
|
xmlAttrPtr attr; |
|
|
const htmlElemDesc * info; |
|
|
int isRaw = 0; |
|
|
|
|
|
xmlInitParser(); |
|
|
|
|
|
if ((cur == NULL) || (buf == NULL)) { |
|
|
return; |
|
|
} |
|
|
|
|
|
root = cur; |
|
|
parent = cur->parent; |
|
|
while (1) { |
|
|
switch (cur->type) { |
|
|
case XML_HTML_DOCUMENT_NODE: |
|
|
case XML_DOCUMENT_NODE: |
|
|
if (((xmlDocPtr) cur)->intSubset != NULL) { |
|
|
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); |
|
|
} |
|
|
if (cur->children != NULL) { |
|
|
|
|
|
if (cur->parent == parent) { |
|
|
parent = cur; |
|
|
cur = cur->children; |
|
|
continue; |
|
|
} |
|
|
} else { |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
} |
|
|
break; |
|
|
|
|
|
case XML_ELEMENT_NODE: { |
|
|
htmlMetaEncoding menc; |
|
|
int isMeta = 0; |
|
|
int addMeta = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ((cur->parent != parent) && (cur->children != NULL)) { |
|
|
htmlNodeDumpInternal(buf, cur, encoding, format); |
|
|
break; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (cur->ns == NULL) |
|
|
info = htmlTagLookup(cur->name); |
|
|
else |
|
|
info = NULL; |
|
|
|
|
|
if (encoding != NULL) { |
|
|
isMeta = htmlParseMetaEncoding(cur, &menc); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ((xmlStrcasecmp(BAD_CAST encoding, |
|
|
BAD_CAST "HTML") != 0) && |
|
|
(xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) && |
|
|
(parent != NULL) && |
|
|
(xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) && |
|
|
(parent->parent != NULL) && |
|
|
(parent->parent->parent == NULL) && |
|
|
(metaHead == NULL)) { |
|
|
xmlNodePtr n; |
|
|
|
|
|
metaHead = cur; |
|
|
addMeta = 1; |
|
|
|
|
|
for (n = cur->children; n != NULL; n = n->next) { |
|
|
int unused; |
|
|
|
|
|
if (htmlFindMetaEncodingAttr(n, &unused) != NULL) { |
|
|
metaHead = NULL; |
|
|
addMeta = 0; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
xmlOutputBufferWrite(buf, 1, "<"); |
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
|
|
xmlOutputBufferWrite(buf, 1, ":"); |
|
|
} |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
if (cur->nsDef) |
|
|
xmlNsListDumpOutput(buf, cur->nsDef); |
|
|
attr = cur->properties; |
|
|
while (attr != NULL) { |
|
|
if ((!isMeta) || (attr != menc.attr)) { |
|
|
htmlAttrDumpOutput(buf, attr); |
|
|
} else { |
|
|
xmlOutputBufferWrite(buf, 1, " "); |
|
|
xmlOutputBufferWriteString(buf, (char *) attr->name); |
|
|
|
|
|
xmlOutputBufferWrite(buf, 2, "=\""); |
|
|
xmlSerializeText(buf, menc.attrValue, menc.off.start, |
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
|
|
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
|
|
xmlSerializeText(buf, menc.attrValue + menc.off.end, |
|
|
menc.off.size - menc.off.end, |
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
|
|
xmlOutputBufferWrite(buf, 1, "\""); |
|
|
} |
|
|
attr = attr->next; |
|
|
} |
|
|
|
|
|
if ((info != NULL) && (info->empty)) { |
|
|
xmlOutputBufferWrite(buf, 1, ">"); |
|
|
} else if (cur->children == NULL) { |
|
|
if (addMeta) { |
|
|
xmlOutputBufferWrite(buf, 16, "><meta charset=\""); |
|
|
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
|
|
xmlOutputBufferWrite(buf, 4, "\"></"); |
|
|
} else { |
|
|
xmlOutputBufferWrite(buf, 3, "></"); |
|
|
} |
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
|
|
xmlOutputBufferWriteString(buf, |
|
|
(const char *)cur->ns->prefix); |
|
|
xmlOutputBufferWrite(buf, 1, ":"); |
|
|
} |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
xmlOutputBufferWrite(buf, 1, ">"); |
|
|
} else { |
|
|
xmlOutputBufferWrite(buf, 1, ">"); |
|
|
if ((format) && |
|
|
((addMeta) || |
|
|
((info != NULL) && (!info->isinline) && |
|
|
(cur->children->type != HTML_TEXT_NODE) && |
|
|
(cur->children->type != HTML_ENTITY_REF_NODE) && |
|
|
(cur->children != cur->last) && |
|
|
(cur->name != NULL) && |
|
|
(cur->name[0] != 'p')))) |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
if (addMeta) { |
|
|
xmlOutputBufferWrite(buf, 15, "<meta charset=\""); |
|
|
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
|
|
xmlOutputBufferWrite(buf, 2, "\">"); |
|
|
if ((format) && |
|
|
(cur->children->type != HTML_TEXT_NODE) && |
|
|
(cur->children->type != HTML_ENTITY_REF_NODE)) |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
} |
|
|
|
|
|
if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT)) |
|
|
isRaw = 1; |
|
|
|
|
|
parent = cur; |
|
|
cur = cur->children; |
|
|
continue; |
|
|
} |
|
|
|
|
|
if ((format) && (cur->next != NULL) && |
|
|
(info != NULL) && (!info->isinline)) { |
|
|
if ((cur->next->type != HTML_TEXT_NODE) && |
|
|
(cur->next->type != HTML_ENTITY_REF_NODE) && |
|
|
(parent != NULL) && |
|
|
(parent->name != NULL) && |
|
|
(parent->name[0] != 'p')) |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
} |
|
|
|
|
|
break; |
|
|
} |
|
|
|
|
|
case XML_ATTRIBUTE_NODE: |
|
|
htmlAttrDumpOutput(buf, (xmlAttrPtr) cur); |
|
|
break; |
|
|
|
|
|
case HTML_TEXT_NODE: |
|
|
if (cur->content == NULL) |
|
|
break; |
|
|
if ((cur->name == (const xmlChar *)xmlStringTextNoenc) || |
|
|
(isRaw)) { |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->content); |
|
|
} else { |
|
|
xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML); |
|
|
} |
|
|
break; |
|
|
|
|
|
case HTML_COMMENT_NODE: |
|
|
if (cur->content != NULL) { |
|
|
xmlOutputBufferWrite(buf, 4, "<!--"); |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->content); |
|
|
xmlOutputBufferWrite(buf, 3, "-->"); |
|
|
} |
|
|
break; |
|
|
|
|
|
case HTML_PI_NODE: |
|
|
if (cur->name != NULL) { |
|
|
xmlOutputBufferWrite(buf, 2, "<?"); |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
if (cur->content != NULL) { |
|
|
xmlOutputBufferWrite(buf, 1, " "); |
|
|
xmlOutputBufferWriteString(buf, |
|
|
(const char *)cur->content); |
|
|
} |
|
|
xmlOutputBufferWrite(buf, 1, ">"); |
|
|
} |
|
|
break; |
|
|
|
|
|
case HTML_ENTITY_REF_NODE: |
|
|
xmlOutputBufferWrite(buf, 1, "&"); |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
xmlOutputBufferWrite(buf, 1, ";"); |
|
|
break; |
|
|
|
|
|
case HTML_PRESERVE_NODE: |
|
|
if (cur->content != NULL) { |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->content); |
|
|
} |
|
|
break; |
|
|
|
|
|
default: |
|
|
break; |
|
|
} |
|
|
|
|
|
while (1) { |
|
|
if (cur == root) |
|
|
return; |
|
|
if (cur->next != NULL) { |
|
|
cur = cur->next; |
|
|
break; |
|
|
} |
|
|
|
|
|
isRaw = 0; |
|
|
|
|
|
cur = parent; |
|
|
|
|
|
parent = cur->parent; |
|
|
|
|
|
if ((cur->type == XML_HTML_DOCUMENT_NODE) || |
|
|
(cur->type == XML_DOCUMENT_NODE)) { |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
} else { |
|
|
if ((format) && (cur->ns == NULL)) |
|
|
info = htmlTagLookup(cur->name); |
|
|
else |
|
|
info = NULL; |
|
|
|
|
|
if ((format) && (info != NULL) && (!info->isinline) && |
|
|
(cur->last->type != HTML_TEXT_NODE) && |
|
|
(cur->last->type != HTML_ENTITY_REF_NODE) && |
|
|
((cur->children != cur->last) || (cur == metaHead)) && |
|
|
(cur->name != NULL) && |
|
|
(cur->name[0] != 'p')) |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
|
|
|
xmlOutputBufferWrite(buf, 2, "</"); |
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
|
|
xmlOutputBufferWrite(buf, 1, ":"); |
|
|
} |
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name); |
|
|
xmlOutputBufferWrite(buf, 1, ">"); |
|
|
|
|
|
if ((format) && (info != NULL) && (!info->isinline) && |
|
|
(cur->next != NULL)) { |
|
|
if ((cur->next->type != HTML_TEXT_NODE) && |
|
|
(cur->next->type != HTML_ENTITY_REF_NODE) && |
|
|
(parent != NULL) && |
|
|
(parent->name != NULL) && |
|
|
(parent->name[0] != 'p')) |
|
|
xmlOutputBufferWrite(buf, 1, "\n"); |
|
|
} |
|
|
|
|
|
if (cur == metaHead) |
|
|
metaHead = NULL; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlNodeDumpFormatOutput(xmlOutputBuffer *buf, |
|
|
xmlDoc *doc ATTRIBUTE_UNUSED, xmlNode *cur, |
|
|
const char *encoding ATTRIBUTE_UNUSED, int format) { |
|
|
htmlNodeDumpInternal(buf, cur, NULL, format); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED, |
|
|
xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED) { |
|
|
htmlNodeDumpInternal(buf, cur, NULL, 1); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur, |
|
|
const char *encoding ATTRIBUTE_UNUSED, |
|
|
int format) { |
|
|
htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
|
htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur, |
|
|
const char *encoding ATTRIBUTE_UNUSED) { |
|
|
htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlDocDump(FILE *f, xmlDoc *cur) { |
|
|
xmlOutputBufferPtr buf; |
|
|
xmlCharEncodingHandlerPtr handler = NULL; |
|
|
int ret; |
|
|
|
|
|
xmlInitParser(); |
|
|
|
|
|
if ((cur == NULL) || (f == NULL)) { |
|
|
return(-1); |
|
|
} |
|
|
|
|
|
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) |
|
|
return(-1); |
|
|
buf = xmlOutputBufferCreateFile(f, handler); |
|
|
if (buf == NULL) { |
|
|
xmlCharEncCloseFunc(handler); |
|
|
return(-1); |
|
|
} |
|
|
htmlDocContentDumpOutput(buf, cur, NULL); |
|
|
|
|
|
ret = xmlOutputBufferClose(buf); |
|
|
return(ret); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlSaveFile(const char *filename, xmlDoc *cur) { |
|
|
return(htmlSaveFileFormat(filename, cur, NULL, 1)); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlSaveFileFormat(const char *filename, xmlDoc *cur, |
|
|
const char *encoding, int format) { |
|
|
xmlOutputBufferPtr buf; |
|
|
xmlCharEncodingHandlerPtr handler = NULL; |
|
|
int ret; |
|
|
|
|
|
if ((cur == NULL) || (filename == NULL)) |
|
|
return(-1); |
|
|
|
|
|
xmlInitParser(); |
|
|
|
|
|
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) |
|
|
return(-1); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); |
|
|
if (buf == NULL) { |
|
|
xmlCharEncCloseFunc(handler); |
|
|
return(0); |
|
|
} |
|
|
|
|
|
htmlDocContentDumpFormatOutput(buf, cur, encoding, format); |
|
|
|
|
|
ret = xmlOutputBufferClose(buf); |
|
|
return(ret); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int |
|
|
htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) { |
|
|
return(htmlSaveFileFormat(filename, cur, encoding, 1)); |
|
|
} |
|
|
|
|
|
#endif |
|
|
|
|
|
#endif |
|
|
|