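/*
 * libxml/tests/tests_HTMLparser_htmlParseStartTag.c
 *
 * Unit tests for htmlParseStartTag, exercised through the
 * test_htmlParseStartTag wrapper.
 *
 * Provenance: uploaded by AryaWu using huggingface_hub (revision 6baed57,
 * verified).
 */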
#include "unity/unity.h"
#include <libxml/HTMLparser.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*
 * The function under test is static, so it is reached through this wrapper.
 * The wrapper is provided by the module itself and is only declared here;
 * it takes the parser context whose input holds the start tag to parse.
 */
extern void test_htmlParseStartTag(htmlParserCtxtPtr ctxt);
/*
 * One recorded SAX startElement event. All strings are private copies owned
 * by the event.
 */
typedef struct {
    /* Element name as reported to the callback. */
    xmlChar *name;
    /* How many attribute name/value pairs are stored in atts. */
    int att_count;
    /* Flattened list: name, value, name, value, ..., ended by a NULL entry. */
    xmlChar **atts;
} StartEvent;
/*
 * Fixed-capacity log of startElement events. A pointer to this structure is
 * installed as the parser's SAX user data.
 */
typedef struct {
    /* Number of leading entries of events[] that are in use. */
    int nevents;
    /* Storage for the recorded events (at most 64). */
    StartEvent events[64];
} SAXCapture;
/*
 * Put a capture into its empty state: zero events recorded and every
 * pointer in the event table cleared.
 */
static void capture_init(SAXCapture *cap)
{
    memset(cap, 0, sizeof *cap);
}
/*
 * Release everything owned by the recorded events (element names, attribute
 * strings and the attribute arrays themselves), then zero the capture so it
 * is empty again.
 */
static void capture_free(SAXCapture *cap)
{
    StartEvent *ev = cap->events;
    int remaining = cap->nevents;

    while (remaining-- > 0) {
        if (ev->name != NULL) {
            xmlFree(ev->name);
        }
        if (ev->atts != NULL) {
            /* The strings were duplicated by libxml2; the array was not. */
            for (xmlChar **slot = ev->atts; *slot != NULL; slot++) {
                xmlFree(*slot);
            }
            free(ev->atts);
        }
        ev++;
    }

    memset(cap, 0, sizeof *cap);
}
static void test_sax_startElement(void *ctx, const xmlChar *name, const xmlChar **atts) {
SAXCapture *cap = (SAXCapture *)ctx;
if (cap->nevents >= (int)(sizeof(cap->events)/sizeof(cap->events[0])))
return;
StartEvent *ev = &cap->events[cap->nevents++];
ev->name = xmlStrdup(name);
int count = 0;
if (atts != NULL) {
const xmlChar **p = atts;
while (*p != NULL) {
/* name */
p++;
/* value */
if (*p == NULL) break;
p++;
count++;
}
}
ev->att_count = count;
if (atts != NULL && count > 0) {
/* allocate space for 2*count + 1 NULL terminator */
ev->atts = (xmlChar **)calloc((size_t)(2 * count + 1), sizeof(xmlChar *));
int idx = 0;
for (int i = 0; i < count; i++) {
const xmlChar *aname = atts[2*i];
const xmlChar *aval = atts[2*i + 1];
ev->atts[idx++] = xmlStrdup(aname);
ev->atts[idx++] = (aval != NULL) ? xmlStrdup(aval) : NULL;
}
ev->atts[idx] = NULL;
} else {
ev->atts = (xmlChar **)calloc(1, sizeof(xmlChar *));
ev->atts[0] = NULL;
}
}
/*
 * Return the stored value of the first attribute of ev whose name matches
 * `name` under xmlStrcasecmp(), or NULL when the event has no attribute
 * array or no such attribute. The returned pointer refers to storage owned
 * by the event.
 */
static const xmlChar* find_attr_value(const StartEvent *ev, const char *name)
{
    if (ev->atts == NULL) {
        return NULL;
    }

    /* Walk the flattened list one name/value pair at a time. */
    xmlChar **pair = ev->atts;
    while (pair[0] != NULL && pair[1] != NULL) {
        if (xmlStrcasecmp(pair[0], (const xmlChar *)name) == 0) {
            return pair[1];
        }
        pair += 2;
    }

    return NULL;
}
/*
 * Create an HTML parser context over the NUL-terminated buffer `buf`, route
 * its startElement events to test_sax_startElement with `cap` as user data,
 * and OR `flags` into the context's parser options.
 *
 * The context's own SAX handler is reconfigured in place instead of being
 * replaced by a pointer to a static structure: the context owns its handler
 * and releases it when htmlFreeParserCtxt() is called, so substituting a
 * non-heap object would both leak the original handler and pass a static
 * address to the allocator's free routine.
 *
 * Returns the new context; the caller releases it with htmlFreeParserCtxt().
 * Fails the current Unity test if the context cannot be set up.
 */
static htmlParserCtxtPtr make_ctxt(const char *buf, int flags, SAXCapture *cap)
{
    htmlParserCtxtPtr ctxt = htmlCreateMemoryParserCtxt(buf, (int)strlen(buf));
    TEST_ASSERT_NOT_NULL_MESSAGE(ctxt, "Failed to create HTML parser context");

    if (ctxt->sax == NULL) {
        /* Do not leak the context when the test is aborted. */
        htmlFreeParserCtxt(ctxt);
        TEST_FAIL_MESSAGE("HTML parser context has no SAX handler");
    }

    /* Disable every callback except startElement. */
    memset(ctxt->sax, 0, sizeof(*ctxt->sax));
    ctxt->sax->startElement = test_sax_startElement;

    ctxt->userData = cap;
    ctxt->options |= flags;
    return ctxt;
}
/* Unity per-test hook: make sure libxml2 is initialised before each test. */
void setUp(void)
{
    xmlInitParser();
}
/*
 * Unity per-test hook. Intentionally empty: no global cleanup is performed
 * here so that one test cannot interfere with the next.
 */
void tearDown(void)
{
}
/*
 * A plain "<div>" parsed with HTML_PARSE_NOIMPLIED must yield exactly one
 * startElement event named "div" with no attributes.
 */
void test_htmlParseStartTag_simple_div_noimplied(void)
{
    const char *markup = "<div>";
    SAXCapture events;
    htmlParserCtxtPtr parser;

    capture_init(&events);
    parser = make_ctxt(markup, HTML_PARSE_NOIMPLIED, &events);

    test_htmlParseStartTag(parser);

    TEST_ASSERT_EQUAL_INT(1, events.nevents);
    TEST_ASSERT_NOT_NULL(events.events[0].name);
    TEST_ASSERT_EQUAL_STRING("div", (const char *)events.events[0].name);
    TEST_ASSERT_EQUAL_INT(0, events.events[0].att_count);

    capture_free(&events);
    htmlFreeParserCtxt(parser);
}
/* Test: uppercase tag and attribute names are lowercased; values preserved */
void test_htmlParseStartTag_uppercase_and_attr_lowercased(void) {
const char *src = "<DIV CLASS=AbC ID=42>";
SAXCapture cap; capture_init(&cap);
htmlParserCtxtPtr ctxt = make_ctxt(src, HTML_PARSE_NOIMPLIED, &cap);
test_htmlParseStartTag(ctxt);
TEST_ASSERT_EQUAL_INT(1, cap.nevents);
TEST_ASSERT_EQUAL_STRING("div", (const char *)cap.events[0].name);
TEST_ASSERT_TRUE(cap.events[0].att_count >= 2);
/* Attribute names should be lowercased */
const xmlChar *vclass = find_attr_value(&cap.events[0], "class");
const xmlChar *vid = find_attr_value(&cap.events[0], "id");
TEST_ASSERT_NOT_NULL(vclass);
TEST_ASSERT_NOT_NULL(vid);
TEST_ASSERT_EQUAL_STRING("AbC", (const char *)vclass);
TEST_ASSERT_EQUAL_STRING("42", (const char *)vid);
capture_free(&cap);
htmlFreeParserCtxt(ctxt);
}
/*
 * Repeated attributes (in any letter case) must collapse to a single entry
 * that carries the value of the first occurrence.
 */
void test_htmlParseStartTag_duplicate_attributes_dedup(void)
{
    const char *markup = "<div class='a' CLASS=\"b\" class=c>";
    SAXCapture events;
    htmlParserCtxtPtr parser;

    capture_init(&events);
    parser = make_ctxt(markup, HTML_PARSE_NOIMPLIED, &events);

    test_htmlParseStartTag(parser);

    TEST_ASSERT_EQUAL_INT(1, events.nevents);

    /* Tally every surviving attribute whose name is "class". */
    const StartEvent *first = &events.events[0];
    int survivors = 0;
    xmlChar **pair = first->atts;
    while (pair[0] != NULL && pair[1] != NULL) {
        if (xmlStrcasecmp(pair[0], BAD_CAST "class") == 0) {
            survivors++;
        }
        pair += 2;
    }
    TEST_ASSERT_EQUAL_INT(1, survivors);

    /* The one that remains must hold the first value, 'a'. */
    const xmlChar *kept = find_attr_value(first, "class");
    TEST_ASSERT_NOT_NULL(kept);
    TEST_ASSERT_EQUAL_STRING("a", (const char *)kept);

    capture_free(&events);
    htmlFreeParserCtxt(parser);
}
/*
 * A stray '/' inside the tag that is not part of "/>" must be skipped: the
 * element and the attribute that follows the solidus are still reported.
 */
void test_htmlParseStartTag_unexpected_solidus_ignored(void)
{
    const char *markup = "<div / id='x'>";
    SAXCapture events;
    htmlParserCtxtPtr parser;

    capture_init(&events);
    parser = make_ctxt(markup, HTML_PARSE_NOIMPLIED, &events);

    test_htmlParseStartTag(parser);

    TEST_ASSERT_EQUAL_INT(1, events.nevents);
    TEST_ASSERT_EQUAL_STRING("div", (const char *)events.events[0].name);

    const xmlChar *id_value = find_attr_value(&events.events[0], "id");
    TEST_ASSERT_NOT_NULL(id_value);
    TEST_ASSERT_EQUAL_STRING("x", (const char *)id_value);

    capture_free(&events);
    htmlFreeParserCtxt(parser);
}
/*
 * A self-closing "<br/>" must still produce one startElement event, named
 * "br" and carrying no attributes.
 */
void test_htmlParseStartTag_self_closing(void)
{
    const char *markup = "<br/>";
    SAXCapture events;
    htmlParserCtxtPtr parser;

    capture_init(&events);
    parser = make_ctxt(markup, HTML_PARSE_NOIMPLIED, &events);

    test_htmlParseStartTag(parser);

    TEST_ASSERT_EQUAL_INT(1, events.nevents);
    TEST_ASSERT_EQUAL_STRING("br", (const char *)events.events[0].name);
    TEST_ASSERT_EQUAL_INT(0, events.events[0].att_count);

    capture_free(&events);
    htmlFreeParserCtxt(parser);
}
/*
 * Input that ends before the closing '>' must be discarded: no SAX event is
 * expected at all.
 */
void test_htmlParseStartTag_incomplete_tag_discarded(void)
{
    const char *markup = "<div id='x'";
    SAXCapture events;
    htmlParserCtxtPtr parser;

    capture_init(&events);
    parser = make_ctxt(markup, HTML_PARSE_NOIMPLIED, &events);

    test_htmlParseStartTag(parser);

    TEST_ASSERT_EQUAL_INT(0, events.nevents);

    capture_free(&events);
    htmlFreeParserCtxt(parser);
}
/*
 * A tag carrying 20 distinct attributes must report all of them; this
 * exercises growth of the parser's attribute storage.
 */
void test_htmlParseStartTag_many_attributes(void)
{
    /* Assemble "<span a0='v0' a1='v1' ... a19='v19'>" piece by piece. */
    char markup[4096];
    const int attr_total = 20;
    size_t used = (size_t)snprintf(markup, sizeof(markup), "<span");
    for (int k = 0; k < attr_total; k++) {
        used += (size_t)snprintf(markup + used, sizeof(markup) - used,
                                 " a%d='v%d'", k, k);
    }
    snprintf(markup + used, sizeof(markup) - used, ">");

    SAXCapture events;
    capture_init(&events);
    htmlParserCtxtPtr parser = make_ctxt(markup, HTML_PARSE_NOIMPLIED, &events);

    test_htmlParseStartTag(parser);

    TEST_ASSERT_EQUAL_INT(1, events.nevents);
    TEST_ASSERT_EQUAL_STRING("span", (const char *)events.events[0].name);
    TEST_ASSERT_EQUAL_INT(attr_total, events.events[0].att_count);

    /* Sample the first, a middle and the last attribute. */
    const StartEvent *first = &events.events[0];
    const xmlChar *head = find_attr_value(first, "a0");
    const xmlChar *middle = find_attr_value(first, "a7");
    const xmlChar *tail = find_attr_value(first, "a19");
    TEST_ASSERT_NOT_NULL(head);
    TEST_ASSERT_NOT_NULL(middle);
    TEST_ASSERT_NOT_NULL(tail);
    TEST_ASSERT_EQUAL_STRING("v0", (const char *)head);
    TEST_ASSERT_EQUAL_STRING("v7", (const char *)middle);
    TEST_ASSERT_EQUAL_STRING("v19", (const char *)tail);

    capture_free(&events);
    htmlFreeParserCtxt(parser);
}
/*
 * Test runner entry point: runs every test case in declaration order and
 * returns the value produced by UNITY_END() as the process exit status.
 */
int main(void)
{
    UNITY_BEGIN();

    RUN_TEST(test_htmlParseStartTag_simple_div_noimplied);
    RUN_TEST(test_htmlParseStartTag_uppercase_and_attr_lowercased);
    RUN_TEST(test_htmlParseStartTag_duplicate_attributes_dedup);
    RUN_TEST(test_htmlParseStartTag_unexpected_solidus_ignored);
    RUN_TEST(test_htmlParseStartTag_self_closing);
    RUN_TEST(test_htmlParseStartTag_incomplete_tag_discarded);
    RUN_TEST(test_htmlParseStartTag_many_attributes);

    int status = UNITY_END();
    return status;
}