/*
 * libxml/tests/tests_HTMLparser_htmlParseHTMLName.c
 *
 * Provenance: uploaded with huggingface_hub by AryaWu
 * (commit 6baed57, verified).
 */
#include "unity/unity.h"
#include <libxml/HTMLparser.h>
#include <libxml/dict.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
/*
 * Wrapper for the static function provided in the source module.
 *
 * htmlParseHTMLName() is not callable from this file directly, so the tests
 * go through this externally linked wrapper instead.
 *
 * ctxt: parser context whose current input is parsed.
 * attr: the tests in this file pass 1 when parsing an attribute name and 0
 *       otherwise.
 *
 * Returns an xmlHashedString; the tests in this file only inspect its `name`
 * member.
 *
 * NOTE(review): upstream libxml2 appears to declare xmlHashedString in a
 * private header rather than <libxml/dict.h> -- confirm the build makes the
 * type visible to this translation unit.
 */
extern xmlHashedString test_htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr);
/*
 * Parser context owned by the test that is currently running, or NULL.
 *
 * Unity abandons a test body as soon as an assertion fails, so cleanup placed
 * at the end of assert_name_parse() is skipped on failure. Tracking the
 * context here lets tearDown() release it on every path.
 */
static htmlParserCtxtPtr active_ctxt = NULL;

/* Free the context tracked in active_ctxt, if there is one. */
static void release_active_ctxt(void)
{
    if (active_ctxt != NULL) {
        htmlFreeParserCtxt(active_ctxt);
        active_ctxt = NULL;
    }
}

/*
 * Parse one name from `data` and verify both the result and the position the
 * parser stopped at.
 *
 * data:             input bytes; the length is passed explicitly in `len`
 * len:              number of bytes in `data`
 * attr:             forwarded unchanged to test_htmlParseHTMLName()
 * expected_name:    NUL-terminated string the returned name must equal
 * expected_advance: number of input bytes that must have been consumed
 * expected_stop:    byte that must remain unconsumed at the new position
 *
 * Any mismatch fails the current Unity test. Reaching the end of the input is
 * always treated as a failure: every caller expects a stop byte to remain.
 */
static void assert_name_parse(const unsigned char *data, size_t len,
                              int attr,
                              const char *expected_name,
                              size_t expected_advance,
                              unsigned char expected_stop)
{
    /* Defensive: never overwrite (and thereby leak) a context still tracked. */
    release_active_ctxt();

    active_ctxt = htmlCreateMemoryParserCtxt((const char *)data, (int)len);
    htmlParserCtxtPtr ctxt = active_ctxt;

    TEST_ASSERT_NOT_NULL_MESSAGE(ctxt, "Failed to create HTML parser context");
    TEST_ASSERT_NOT_NULL_MESSAGE(ctxt->dict, "Parser context dictionary is NULL");
    TEST_ASSERT_NOT_NULL_MESSAGE(ctxt->input, "Parser input is NULL");
    TEST_ASSERT_NOT_NULL_MESSAGE(ctxt->input->base, "Parser input base is NULL");

    xmlHashedString ret = test_htmlParseHTMLName(ctxt, attr);

    /* Verify the returned name */
    TEST_ASSERT_NOT_NULL_MESSAGE(ret.name, "Returned name is NULL (memory error?)");
    TEST_ASSERT_EQUAL_STRING_MESSAGE(expected_name, (const char *)ret.name,
                                     "Parsed name doesn't match expected");

    /*
     * Verify input advancement. The offset is measured against the input's
     * current base pointer rather than one captured before the parse, so the
     * subtraction always uses two pointers into the same live buffer.
     */
    size_t advanced = (size_t)(ctxt->input->cur - ctxt->input->base);
    TEST_ASSERT_EQUAL_size_t_MESSAGE(expected_advance, advanced,
                                     "Input pointer advanced by unexpected amount");

    /* Ensure we didn't consume the stop character */
    if (advanced < len) {
        TEST_ASSERT_EQUAL_UINT8_MESSAGE(expected_stop, (unsigned char)ctxt->input->cur[0],
                                        "Did not stop at expected stop character");
    } else {
        /* No byte is left to compare against expected_stop: always a failure. */
        TEST_FAIL_MESSAGE("Input advanced to end unexpectedly; expected a stop character to remain");
    }

    release_active_ctxt();
}

/* Unity setup/teardown */
void setUp(void) {
    /* no-op */
}

void tearDown(void) {
    /* Frees the context left behind when a test aborted on a failed assertion. */
    release_active_ctxt();
}
/* Tests */
/*
 * An upper-case ASCII tag name is expected back in lower case, with the
 * closing '>' left unconsumed in the input.
 */
void test_htmlParseHTMLName_ascii_tag_lowercase_stop_on_gt(void) {
    static const unsigned char markup[] = "DIV>";
    /* sizeof counts the literal's terminating NUL; keep it out of the input. */
    const size_t markup_len = sizeof(markup) - 1;

    /* "div" is 3 bytes long, so the parser must be sitting on '>' at offset 3. */
    assert_name_parse(markup, markup_len, 0, "div", 3, '>');
}
/*
 * In attribute mode the name is expected to end at '=' (which stays in the
 * input), and ASCII letters are expected back in lower case.
 */
void test_htmlParseHTMLName_attr_stop_on_equal(void) {
    static const unsigned char markup[] = "HREF=/path";
    /* Exclude the string literal's trailing NUL from the parsed input. */
    const size_t markup_len = sizeof(markup) - 1;
    const int parse_as_attr = 1;

    /* "href" takes 4 bytes; the next unread byte must be '='. */
    assert_name_parse(markup, markup_len, parse_as_attr, "href", 4, '=');
}
/*
 * Outside attribute mode a space is expected to terminate the name; the
 * already lower-case ASCII input is expected back unchanged.
 */
void test_htmlParseHTMLName_stop_on_space(void) {
    static const unsigned char markup[] = "name value";
    /* Drop the literal's terminating NUL from the byte count. */
    const size_t markup_len = sizeof(markup) - 1;

    /* Only "name" (4 bytes) is consumed; the space is left for the caller. */
    assert_name_parse(markup, markup_len, 0, "name", 4, ' ');
}
/*
 * A '/' following at least one name character (as in a "/>" sequence) is
 * expected to end the name without being consumed.
 */
void test_htmlParseHTMLName_stop_on_slash(void) {
    static const unsigned char markup[] = "br/>";
    /* The literal's terminating NUL is not part of the input. */
    const size_t markup_len = sizeof(markup) - 1;

    /* "br" is 2 bytes, leaving the parser positioned on '/'. */
    assert_name_parse(markup, markup_len, 0, "br", 2, '/');
}
/*
 * A well-formed multibyte UTF-8 sequence is expected to pass through
 * untouched, while the surrounding ASCII letters are lower-cased.
 */
void test_htmlParseHTMLName_valid_utf8_preserved(void) {
    /* "Straße>" encoded as UTF-8: 53 74 72 61 C3 9F 65 3E (no trailing NUL). */
    static const unsigned char input[] = {
        'S', 't', 'r', 'a', 0xC3, 0x9F, 'e', '>'
    };
    /* "straße" encoded as UTF-8: 73 74 72 61 C3 9F 65, NUL-terminated. */
    static const char want[] = {
        's', 't', 'r', 'a', (char)0xC3, (char)0x9F, 'e', '\0'
    };
    const size_t input_len = sizeof(input);

    /* All 7 bytes before '>' are consumed; '>' itself must remain. */
    assert_name_parse(input, input_len, 0, want, 7, '>');
}
/*
 * Bytes that do not form valid UTF-8 are expected to be replaced by U+FFFD
 * (EF BF BD in UTF-8) in the returned name.
 */
void test_htmlParseHTMLName_invalid_utf8_replacement(void) {
    /* Two invalid bytes, then the '>' that should end the name. */
    static const unsigned char input[] = { 0xC0, 0xAF, '>' };
    /* One replacement character is expected per invalid byte: two in total. */
    static const char want[] = {
        (char)0xEF, (char)0xBF, (char)0xBD,
        (char)0xEF, (char)0xBF, (char)0xBD,
        '\0'
    };
    const size_t input_len = sizeof(input);

    /* Both invalid bytes count as consumed; the parser must rest on '>'. */
    assert_name_parse(input, input_len, 0, want, 2, '>');
}
/*
 * A NUL byte inside the input is expected to appear in the returned name as
 * the UTF-8 replacement character rather than ending the name.
 */
void test_htmlParseHTMLName_embedded_nul_replacement(void) {
    /* "ab\0cd " -- the trailing space is the byte that should stop parsing. */
    static const unsigned char input[] = { 'a', 'b', 0x00, 'c', 'd', ' ' };
    /* "ab" + U+FFFD + "cd", NUL-terminated. */
    static const char want[] = {
        'a', 'b', (char)0xEF, (char)0xBF, (char)0xBD, 'c', 'd', '\0'
    };
    const size_t input_len = sizeof(input);

    /* The 5 bytes before the space are consumed; the space stays in the input. */
    assert_name_parse(input, input_len, 0, want, 5, ' ');
}
/*
 * Test runner entry point: runs every test case in this file under Unity and
 * returns the value produced by UNITY_END() as the process exit status.
 */
int main(void) {
    int status;

    UNITY_BEGIN();

    /* ASCII names and the delimiters that end them. */
    RUN_TEST(test_htmlParseHTMLName_ascii_tag_lowercase_stop_on_gt);
    RUN_TEST(test_htmlParseHTMLName_attr_stop_on_equal);
    RUN_TEST(test_htmlParseHTMLName_stop_on_space);
    RUN_TEST(test_htmlParseHTMLName_stop_on_slash);

    /* Multibyte, malformed and NUL input bytes. */
    RUN_TEST(test_htmlParseHTMLName_valid_utf8_preserved);
    RUN_TEST(test_htmlParseHTMLName_invalid_utf8_replacement);
    RUN_TEST(test_htmlParseHTMLName_embedded_nul_replacement);

    status = UNITY_END();

    /* Release libxml2's global state once all tests have finished. */
    xmlCleanupParser();

    return status;
}