# Public domain.
# Changes:
# 2023-09-21: detect libxml2 "bad name" errors.
# 2019-12-16: better error messages; debug support.
# 2018-10-24: avoid a fatal error by ignoring the nodes (comments...)
# outside the root element.
# 2017-08-05: first version.
package HTML5ToLibXML;
use strict;
use Carp;
use HTML::Gumbo;
use XML::LibXML;
# Names of the HTML5 void elements, as listed at
# https://www.w3.org/TR/html5/syntax.html#void-elements
# %voidelem is used as a set (each name maps to 1): parse() consults it
# to decide whether a newly created element goes onto its stack of open
# elements.
my @void_names = qw(
    area base br col embed hr img input keygen
    link meta param source track wbr
);
my %voidelem;
@voidelem{@void_names} = (1) x @void_names;
# parse($html)
#
# Parse the HTML5 document given in the string $html with HTML::Gumbo
# (callback format) and build an XML::LibXML document from the events.
#
# Parameter:
#   $html - the HTML5 source, as a string.
#
# Returns:
#   the XML::LibXML document created with version '1.0' and encoding
#   'utf-8'. The root element is created in the XHTML namespace; the
#   other elements are created with createElement(). Text, comment and
#   CDATA events received while no element is open (i.e. outside the
#   root element) are ignored.
#
# Dies (via confess) on an odd-sized attribute list, on an end tag that
# does not match the innermost open element, and on an unknown event.
# Errors from setAttribute() other than libxml2 "bad name" errors are
# propagated; "bad name" errors are only reported with warn, unless
# $quiet is true.
sub parse ($)
{
    my ($html) = @_;

    # NOTE(review): no declaration of $debug and $quiet is visible in
    # this file; they are declared here as package variables
    # ($HTML5ToLibXML::debug, $HTML5ToLibXML::quiet) so that the code
    # compiles under "use strict" -- confirm against the intended source
    # of these flags.
    our ($debug, $quiet);

    my $doc = XML::LibXML::Document->createDocument('1.0', 'utf-8');

    # Stack of the currently open (non-void) elements, innermost last.
    my @nodes;

    HTML::Gumbo->new->parse(
        $html,
        format   => 'callback',
        callback => sub {
            my ($event) = shift;
            warn "Event: $event\n" if $debug;

            if ($event =~ /^document (start|end)$/) {
                # Nothing to do for the document boundaries.
            }
            elsif ($event eq 'start') {
                my ($tag, $attr) = @_;
                warn " Tag: <$tag>\n" if $debug;

                my $element;
                if (@nodes) {
                    # Nested element: attach it to the innermost open one.
                    $element = $doc->createElement($tag);
                    $nodes[-1]->appendChild($element);
                }
                else {
                    # First element: it becomes the root of the document.
                    $element =
                        $doc->createElementNS('http://www.w3.org/1999/xhtml',
                                              $tag);
                    $doc->setDocumentElement($element);
                }

                # $attr is consumed two items at a time (name, value).
                while (@$attr) {
                    @$attr >= 2 or confess "missing attribute value";
                    my $name = $attr->[0];
                    warn " Att: $name=\"$attr->[1]\"\n" if $debug;
                    # Gumbo does not signal a parse error for
                    # unexpected-character-in-attribute-name.
                    # Thus we need to detect that by checking
                    # libxml2 "bad name" errors.
                    eval { $element->setAttribute(splice @$attr, 0, 2); };
                    if ($@) {
                        # Anything but a "bad name" error is re-thrown.
                        $@ =~ /^bad name / or die;
                        warn "$0: parse error (bad attribute name $name)"
                            unless $quiet;
                    }
                }

                # Void elements are not pushed, so later nodes are never
                # attached below them and no end event is expected.
                push @nodes, $element unless $voidelem{$tag};
            }
            elsif ($event eq 'end') {
                # The end tag must match the innermost open element; the
                # emptiness test avoids a method call on an undefined
                # value when no element is open.
                @nodes && $_[0] eq $nodes[-1]->nodeName
                    or confess "internal error";
                pop @nodes;
            }
            elsif (@nodes) {
                # Character data, comment or CDATA inside the root
                # element. The same events outside the root element fall
                # through this chain and are ignored.
                my $node;
                if ($event =~ /^(text|space)$/) {
                    $node = $doc->createTextNode($_[0]);
                }
                elsif ($event eq 'comment') {
                    $node = $doc->createComment($_[0]);
                }
                elsif ($event eq 'cdata') {
                    $node = $doc->createCDATASection($_[0]);
                }
                else {
                    confess "unknown event";
                }
                $nodes[-1]->appendChild($node);
            }
        },
    );

    return $doc;
}

=head1 NAME

HTML5ToLibXML - HTML5 parsing to LibXML document

=head1 SYNOPSIS

  use HTML5ToLibXML;
  print HTML5ToLibXML::parse('foo');

=head1 DESCRIPTION

HTML5ToLibXML parses an HTML5 document provided as a string and produces a
LibXML document. It uses the L<HTML::Gumbo> and L<XML::LibXML> modules.

Nodes (comments...) outside the root element are not copied.


=head1 EXAMPLE

The following script takes an HTML5 file as argument or on the standard
input and outputs the corresponding XHTML file and the text content.

  use HTML5ToLibXML;
  use open ':encoding(UTF-8)';
  binmode STDIN, ':encoding(UTF-8)';
  my $doc = HTML5ToLibXML::parse(do { local $/; <> });
  $doc->toFH(*STDOUT, 0);
  binmode STDOUT, ':encoding(UTF-8)';
  print "Text content: ", $doc->textContent;


=head1 AUTHOR

Vincent Lefevre

=head1 LICENSE

Public domain.

=cut

# $Id: HTML5ToLibXML.pm 161712 2023-09-21 11:11:34Z vinc17/cventin $