if C tag is not present
=item B<-r>
look for classes used in C elements and print the class usage
table, no conversion is done if this option is specified
=item B<-b> I
use the specified C classes to detect section headings
I is like "class1,class2;class3;class4,class5",
groups delimited by semicolons specify the classes for the same
logical depth, if there are multiple classes for a single depth,
they can be separated by commas. No spaces are allowed in the
I
=back
=head1 EXAMPLES
html2xml -b head -p 14,20 text.html
=head1 AUTHOR
Mike Matsnev
=cut
use strict;
use warnings;
use HTML::TreeBuilder;
use Getopt::Std;
my $xmllib=$^O eq "MSWin32" ? "use Win32::OLE" : "use XML::LibXML";
eval $xmllib;
if ($@) {
print STDERR "No XML support found.\n";
undef $xmllib;
}
# global options
my $minempty=1; # minimum number of empty paragraphs that
# produces an
my $minfont=0; # minimum size to be interpreted as a section
my $ST="strong"; # strong tag
my $EM="emphasis"; # emphasis tag
my $print_toc=0; # print table of contents in the end
my $minstyle=0; # minimum font-size style to be used as a section
my $maxstyle=10000; # maximum ....
my $Encoding; # encoding
my $Title; # title
my $Author;
my $report_styles; # styles
my %sect_styles; # styles to use as sections
# fetch options
my %options;
getopts('m:f:sScp:e:rb:',\%options);
$minempty=$options{m}+0 if exists $options{m};
$minfont=$options{f}+0 if exists $options{f};
($ST,$EM)=("s","em") if $options{S};
($ST,$EM)=($EM,$ST) if $options{s};
$print_toc=1 if $options{c};
if ($options{p} && $options{p} =~ /^(\d+)(?:,(\d+))?$/) {
$minstyle=$1;
$maxstyle=$2 if $2;
}
$Encoding=$options{e} if $options{e};
$report_styles=1 if $options{r};
if ($options{b}) {
my $depth=3;
for (split(/;/,$options{b})) {
$sect_styles{lc($_)}=$depth for split(/,/,$_);
$depth+=3;
}
}
select(STDERR); $|=1; select(STDOUT);
my $infile=shift(@ARGV);
my $inf;
if (defined($infile) && $infile ne "-") {
open(INF,"< $infile") || die "Can't open '$infile' for reading: $!\n";
binmode(INF);
$inf=\*INF;
}
my $tree=new HTML::TreeBuilder;
$tree->p_strict(1);
$tree->store_comments(1);
print STDERR "Loading file... ";
$tree->parse_file($inf ? $inf : \*STDIN);
print STDERR "done.\n";
close($inf) if $inf;
my %styles_stats;
sub stylecheck {
no warnings;
my $elem=shift;
return unless ref($elem);
$styles_stats{$_}++ if $elem->tag eq "p" && ($_=$elem->attr('class'));
stylecheck($_) for $elem->content_list;
}
if ($report_styles) {
stylecheck($tree);
printf("%10d %s\n",$_->[0],$_->[1]) for sort { $b->[0] <=> $a->[0] }
map { [$styles_stats{$_},$_] } keys %styles_stats;
exit 0;
}
my $outfile;
if (@ARGV) {
$outfile=shift(@ARGV);
} else {
no warnings;
$outfile=$infile;
$outfile =~ s/\.[^\.\\\/]*$/.xml/;
$outfile='-' unless $outfile;
}
if ($outfile eq "-") {
$outfile=\*STDOUT;
} else {
open(OUTF,"> $outfile") || die "Can't open '$outfile' for writing: $!\n";
$outfile=\*OUTF;
}
binmode($outfile);
my $textbuf=''; # current paragraph contents
my $realtext=0; # 1 if there is any real text in textbuf
my $space=''; # current trailing space
my $lastempty=0; # last paragraph was
my $strong=0; # enabled/disabled in source
my $emphasis=0; # enabled/disabled in source
my $curstate=0; # current state of and in XML output
my @elist; # list of all paragraphs/headers
my $body=0;
my %footnotes;
my $paraid;
my $poem=0; # we are parsing a poem
my $applevel=0;
sub cleanstate {
# reinit state
$textbuf='';
$realtext=0;
$space='';
$lastempty=0;
$strong=0;
$emphasis=0;
$curstate=0;
$body=0;
$applevel=0;
$paraid=undef;
undef(@elist);
}
sub cleanup {
$_[0] =~ y/\t\r\n \xa0/ /s;
$_[0] =~ s/^ //;
$_[0] =~ s/ $//;
}
my %html_entities=( "iexcl" => "¡", "cent" => "¢", "pound" =>
"£", "curren" => "¤", "yen" => "¥", "brvbar" => "¦", "sect" => "§",
"uml" => "¨", "copy" => "©", "ordf" => "ª", "laquo" => "«",
"not" => "¬", "shy" => "", "reg" => "®", "macr" => "¯",
"deg" => "°", "plusmn" => "±", "sup2" => "²", "sup3" => "³",
"acute" => "´", "micro" => "µ", "para" => "¶",
"middot" => "·", "cedil" => "¸", "sup1" => "¹", "ordm" =>
"º", "raquo" => "»", "frac14" => "¼", "frac12" => "½",
"frac34" => "¾", "iquest" => "¿", "Agrave" => "À", "Aacute" =>
"Á", "Acirc" => "Â", "Atilde" => "Ã", "Auml" => "Ä",
"Aring" => "Å", "AElig" => "Æ", "Ccedil" => "Ç", "Egrave" =>
"È", "Eacute" => "É", "Ecirc" => "Ê", "Euml" => "Ë",
"Igrave" => "Ì", "Iacute" => "Í", "Icirc" => "Î", "Iuml" =>
"Ï", "ETH" => "Ð", "Ntilde" => "Ñ", "Ograve" => "Ò",
"Oacute" => "Ó", "Ocirc" => "Ô", "Otilde" => "Õ", "Ouml" =>
"Ö", "times" => "×", "Oslash" => "Ø", "Ugrave" => "Ù",
"Uacute" => "Ú", "Ucirc" => "Û", "Uuml" => "Ü", "Yacute" =>
"Ý", "THORN" => "Þ", "szlig" => "ß", "agrave" => "à",
"aacute" => "á", "acirc" => "â", "atilde" => "ã", "auml" =>
"ä", "aring" => "å", "aelig" => "æ", "ccedil" => "ç",
"egrave" => "è", "eacute" => "é", "ecirc" => "ê", "euml" =>
"ë", "igrave" => "ì", "iacute" => "í", "icirc" => "î",
"iuml" => "ï", "eth" => "ð", "ntilde" => "ñ", "ograve" =>
"ò", "oacute" => "ó", "ocirc" => "ô", "otilde" => "õ",
"ouml" => "ö", "divide" => "÷", "oslash" => "ø", "ugrave" =>
"ù", "uacute" => "ú", "ucirc" => "û", "uuml" => "ü",
"yacute" => "ý", "thorn" => "þ", "yuml" => "ÿ", "fnof" =>
"ƒ", "Alpha" => "Α", "Beta" => "Β", "Gamma" => "Γ", "Delta" => "Δ",
"Epsilon" => "Ε", "Zeta" => "Ζ", "Eta" => "Η",
"Theta" => "Θ", "Iota" => "Ι", "Kappa" => "Κ", "Lambda" =>
"Λ", "Mu" => "Μ", "Nu" => "Ν", "Xi" => "Ξ", "Omicron" =>
"Ο", "Pi" => "Π", "Rho" => "Ρ", "Sigma" => "Σ", "Tau" =>
"Τ", "Upsilon" => "Υ", "Phi" => "Φ", "Chi" => "Χ", "Psi" =>
"Ψ", "Omega" => "Ω", "alpha" => "α", "beta" => "β", "gamma" => "γ",
"delta" => "δ", "epsilon" => "ε", "zeta" => "ζ",
"eta" => "η", "theta" => "θ", "iota" => "ι", "kappa" =>
"κ", "lambda" => "λ", "mu" => "μ", "nu" => "ν", "xi" =>
"ξ", "omicron" => "ο", "pi" => "π", "rho" => "ρ", "sigmaf" => "ς",
"sigma" => "σ", "tau" => "τ", "upsilon" => "υ",
"phi" => "φ", "chi" => "χ", "psi" => "ψ", "omega" => "ω",
"thetasym" => "ϑ", "upsih" => "ϒ", "piv" => "ϖ", "bull" =>
"•", "hellip" => "…", "prime" => "′", "Prime" => "″",
"oline" => "‾", "frasl" => "⁄", "weierp" => "℘", "image" =>
"ℑ", "real" => "ℜ", "trade" => "™", "alefsym" => "ℵ",
"larr" => "←", "uarr" => "↑", "rarr" => "→", "darr" =>
"↓", "harr" => "↔", "crarr" => "↵", "lArr" => "⇐",
"uArr" => "⇑", "rArr" => "⇒", "dArr" => "⇓", "hArr" =>
"⇔", "forall" => "∀", "part" => "∂", "exist" => "∃",
"empty" => "∅", "nabla" => "∇", "isin" => "∈", "notin" =>
"∉", "ni" => "∋", "prod" => "∏", "sum" => "∑", "minus" => "−",
"lowast" => "∗", "radic" => "√", "prop" => "∝",
"infin" => "∞", "ang" => "∠", "and" => "∧", "or" =>
"∨", "cap" => "∩", "cup" => "∪", "int" => "∫", "there4" => "∴",
"sim" => "∼", "cong" => "≅", "asymp" => "≈",
"ne" => "≠", "equiv" => "≡", "le" => "≤", "ge" => "≥",
"sub" => "⊂", "sup" => "⊃", "nsub" => "⊄", "sube" =>
"⊆", "supe" => "⊇", "oplus" => "⊕", "otimes" => "⊗",
"perp" => "⊥", "sdot" => "⋅", "lceil" => "⌈", "rceil" =>
"⌉", "lfloor" => "⌊", "rfloor" => "⌋", "lang" => "〈",
"rang" => "〉", "loz" => "◊", "spades" => "♠", "clubs" =>
"♣", "hearts" => "♥", "diams" => "♦", "quot" => "\"", "amp" => "&",
"lt" => "<", "gt" => ">", "OElig" => "Œ", "oelig" =>
"œ", "Scaron" => "Š", "scaron" => "š", "Yuml" => "Ÿ",
"circ" => "ˆ", "tilde" => "˜", "ensp" => " ", "emsp" =>
" ", "thinsp" => " ", "zwnj" => "", "zwj" => "",
"lrm" => "", "rlm" => "", "ndash" => "–", "mdash" =>
"—", "lsquo" => "‘", "rsquo" => "’", "sbquo" => "‚",
"ldquo" => "“", "rdquo" => "”", "bdquo" => "„", "dagger" =>
"†", "Dagger" => "‡", "permil" => "‰", "lsaquo" => "‹",
"rsaquo" => "›", "euro" => "€", "nbsp" => " ");
my %angbr=('<' => '<', '>' => '>');
my %dotmap=(' ' => ' ', "\xa0" => ' ', '' => '');
sub moretext {
my $txt=shift;
{
no warnings;
$txt =~ s/&(?!#)(?:([a-zA-Z]+);)?/$1 ? $html_entities{$1} : "&"/sge;
$txt =~ s/[<>]/$angbr{$1}/sg unless $_[0];
}
$txt =~ y/\t\r\n / /s; # also destroys nbsp
$space=' ' if $txt =~ /^\s/;
my $msp='';
$msp=' ' if $txt =~ /\s$/;
$txt =~ s/^ //;
$txt =~ s/ $//;
$txt =~ s/[ \xa0]\.[ \xa0]\.[ \xa0]\.([ \xa0]?)/…$dotmap{$1}/sg;
if (!length($txt)) { # whitespace only???
$space=(length($space) || length($msp)) && $realtext ? ' ' : '';
} else {
$textbuf.=$space if $realtext;
$realtext=1,checkhl() unless $realtext;
$textbuf.=$txt;
$space=$msp;
}
}
sub checkhl { # check and highlighting
my $newstate=0;
$newstate|=1 if @_ ? $_[0] : $strong;
$newstate|=2 if @_ ? $_[1] : $emphasis;
return if $curstate==$newstate || ($curstate==0 && !$realtext);
# always close whatever is open
$textbuf.="$ST>" if $curstate&1;
$textbuf.="$EM>" if $curstate&2;
# and now open new tags
$textbuf.="<$EM>" if $newstate&2;
$textbuf.="<$ST>" if $newstate&1;
$curstate=$newstate;
}
sub pbreak {
my $how=shift;
$textbuf =~ s/^\xa0+$//s; # treat paragraphs of nbsps as empty
if (length($textbuf)) {
# close highlighting
checkhl(0,0);
push(@elist,[ 0, undef, $poem ]) if $lastempty>$minempty;
push(@elist,[ 0, "\xa0" x $applevel . $textbuf, $poem, $paraid]);
$lastempty=0;
$textbuf='';
$space='';
$realtext=0;
$paraid=undef;
} else {
++$lastempty if !$how;
}
}
sub add_section {
my ($txt,$depth)=@_;
cleanup($txt);
pbreak();
push(@elist,[ $depth, $txt ]);
}
sub get_styles {
my $styles=$_[0]->attr('style');
return () unless $styles;
my @styles;
for (split(/;/,$styles)) {
push (@styles,$1,$2) if /^\s*(\S+)\s*:\s*(\S+)\s*$/;
}
@styles;
}
sub element {
my $elem=shift;
if ($body) {
if (!ref($elem)) {
moretext($elem);
return;
}
my $t=$elem->tag;
if ($t eq "div") {
my $class=$elem->attr('class');
if ($class && $class =~ /^app(\d)$/) {
my $tmp=$applevel;
$applevel+=$1;
element($_) for $elem->content_list;
$applevel=$tmp;
return;
}
} elsif ($t eq "p") {
my $class=$elem->attr('class');
if ($class) {
if ($sect_styles{lc($class)}) { # MS Office stuff
add_section($elem->as_text,$sect_styles{lc($class)});
return;
} elsif ($class eq "MsoFootnoteText" || $class eq "note") {
# skip it here
return;
} elsif ($class eq "lyrics" && !$poem) {
pbreak();
$poem=1;
element($_) for $elem->content_list;
pbreak();
$poem=0;
return;
} elsif ($class eq "intro") {
pbreak();
++$emphasis;
checkhl;
element($_) for $elem->content_list;
--$emphasis;
checkhl;
pbreak();
return;
}
}
pbreak(1); # start of p doesnt add an empty line, but it stops an unclosed paragraph
element($_) for $elem->content_list;
pbreak(); # end of paragraph always closes
return;
} elsif ($t eq "dd" || $t eq "br") {
pbreak();
} elsif ($t =~ /^h(\d)/) {
add_section($elem->as_text,$1*3);
return;
} elsif ($minstyle && $t eq "span") {
my %styles=get_styles($elem);
if ($styles{'font-size'} &&
$styles{'font-size'} =~ /^(\d+(?:\.\d+)?)(?:pt)?$/ &&
$1>=$minstyle && $1<=$maxstyle)
{
my $depth=36-$1;
$depth=1 if $depth<1;
$depth=20 if $depth>20;
add_section($elem->as_text,$depth);
return;
}
} elsif ($minfont && $t eq "font") {
my $size=$elem->attr('size');
if ($size && $size>=$minfont) {
my $depth=$minfont+4-$size;
$depth=1 if $depth<1;
$depth=4 if $depth>4;
add_section($elem->as_text,$depth*3);
return;
}
} elsif ($t eq "script" || $t eq "style") {
return;
} elsif ($t eq "i" || $t eq "em") {
++$emphasis;
checkhl;
element($_) for $elem->content_list;
--$emphasis;
checkhl;
return;
} elsif ($t eq "blockquote") {
pbreak();
++$emphasis;
checkhl;
element($_) for $elem->content_list;
--$emphasis;
checkhl;
return;
} elsif ($t eq "b" || $t eq "strong") {
++$strong;
checkhl;
element($_) for $elem->content_list;
--$strong;
checkhl;
return;
} elsif ($t eq "a") {
my $href=$elem->attr('href');
my $name=$elem->attr('name');
if ($name && $footnotes{$name}) {
$paraid=$name;
}
if ($href && $href =~ /^#/) { # hyperlink
$href=substr($href,1);
if ($footnotes{$href}) {
moretext("",1);
element($_) for $elem->content_list;
moretext("",1);
return;
}
}
}
} else {
return if !ref($elem);
if ($elem->tag eq "body") {
$body=1;
element($_) for $elem->content_list;
$body=0;
return;
} elsif ($elem->tag eq "meta") {
my $eq=$elem->attr('http-equiv');
my $content=$elem->attr('content');
$Encoding=$1 if $eq && $content && lc($eq) eq "content-type" &&
!$Encoding && $content =~ /charset=(\S+)/i;
} elsif ($elem->tag eq "title") {
$Title=$elem->as_text;
}
}
element($_) for $elem->content_list;
}
sub findnotes_element {
my $elem=shift;
return if !ref($elem);
my $t=$elem->tag;
if ($t eq "a" && $elem->attr("name")) {
$footnotes{$elem->attr("name")}=1;
}
findnotes_element($_) for $elem->content_list;
}
print STDERR "Collecting links... ";
findnotes_element($tree);
print STDERR "done.\n";
cleanstate();
print STDERR "Building paragraph list... ";
element($tree); # process text
pbreak();
if ($xmllib) {
# scavenge m$ pseudo xml
my $dom;
for ($tree->find_by_tag_name("~comment")) {
my $cm=$_->attr('text');
if ($cm =~ s/^\[if [^\]]*\]>//s && $cm =~ s///s) {
$cm =~ s/o:(\w)/$1/sg;
# now parse this as an XML
if ($dom=parse_xml($cm)) {
my $eba=first_node($dom,"//eBook-author");
my $pa=first_node($dom,"//Author");
my $pt=first_node($dom,"//eBook-title");
$pa=$eba if $eba;
$Title=$pt if $pt;
$Author=$pa if $pa;
} else {
#my $reason=$dom->{parseError}->{reason};
#print STDERR "XML parse error: $reason" if $reason !~ /namespace/;
}
}
}
}
$tree->delete;
$Encoding||="iso-8859-1";
$Title||="";
$Author||="";
cleanup($Title);
cleanup($Author);
print STDERR "done.\n";
# convert the list into a tree
my $root={ type => undef, chl => [ ], parent => undef, title => undef, level => 0 };
my $cur=$root;
print STDERR "Building sections tree... ";
# iterate over paragraphs
for my $p (@elist) {
if ($p->[0] == 0) { # plain text
if (!$cur->{type}) { # not defined yet
$cur->{type}='p'; # text
} elsif ($cur->{type} eq 's') { # section, create a new one
my $new={ type => 'p', chl => [ ], parent => $cur, title => undef, level => $cur->{level}+1};
$cur=$new;
}
push(@{$cur->{chl}},[$p->[1],$p->[2],$p->[3]]);
} else { # header
# back off to the proper level
while ($p->[0]<=$cur->{level}) {
$cur=$cur->{parent};
}
my $new={ type => undef, chl => [ ], parent => $cur, title => $p->[1], level => $p->[0] };
if (!$cur->{type} || $cur->{type} eq 's') { # no contents yet
$cur->{type}='s';
push(@{$cur->{chl}},$new);
} else { # convert to container
# create an implicit section
my $tmp={ type => 'p', chl => $cur->{chl}, parent => $cur, title => undef, level => $cur->{level} + 1 };
$cur->{chl}=[$tmp,$new];
$cur->{type}='s';
}
$cur=$new;
}
}
undef(@elist);
print STDERR "done.\n";
print STDERR "Cleaning up... ";
# remove leading and trailing empty lines
sub remove_empty_lines {
my $section=shift;
if ($section->{type} && $section->{type} eq 'p') {
my $n;
for ($n=0;$n<=$#{$section->{chl}} && !defined($section->{chl}[$n][0]);++$n) {}
splice(@{$section->{chl}},0,$n) if $n;
for ($n=0;$n<=$#{$section->{chl}} && !defined($section->{chl}[-$n-1][0]);++$n) {}
splice(@{$section->{chl}},-$n) if $n;
} else {
remove_empty_lines($_) for @{$section->{chl}};
}
}
remove_empty_lines($root);
# remove completely empty sections
sub remove_empty_sections {
no warnings;
my $section=shift;
return if $section->{type} && $section->{type} eq 'p';
remove_empty_sections($_) for @{$section->{chl}};
@{$section->{chl}}=grep { length($_->{title}) || @{$_->{chl}} } @{$section->{chl}};
# eliminate 's' sections with empty titles
my $chl=$section->{chl};
for (my $i=0;$i<=$#$chl;++$i) {
if ((!defined($chl->[$i]{type}) || $chl->[$i]{type} eq 's') &&
!length($chl->[$i]{title}))
{
my $n=@{$chl->[$i]{chl}};
splice(@$chl,$i,1,@{$chl->[$i]{chl}});
for (my $k=0;$k<$n;++$k) {
$chl->[$i+$k]{parent}=$section;
}
$i+=$n;
} else {
++$i;
}
}
# eliminate 'p' sections with empty titles
if ($#$chl==0 && defined($chl->[0]{type}) && $chl->[0]{type} eq 'p' &&
(!length($chl->[0]{title}) || !length($section->{title})))
{
$chl->[0]{parent}=undef;
$section->{title}=$chl->[0]{title} if length($chl->[0]{title});
$section->{type}=$chl->[0]{type};
$section->{level}=$chl->[0]{level} if $chl->[0]{level}&&!$section->{level};
@$chl=@{$chl->[0]{chl}};
}
}
remove_empty_sections($root);
print STDERR "done.\n";
print STDERR "Writing XML... " unless $print_toc;
# print a standard header
my @auth=split(' ',$Author);
if (@auth==0) { # no author
@auth=('','','');
} elsif (@auth==1) {
@auth=('','',$auth[0]);
} elsif (@auth==2) {
@auth=($auth[0],'',$auth[1]);
} elsif (@auth>3) {
@auth=('','',join(' ',@auth));
}
print $outfile <
$auth[0]
$auth[1]
$auth[2]
$Title
EOF
# let's see what we've done
my $indent=3;
sub stopen($$$) {
$_[0]=1,++$_[2],print $outfile " " x ($_[2]-1),"\n" if !$_[0];
$_[1]=1,++$_[2],print $outfile " " x ($_[2]-1),"\n" if !$_[1];
}
sub stclose($$) {
$_[0]=0,--$_[1],print $outfile " " x $_[1],"\n" if $_[0];
}
sub pstclose($$$) {
$_[0]=0,--$_[2],print $outfile " " x $_[2],"\n" if $_[0];
$_[1]=0,--$_[2],print $outfile " " x $_[2],"\n" if $_[1];
}
sub glink($) {
$_->[2] ? " id=\"L$_->[2]\"" : "";
}
sub section {
my $sect=shift;
my $st_open=0;
my $poem_open=0;
print $outfile " " x $indent, "";
print $outfile "",$sect->{title},"
" if defined $sect->{title} && length $sect->{title};
print $outfile "\n";
print " " x $indent, $sect->{title}||"", "\n" if $print_toc;
++$indent;
if ($sect->{type} && $sect->{type} eq "p") {
for (@{$sect->{chl}}) {
if (defined($_->[0])) {
if ($_->[1]) {
stopen($poem_open,$st_open,$indent);
print $outfile " " x $indent, "",$_->[0],"\n";
} else {
pstclose($poem_open,$st_open,$indent);
print $outfile " " x $indent, "",$_->[0],"
\n"
}
} else {
if ($_->[1]) {
stclose($st_open,$indent);
} else {
pstclose($poem_open,$st_open,$indent);
print $outfile " " x $indent, "\n";
}
}
}
pstclose($poem_open,$st_open,$indent);
} else {
section($_) for @{$sect->{chl}};
}
--$indent;
print $outfile " " x $indent,"\n";
}
if (!($root->{type} && $root->{type} eq 'p') && (!defined($root->{title}) ||
!length($root->{title})))
{
section($_) for @{$root->{chl}};
} else {
section($root);
}
print $outfile <
EOF
print STDERR "done.\n" unless $print_toc;
exit 0;
sub parse_xml {
my $xml=shift;
if ($^O eq "MSWin32") {
my $dom=new Win32::OLE('Msxml2.DOMDocument.4.0');
$dom->{async}=0;
$dom->{validateOnParse}=0;
return $dom if $dom->loadXML($xml);
} else {
my $parser=new XML::LibXML;
my $dom=eval {
$parser->parse_string($xml);
};
return $dom if $dom && !$@;
}
undef;
}
sub first_node {
my ($dom,$xpe)=@_;
if ($^O eq "MSWin32") {
my $node=$dom->selectSingleNode($xpe);
return $node->text if $node;
undef;
} else {
$dom->findvalue($xpe);
}
}