#!/usr/bin/perl -w =head1 NAME html2xml - convert HTML to FictioBook XML =head1 SYNOPSIS B [options] [source file [destination file]] =head1 DESCRIPTION I parses its HTML input and converts text and simple formatting to FictionBook XML. Source file can be omited or specified as '-' to read standard input. Destination file is produced by replacing any extension of source file with .xml, if destination file is not specified and source is stdin, the result is written to stdout. =head1 OPTIONS =over 4 =item B<-m> I treat I consecutive paragraph end events as a single end of paragraph, this can be used with files that add

after each paragraph for example =item B<-f> I minimum size in the C element that will be converted to a section heading, logical depth will be clamp(I+4-I,1,4)*3 =item B<-s> swap the meaning of C and C tags =item B<-S> use C and C elements instead of C and C =item B<-c> print the detected table of contents to stdout =item B<-p> I look for C elements with the I style specified and use the ones with I <= I <= I as section headings, logical depth will be clamp(36-I,1,20) =item B<-e> I set the charset in the xml declaration to I, if not specified then the charset in html C tag will be used, or I if C tag is not present =item B<-r> look for classes used in C

elements and print the class usage table, no conversion is done if this option is specified =item B<-b> I use the specified C

classes to detect section headings I is like "class1,class2;class3;class4,class5", groups delimited by semicolons specify the classes for the same logical depth, if there are multiple classes for a single depth, they can be separated by commas. No spaces are allowed in the I =back =head1 EXAMPLES html2xml -b head -p 14,20 text.html =head1 AUTHOR Mike Matsnev =cut use strict; use warnings; use HTML::TreeBuilder; use Getopt::Std; my $xmllib=$^O eq "MSWin32" ? "use Win32::OLE" : "use XML::LibXML"; eval $xmllib; if ($@) { print STDERR "No XML support found.\n"; undef $xmllib; } # global options my $minempty=1; # minimum number of empty paragraphs that # produces an my $minfont=0; # minimum size to be interpreted as a section my $ST="strong"; # strong tag my $EM="emphasis"; # emphasis tag my $print_toc=0; # print table of contents in the end my $minstyle=0; # minimum font-size style to be used as a section my $maxstyle=10000; # maximum .... my $Encoding; # encoding my $Title; # title my $Author; my $report_styles; # styles my %sect_styles; # styles to use as sections # fetch options my %options; getopts('m:f:sScp:e:rb:',\%options); $minempty=$options{m}+0 if $options{m} && $options{m}>0; $minfont=$options{f}+0 if $options{f}; ($ST,$EM)=("s","em") if $options{S}; ($ST,$EM)=($EM,$ST) if $options{s}; $print_toc=1 if $options{c}; if ($options{p} && $options{p} =~ /^(\d+)(?:,(\d+))?$/) { $minstyle=$1; $maxstyle=$2 if $2; } $Encoding=$options{e} if $options{e}; $report_styles=1 if $options{r}; if ($options{b}) { my $depth=3; for (split(/;/,$options{b})) { $sect_styles{lc($_)}=$depth for split(/,/,$_); $depth+=3; } } select(STDERR); $|=1; select(STDOUT); my $infile=shift(@ARGV); my $inf; if (defined($infile) && $infile ne "-") { open(INF,"< $infile") || die "Can't open '$infile' for reading: $!\n"; binmode(INF); $inf=\*INF; } my $tree=new HTML::TreeBuilder; $tree->p_strict(1); $tree->store_comments(1); print STDERR "Loading file... "; $tree->parse_file($inf ? $inf : \*STDIN); print STDERR "done.\n"; close($inf) if $inf; my %styles_stats; sub stylecheck { no warnings; my $elem=shift; return unless ref($elem); $styles_stats{$_}++ if $elem->tag eq "p" && ($_=$elem->attr('class')); stylecheck($_) for $elem->content_list; } if ($report_styles) { stylecheck($tree); printf("%10d %s\n",$_->[0],$_->[1]) for sort { $b->[0] <=> $a->[0] } map { [$styles_stats{$_},$_] } keys %styles_stats; exit 0; } my $outfile; if (@ARGV) { $outfile=shift(@ARGV); } else { no warnings; $outfile=$infile; $outfile =~ s/\.[^\.\\\/]*$/.xml/; $outfile='-' unless $outfile; } if ($outfile eq "-") { $outfile=\*STDOUT; } else { open(OUTF,"> $outfile") || die "Can't open '$outfile' for writing: $!\n"; $outfile=\*OUTF; } binmode($outfile); my $textbuf=''; # current paragraph contents my $realtext=0; # 1 if there is any real text in textbuf my $space=''; # current trailing space my $lastempty=0; # last paragraph was my $strong=0; # enabled/disabled in source my $emphasis=0; # enabled/disabled in source my $curstate=0; # current state of and in XML output my @elist; # list of all paragraphs/headers my $body=0; my $next_name=1; # footnotes my %footnotes; my $note_name=undef; my $note_count=0; my $poem=0; # we are parsing a poem sub cleanstate { # reinit state $textbuf=''; $realtext=0; $space=''; $lastempty=0; $strong=0; $emphasis=0; $curstate=0; $body=0; undef(@elist); } sub cleanup { $_[0] =~ y/\t\r\n \xa0/ /s; $_[0] =~ s/^ //; $_[0] =~ s/ $//; } my %html_entities=( "iexcl" => "¡", "cent" => "¢", "pound" => "£", "curren" => "¤", "yen" => "¥", "brvbar" => "¦", "sect" => "§", "uml" => "¨", "copy" => "©", "ordf" => "ª", "laquo" => "«", "not" => "¬", "shy" => "­", "reg" => "®", "macr" => "¯", "deg" => "°", "plusmn" => "±", "sup2" => "²", "sup3" => "³", "acute" => "´", "micro" => "µ", "para" => "¶", "middot" => "·", "cedil" => "¸", "sup1" => "¹", "ordm" => "º", "raquo" => "»", "frac14" => "¼", "frac12" => "½", "frac34" => "¾", "iquest" => "¿", "Agrave" => "À", "Aacute" => "Á", "Acirc" => "Â", "Atilde" => "Ã", "Auml" => "Ä", "Aring" => "Å", "AElig" => "Æ", "Ccedil" => "Ç", "Egrave" => "È", "Eacute" => "É", "Ecirc" => "Ê", "Euml" => "Ë", "Igrave" => "Ì", "Iacute" => "Í", "Icirc" => "Î", "Iuml" => "Ï", "ETH" => "Ð", "Ntilde" => "Ñ", "Ograve" => "Ò", "Oacute" => "Ó", "Ocirc" => "Ô", "Otilde" => "Õ", "Ouml" => "Ö", "times" => "×", "Oslash" => "Ø", "Ugrave" => "Ù", "Uacute" => "Ú", "Ucirc" => "Û", "Uuml" => "Ü", "Yacute" => "Ý", "THORN" => "Þ", "szlig" => "ß", "agrave" => "à", "aacute" => "á", "acirc" => "â", "atilde" => "ã", "auml" => "ä", "aring" => "å", "aelig" => "æ", "ccedil" => "ç", "egrave" => "è", "eacute" => "é", "ecirc" => "ê", "euml" => "ë", "igrave" => "ì", "iacute" => "í", "icirc" => "î", "iuml" => "ï", "eth" => "ð", "ntilde" => "ñ", "ograve" => "ò", "oacute" => "ó", "ocirc" => "ô", "otilde" => "õ", "ouml" => "ö", "divide" => "÷", "oslash" => "ø", "ugrave" => "ù", "uacute" => "ú", "ucirc" => "û", "uuml" => "ü", "yacute" => "ý", "thorn" => "þ", "yuml" => "ÿ", "fnof" => "ƒ", "Alpha" => "Α", "Beta" => "Β", "Gamma" => "Γ", "Delta" => "Δ", "Epsilon" => "Ε", "Zeta" => "Ζ", "Eta" => "Η", "Theta" => "Θ", "Iota" => "Ι", "Kappa" => "Κ", "Lambda" => "Λ", "Mu" => "Μ", "Nu" => "Ν", "Xi" => "Ξ", "Omicron" => "Ο", "Pi" => "Π", "Rho" => "Ρ", "Sigma" => "Σ", "Tau" => "Τ", "Upsilon" => "Υ", "Phi" => "Φ", "Chi" => "Χ", "Psi" => "Ψ", "Omega" => "Ω", "alpha" => "α", "beta" => "β", "gamma" => "γ", "delta" => "δ", "epsilon" => "ε", "zeta" => "ζ", "eta" => "η", "theta" => "θ", "iota" => "ι", "kappa" => "κ", "lambda" => "λ", "mu" => "μ", "nu" => "ν", "xi" => "ξ", "omicron" => "ο", "pi" => "π", "rho" => "ρ", "sigmaf" => "ς", "sigma" => "σ", "tau" => "τ", "upsilon" => "υ", "phi" => "φ", "chi" => "χ", "psi" => "ψ", "omega" => "ω", "thetasym" => "ϑ", "upsih" => "ϒ", "piv" => "ϖ", "bull" => "•", "hellip" => "…", "prime" => "′", "Prime" => "″", "oline" => "‾", "frasl" => "⁄", "weierp" => "℘", "image" => "ℑ", "real" => "ℜ", "trade" => "™", "alefsym" => "ℵ", "larr" => "←", "uarr" => "↑", "rarr" => "→", "darr" => "↓", "harr" => "↔", "crarr" => "↵", "lArr" => "⇐", "uArr" => "⇑", "rArr" => "⇒", "dArr" => "⇓", "hArr" => "⇔", "forall" => "∀", "part" => "∂", "exist" => "∃", "empty" => "∅", "nabla" => "∇", "isin" => "∈", "notin" => "∉", "ni" => "∋", "prod" => "∏", "sum" => "∑", "minus" => "−", "lowast" => "∗", "radic" => "√", "prop" => "∝", "infin" => "∞", "ang" => "∠", "and" => "∧", "or" => "∨", "cap" => "∩", "cup" => "∪", "int" => "∫", "there4" => "∴", "sim" => "∼", "cong" => "≅", "asymp" => "≈", "ne" => "≠", "equiv" => "≡", "le" => "≤", "ge" => "≥", "sub" => "⊂", "sup" => "⊃", "nsub" => "⊄", "sube" => "⊆", "supe" => "⊇", "oplus" => "⊕", "otimes" => "⊗", "perp" => "⊥", "sdot" => "⋅", "lceil" => "⌈", "rceil" => "⌉", "lfloor" => "⌊", "rfloor" => "⌋", "lang" => "〈", "rang" => "〉", "loz" => "◊", "spades" => "♠", "clubs" => "♣", "hearts" => "♥", "diams" => "♦", "quot" => "\"", "amp" => "&", "lt" => "<", "gt" => ">", "OElig" => "Œ", "oelig" => "œ", "Scaron" => "Š", "scaron" => "š", "Yuml" => "Ÿ", "circ" => "ˆ", "tilde" => "˜", "ensp" => " ", "emsp" => " ", "thinsp" => " ", "zwnj" => "‌", "zwj" => "‍", "lrm" => "‎", "rlm" => "‏", "ndash" => "–", "mdash" => "—", "lsquo" => "‘", "rsquo" => "’", "sbquo" => "‚", "ldquo" => "“", "rdquo" => "”", "bdquo" => "„", "dagger" => "†", "Dagger" => "‡", "permil" => "‰", "lsaquo" => "‹", "rsaquo" => "›", "euro" => "€", "nbsp" => " "); sub moretext { my $txt=shift; { no warnings; $txt =~ s/&(?!#)(?:([a-zA-Z]+);)?/$1 ? $html_entities{$1} : "&"/sge; } $txt =~ y/\t\r\n \xa0/ /s; # also destroys nbsp $space=' ' if $txt =~ /^\s/; my $msp=''; $msp=' ' if $txt =~ /\s$/; $txt =~ s/^ //; $txt =~ s/ $//; $txt =~ s/ \. \. \./…/sg; #$txt =~ s/&(?!#|(?:[a-z]+;))/&/g; # escape &, but don't touch entities if (!length($txt)) { # whitespace only??? $space=(length($space) || length($msp)) && $realtext ? ' ' : ''; } else { $textbuf.=$space if $realtext; $realtext=1,checkhl() unless $realtext; $textbuf.=$txt; $space=$msp; } } sub checkhl { # check and highlighting my $newstate=0; $newstate|=1 if @_ ? $_[0] : $strong; $newstate|=2 if @_ ? $_[1] : $emphasis; return if $curstate==$newstate || ($curstate==0 && !$realtext); # always close whatever is open $textbuf.="" if $curstate&1; $textbuf.="" if $curstate&2; # and now open new tags $textbuf.="<$EM>" if $newstate&2; $textbuf.="<$ST>" if $newstate&1; $curstate=$newstate; } sub pbreak { if (length($textbuf)) { # close highlighting checkhl(0,0); push(@elist,[ 0, undef, $poem ]) if $lastempty>$minempty; push(@elist,[ 0, $textbuf, $poem]); $lastempty=0; $textbuf=''; $space=''; $realtext=0; } else { ++$lastempty; } } sub add_section { my ($txt,$depth)=@_; cleanup($txt); pbreak(); push(@elist,[ $depth, $txt ]); } sub get_styles { my $styles=$_[0]->attr('style'); return () unless $styles; my @styles; for (split(/;/,$styles)) { push (@styles,$1,$2) if /^\s*(\S+)\s*:\s*(\S+)\s*$/; } @styles; } sub element { my $elem=shift; if ($body) { if (!ref($elem)) { moretext($elem); return; } my $t=$elem->tag; if ($t eq "p") { my $class=$elem->attr('class'); if ($class) { if ($sect_styles{lc($class)}) { # MS Office stuff add_section($elem->as_text,$sect_styles{lc($class)}); return; } elsif ($class eq "MsoFootnoteText" || $class eq "note") { # skip it here return; } elsif ($class eq "lyrics" && !$poem) { pbreak(); $poem=1; element($_) for $elem->content_list; pbreak(); $poem=0; return; } elsif ($class eq "intro") { pbreak(); ++$emphasis; checkhl; element($_) for $elem->content_list; --$emphasis; checkhl; pbreak(); return; } } pbreak(); } elsif ($t eq "dd" || $t eq "br") { pbreak(); } elsif ($t =~ /^h(\d)/) { add_section($elem->as_text,$1*3); return; } elsif ($minstyle && $t eq "span") { my %styles=get_styles($elem); if ($styles{'font-size'} && $styles{'font-size'} =~ /^(\d+(?:\.\d+)?)(?:pt)?$/ && $1>=$minstyle && $1<=$maxstyle) { my $depth=36-$1; $depth=1 if $depth<1; $depth=20 if $depth>20; add_section($elem->as_text,$depth); return; } } elsif ($minfont && $t eq "font") { my $size=$elem->attr('size'); if ($size && $size>=$minfont) { my $depth=$minfont+4-$size; $depth=1 if $depth<1; $depth=4 if $depth>4; add_section($elem->as_text,$depth*3); return; } } elsif ($t eq "script" || $t eq "style") { return; } elsif ($t eq "i" || $t eq "em") { ++$emphasis; checkhl; element($_) for $elem->content_list; --$emphasis; checkhl; return; } elsif ($t eq "blockquote") { pbreak(); ++$emphasis; checkhl; element($_) for $elem->content_list; --$emphasis; checkhl; return; } elsif ($t eq "b" || $t eq "strong") { ++$strong; checkhl; element($_) for $elem->content_list; --$strong; checkhl; return; } elsif ($t eq "a") { my $href=$elem->attr('href'); if ($href && $href =~ /^#/) { # ms office footnote $href=substr($href,1); if ($footnotes{$href}) { moretext(""); moretext("

$_

") for @{$footnotes{$href}}; moretext(""); ++$next_name; return; } } } } else { return if !ref($elem); if ($elem->tag eq "body") { $body=1; element($_) for $elem->content_list; $body=0; return; } elsif ($elem->tag eq "meta") { my $eq=$elem->attr('http-equiv'); my $content=$elem->attr('content'); $Encoding=$1 if $eq && $content && lc($eq) eq "content-type" && !$Encoding && $content =~ /charset=(\S+)/i; } elsif ($elem->tag eq "title") { $Title=$elem->as_text; } } element($_) for $elem->content_list; } sub findnotes_element { my $elem=shift; if ($body) { if (!ref($elem)) { moretext($elem); return; } my $t=$elem->tag; if ($t eq "p") { pbreak(); } elsif ($t eq "dd" || $t eq "br") { pbreak(); } elsif ($t =~ /^h(\d)/) { pbreak(); } elsif ($t eq "script" || $t eq "style") { return; } elsif ($t eq "i") { ++$emphasis; checkhl; element($_) for $elem->content_list; --$emphasis; checkhl; return; } elsif ($t eq "b") { ++$strong; checkhl; element($_) for $elem->content_list; --$strong; checkhl; return; } elsif ($t eq "a") { $note_name=$elem->attr('name'); } } else { return if !ref($elem); if ($elem->tag eq "p") { my $class=$elem->attr('class'); if ($class && ($class eq "MsoFootnoteText" || $class eq "note")) { $body=1; findnotes_element($_) for $elem->content_list; pbreak(); shift(@elist) while @elist && (!defined($elist[0][1]) || !length($elist[0][1])); pop(@elist) while @elist && (!defined($elist[-1][1])|| !length($elist[-1][1])); if ($note_name) { $footnotes{$note_name}=[map { $_->[1] } @elist]; $note_name=undef; ++$note_count; } cleanstate(); } } } findnotes_element($_) for $elem->content_list; } print STDERR "Building footnotes list... "; findnotes_element($tree); print STDERR "done ($note_count found).\n"; cleanstate(); print STDERR "Building paragraph list... "; element($tree); # process text pbreak(); if ($xmllib) { # scavenge m$ pseudo xml my $dom; for ($tree->find_by_tag_name("~comment")) { my $cm=$_->attr('text'); if ($cm =~ s/^\[if [^\]]*\]>//s && $cm =~ s///s) { $cm =~ s/o:(\w)/$1/sg; # now parse this as an XML if ($dom=parse_xml($cm)) { my $eba=first_node($dom,"//eBook-author"); my $pa=first_node($dom,"//Author"); my $pt=first_node($dom,"//eBook-title"); $pa=$eba if $eba; $Title=$pt if $pt; $Author=$pa if $pa; } else { #my $reason=$dom->{parseError}->{reason}; #print STDERR "XML parse error: $reason" if $reason !~ /namespace/; } } } } $tree->delete; $Encoding||="iso-8859-1"; $Title||=""; $Author||=""; cleanup($Title); cleanup($Author); print STDERR "done.\n"; # convert the list into a tree my $root={ type => undef, chl => [ ], parent => undef, title => undef, level => 0 }; my $cur=$root; print STDERR "Building sections tree... "; # iterate over paragraphs for my $p (@elist) { if ($p->[0] == 0) { # plain text if (!$cur->{type}) { # not defined yet $cur->{type}='p'; # text } elsif ($cur->{type} eq 's') { # section, create a new one my $new={ type => 'p', chl => [ ], parent => $cur, title => undef, level => $cur->{level}+1}; $cur=$new; } push(@{$cur->{chl}},[$p->[1],$p->[2]]); } else { # header # back off to the proper level while ($p->[0]<=$cur->{level}) { $cur=$cur->{parent}; } my $new={ type => undef, chl => [ ], parent => $cur, title => $p->[1], level => $p->[0] }; if (!$cur->{type} || $cur->{type} eq 's') { # no contents yet $cur->{type}='s'; push(@{$cur->{chl}},$new); } else { # convert to container # create an implicit section my $tmp={ type => 'p', chl => $cur->{chl}, parent => $cur, title => undef, level => $cur->{level} + 1 }; $cur->{chl}=[$tmp,$new]; $cur->{type}='s'; } $cur=$new; } } undef(@elist); print STDERR "done.\n"; print STDERR "Cleaning up... "; # remove leading and trailing empty lines sub remove_empty_lines { my $section=shift; if ($section->{type} && $section->{type} eq 'p') { my $n; for ($n=0;$n<=$#{$section->{chl}} && !defined($section->{chl}[$n][0]);++$n) {} splice(@{$section->{chl}},0,$n) if $n; for ($n=0;$n<=$#{$section->{chl}} && !defined($section->{chl}[-$n-1][0]);++$n) {} splice(@{$section->{chl}},-$n) if $n; } else { remove_empty_lines($_) for @{$section->{chl}}; } } remove_empty_lines($root); # remove completely empty sections sub remove_empty_sections { no warnings; my $section=shift; return if $section->{type} && $section->{type} eq 'p'; remove_empty_sections($_) for @{$section->{chl}}; @{$section->{chl}}=grep { length($_->{title}) || @{$_->{chl}} } @{$section->{chl}}; # eliminate 's' sections with empty titles my $chl=$section->{chl}; for (my $i=0;$i<=$#$chl;++$i) { if ((!defined($chl->[$i]{type}) || $chl->[$i]{type} eq 's') && !length($chl->[$i]{title})) { my $n=@{$chl->[$i]{chl}}; splice(@$chl,$i,1,@{$chl->[$i]{chl}}); for (my $k=0;$k<$n;++$k) { $chl->[$i+$k]{parent}=$section; } $i+=$n; } else { ++$i; } } # eliminate 'p' sections with empty titles if ($#$chl==0 && defined($chl->[0]{type}) && $chl->[0]{type} eq 'p' && (!length($chl->[0]{title}) || !length($section->{title}))) { $chl->[0]{parent}=undef; $section->{title}=$chl->[0]{title} if length($chl->[0]{title}); $section->{type}=$chl->[0]{type}; $section->{level}=$chl->[0]{level} if $chl->[0]{level}&&!$section->{level}; @$chl=@{$chl->[0]{chl}}; } } remove_empty_sections($root); print STDERR "done.\n"; print STDERR "Writing XML... " unless $print_toc; # print a standard header my @auth=split(' ',$Author); if (@auth==0) { # no author @auth=('','',''); } elsif (@auth==1) { @auth=('','',$auth[0]); } elsif (@auth==2) { @auth=($auth[0],'',$auth[1]); } elsif (@auth>3) { @auth=('','',join(' ',@auth)); } print $outfile < $auth[0] $auth[1] $auth[2] $Title

EOF # let's see what we've done my $indent=3; sub stopen($$$) { $_[0]=1,++$_[2],print $outfile " " x ($_[2]-1),"\n" if !$_[0]; $_[1]=1,++$_[2],print $outfile " " x ($_[2]-1),"\n" if !$_[1]; } sub stclose($$) { $_[0]=0,--$_[1],print $outfile " " x $_[1],"\n" if $_[0]; } sub pstclose($$$) { $_[0]=0,--$_[2],print $outfile " " x $_[2],"\n" if $_[0]; $_[1]=0,--$_[2],print $outfile " " x $_[2],"\n" if $_[1]; } sub section { my $sect=shift; my $st_open=0; my $poem_open=0; if ($sect->{title}) { print $outfile " " x $indent, "

{title},"\">\n"; print " " x $indent, $sect->{title}, "\n" if $print_toc; } else { print $outfile " " x $indent, "
\n"; } ++$indent; if ($sect->{type} && $sect->{type} eq "p") { for (@{$sect->{chl}}) { if (defined($_->[0])) { if ($_->[1]) { stopen($poem_open,$st_open,$indent); print $outfile " " x $indent, "",$_->[0],"\n"; } else { pstclose($poem_open,$st_open,$indent); print $outfile " " x $indent, "

",$_->[0],"

\n" } } else { if ($_->[1]) { stclose($st_open,$indent); } else { pstclose($poem_open,$st_open,$indent); print $outfile " " x $indent, "\n"; } } } pstclose($poem_open,$st_open,$indent); } else { section($_) for @{$sect->{chl}}; } --$indent; print $outfile " " x $indent,"
\n"; } if (!($root->{type} && $root->{type} eq 'p') && (!defined($root->{title}) || !length($root->{title}))) { section($_) for @{$root->{chl}}; } else { section($root); } print $outfile < EOF print STDERR "done.\n" unless $print_toc; exit 0; sub parse_xml { my $xml=shift; if ($^O eq "MSWin32") { my $dom=new Win32::OLE('Msxml2.DOMDocument.4.0'); $dom->{async}=0; $dom->{validateOnParse}=0; return $dom if $dom->loadXML($xml); } else { my $parser=new XML::LibXML; my $dom=eval { $parser->parse_string($xml); }; return $dom if $dom && !$@; } undef; } sub first_node { my ($dom,$xpe)=@_; if ($^O eq "MSWin32") { my $node=$dom->selectSingleNode($xpe); return $node->text if $node; undef; } else { $dom->findvalue($xpe); } }