package T2X; use strict; sub remws_inp { # remove whitespace $_[0] =~ tr/\000-\040/ /s; $_[0] =~ s/^ //; $_[0] =~ s/ $//; } sub remws { # remove whitespace no warnings; my $tmp=shift; $tmp =~ tr/\000-\040/ /s; $tmp =~ s/^ //; $tmp =~ s/ $//; $tmp; } sub escape { # xml-escape no warnings; my $p=shift; my $h=shift; $p =~ s/&/&/g; if (!$h) { $p =~ s//>/g; } $p; } sub conv_quotes { # nicely convert possibly nested quotes local $^W=0; my ($l,$r)=("\x93","\x94"); ($l,$r)=("\xab","\xbb") if $_[0] =~ tr/\200-\377//; my $t=""; my $nest=0; my $last=0; while ($_[0] =~ /((?<=[^ ])")|(?:"(?=[^ ,.]))/sgc) { if (length($1)) { # close return if $nest<=0; $t.=substr($_[0],$last,pos($_[0])-$last-1); $t.=$r; --$nest; } else { #open return if $nest>1; $t.=substr($_[0],$last,pos($_[0])-$last-1); $t.=$l; ++$nest; } $last=pos($_[0]); } if ($nest==0) { # properly balanced $t.=substr($_[0],pos($_[0])); $_[0]=$t; } } sub beautify { # beautify a paragraph my $p=shift; my $h=shift; $p =~ s/^- /– /; conv_quotes($p); $p =~ s/\.\.\./…/g; if (!$h) { $p =~ s/&/&/g; $p =~ s//>/g; } $p; } sub make_xml { # convert a book into xml no warnings; my $book=shift; my $max=shift; my $xml=""; my @au=split(' ',escape(remws($book->{author}))); if (1==@au) { unshift(@au,"",""); } elsif (2==@au) { splice(@au,1,0,""); } elsif (3<@au) { @au=("","",escape(remws($book->{author}))); } my $title=escape(remws($book->{title})); $xml .= qq(\n\n); $xml .= qq(\n); $xml .= qq( \n); $xml .= qq( $title\n); $xml .= qq( \n); $xml .= qq( $au[0]\n); $xml .= qq( $au[1]\n); $xml .= qq( $au[2]\n); $xml .= qq( \n); $xml .= qq( ); $xml .= escape(join("",map { "

" . remws($_) . "

" } split (/\n/,$book->{annotation})),1); $xml .= qq(
\n); $xml .= qq(
\n); $xml .= qq( \n); for (@{$book->{body}}) { $xml .= qq( \n); for (@{$_->{text}}) { my $tmp=beautify($_,$book->{html}); if (length($tmp)) { if ($tmp =~ /^\s*\*\s*\*\s*\*\s*$/) { $xml .= qq( * * *\n); } else { $xml .= qq(

); $xml .= $tmp; $xml .= qq(

\n); } } else { $xml .= qq( \n); } last if defined($max) && length($xml)>=$max; } $xml .= qq(
\n); last if defined($max) && length($xml)>=$max; } $xml .= qq( \n); $xml .= qq(
\n); $xml; } sub replace_tag { my ($tag,$end)=@_; return "<${end}emphasis>" if $tag eq "i"; return "<${end}strong>" if $tag eq "b"; if ($tag =~ /^h[0-9]$/ ) { return "\n <$tag>" if !length($end); return "\n "; } return "
" if $tag eq "hr"; ""; # delete all other tags } my %html_entities=( "iexcl" => "¡", "cent" => "¢", "pound" => "£", "curren" => "¤", "yen" => "¥", "brvbar" => "¦", "sect" => "§", "uml" => "¨", "copy" => "©", "ordf" => "ª", "laquo" => "«", "not" => "¬", "shy" => "­", "reg" => "®", "macr" => "¯", "deg" => "°", "plusmn" => "±", "sup2" => "²", "sup3" => "³", "acute" => "´", "micro" => "µ", "para" => "¶", "middot" => "·", "cedil" => "¸", "sup1" => "¹", "ordm" => "º", "raquo" => "»", "frac14" => "¼", "frac12" => "½", "frac34" => "¾", "iquest" => "¿", "Agrave" => "À", "Aacute" => "Á", "Acirc" => "Â", "Atilde" => "Ã", "Auml" => "Ä", "Aring" => "Å", "AElig" => "Æ", "Ccedil" => "Ç", "Egrave" => "È", "Eacute" => "É", "Ecirc" => "Ê", "Euml" => "Ë", "Igrave" => "Ì", "Iacute" => "Í", "Icirc" => "Î", "Iuml" => "Ï", "ETH" => "Ð", "Ntilde" => "Ñ", "Ograve" => "Ò", "Oacute" => "Ó", "Ocirc" => "Ô", "Otilde" => "Õ", "Ouml" => "Ö", "times" => "×", "Oslash" => "Ø", "Ugrave" => "Ù", "Uacute" => "Ú", "Ucirc" => "Û", "Uuml" => "Ü", "Yacute" => "Ý", "THORN" => "Þ", "szlig" => "ß", "agrave" => "à", "aacute" => "á", "acirc" => "â", "atilde" => "ã", "auml" => "ä", "aring" => "å", "aelig" => "æ", "ccedil" => "ç", "egrave" => "è", "eacute" => "é", "ecirc" => "ê", "euml" => "ë", "igrave" => "ì", "iacute" => "í", "icirc" => "î", "iuml" => "ï", "eth" => "ð", "ntilde" => "ñ", "ograve" => "ò", "oacute" => "ó", "ocirc" => "ô", "otilde" => "õ", "ouml" => "ö", "divide" => "÷", "oslash" => "ø", "ugrave" => "ù", "uacute" => "ú", "ucirc" => "û", "uuml" => "ü", "yacute" => "ý", "thorn" => "þ", "yuml" => "ÿ", "fnof" => "ƒ", "Alpha" => "Α", "Beta" => "Β", "Gamma" => "Γ", "Delta" => "Δ", "Epsilon" => "Ε", "Zeta" => "Ζ", "Eta" => "Η", "Theta" => "Θ", "Iota" => "Ι", "Kappa" => "Κ", "Lambda" => "Λ", "Mu" => "Μ", "Nu" => "Ν", "Xi" => "Ξ", "Omicron" => "Ο", "Pi" => "Π", "Rho" => "Ρ", "Sigma" => "Σ", "Tau" => "Τ", "Upsilon" => "Υ", "Phi" => "Φ", "Chi" => "Χ", "Psi" => "Ψ", "Omega" => "Ω", "alpha" => "α", "beta" => "β", "gamma" => "γ", "delta" => "δ", "epsilon" => "ε", "zeta" => "ζ", "eta" => "η", "theta" => "θ", "iota" => "ι", "kappa" => "κ", "lambda" => "λ", "mu" => "μ", "nu" => "ν", "xi" => "ξ", "omicron" => "ο", "pi" => "π", "rho" => "ρ", "sigmaf" => "ς", "sigma" => "σ", "tau" => "τ", "upsilon" => "υ", "phi" => "φ", "chi" => "χ", "psi" => "ψ", "omega" => "ω", "thetasym" => "ϑ", "upsih" => "ϒ", "piv" => "ϖ", "bull" => "•", "hellip" => "…", "prime" => "′", "Prime" => "″", "oline" => "‾", "frasl" => "⁄", "weierp" => "℘", "image" => "ℑ", "real" => "ℜ", "trade" => "™", "alefsym" => "ℵ", "larr" => "←", "uarr" => "↑", "rarr" => "→", "darr" => "↓", "harr" => "↔", "crarr" => "↵", "lArr" => "⇐", "uArr" => "⇑", "rArr" => "⇒", "dArr" => "⇓", "hArr" => "⇔", "forall" => "∀", "part" => "∂", "exist" => "∃", "empty" => "∅", "nabla" => "∇", "isin" => "∈", "notin" => "∉", "ni" => "∋", "prod" => "∏", "sum" => "∑", "minus" => "−", "lowast" => "∗", "radic" => "√", "prop" => "∝", "infin" => "∞", "ang" => "∠", "and" => "∧", "or" => "∨", "cap" => "∩", "cup" => "∪", "int" => "∫", "there4" => "∴", "sim" => "∼", "cong" => "≅", "asymp" => "≈", "ne" => "≠", "equiv" => "≡", "le" => "≤", "ge" => "≥", "sub" => "⊂", "sup" => "⊃", "nsub" => "⊄", "sube" => "⊆", "supe" => "⊇", "oplus" => "⊕", "otimes" => "⊗", "perp" => "⊥", "sdot" => "⋅", "lceil" => "⌈", "rceil" => "⌉", "lfloor" => "⌊", "rfloor" => "⌋", "lang" => "〈", "rang" => "〉", "loz" => "◊", "spades" => "♠", "clubs" => "♣", "hearts" => "♥", "diams" => "♦", "quot" => "\"", "amp" => "&", "lt" => "<", "gt" => ">", "OElig" => "Œ", "oelig" => "œ", "Scaron" => "Š", "scaron" => "š", "Yuml" => "Ÿ", "circ" => "ˆ", "tilde" => "˜", "ensp" => " ", "emsp" => " ", "thinsp" => " ", "zwnj" => "‌", "zwj" => "‍", "lrm" => "‎", "rlm" => "‏", "ndash" => "–", "mdash" => "—", "lsquo" => "‘", "rsquo" => "’", "sbquo" => "‚", "ldquo" => "“", "rdquo" => "”", "bdquo" => "„", "dagger" => "†", "Dagger" => "‡", "permil" => "‰", "lsaquo" => "‹", "rsaquo" => "›", "euro" => "€", "nbsp" => " " ); sub replace_ent { my $ent=shift; return $html_entities{$ent} if $html_entities{$ent}; return "&$ent;" if $ent =~ /^#\d+$/ || $ent =~ /^#x[\dA-Fa-f]+$/; return "&" if !length($ent); ""; } sub load_file { my $infile=shift; my $text; my $fmt=shift; my $info=''; # load file if (defined($infile)) { open(SRC,"< $infile") || die "Can't open '$infile': $!\n"; binmode(SRC); { local $/=undef; $text=; } close(SRC); } else { binmode(STDIN); { local $/=undef; $text=; } } $info=length($text) . " byte(s)"; conv_charset($text); my $html=0; # check moshkow's formatting if ($text =~ /\x14/) { $text =~ s|\x14 \*([^\n]*)\* \x15|

$1

|sg; $text =~ s|\x14([^\n]*)\x15|

$1

|sg; $html = 1; } # remove msdos garbage and trailing whitespace $text =~ tr/\r//d; $text =~ tr/\000-\011\013-\037\xa0/ /; $text =~ s/ +$//mg; # check if it is some crappy html $html||= $text =~ //i; my $anno=""; my $title=""; my $author=""; if ($html) { # delete most of the garbage # all before title $text =~ s/^.*?(?=)//s; # all past the end of text my $tmp=rindex($text,"]*>=replace_tag($2,$1)=sge; $text =~ s/&(([^;]{1,15});)?/replace_ent($2)/sge; $text =~ s=\s+= =g; # remove empty space $text =~ s=\s+= =g; # remove empty space # convert runs of --- to hr $text =~ s/-{5,}/
/sg; # stuff between first two hr is moved to annotation $anno=join("\n",map { remws($_) } grep { length } split(/\n/,$1)) if $text =~ s/
(.*?)
//s; # first becomes the title $title=$1 if $text =~ s/([^<]*)<\/h[0-9]>//s; $author=$1 if $title =~ s/(.*\.)(.*)/$2/; $author =~ s/(?s $text =~ s/
//sg; remws_inp($title); remws_inp($author); $info.=", html"; } # count total number of lines my $nlines= $text =~ tr/\n//; # check if the whole file is indented and remove that extra indentation my @indent; ++$indent[length($1)] while $text =~ /^(\s+)/mg; for (my $space=1;$space<@indent;++$space) { no warnings; if ($indent[$space]>$nlines*0.87) { $text =~ s/^\s{1-$space}//mg; splice(@indent,0,$space+1); last; } } # try to guess file format # check remaining indentation if (!defined($fmt)) { no warnings; my $ilines=0; $ilines+=$_ for @indent; $info.=", IL=$ilines, NL=$nlines"; if ($ilines<$nlines*0.03) { $fmt=1; } elsif ($ilines>$nlines*0.08) { $fmt=0; } else { # shold really ask user here $fmt=1; } } $info.=", FMT=$fmt"; # split the whole thing into paragraphs my @plist; if ($fmt==3) { @plist=split(/\n\s*\n\s*\n\s*\n/s,$text); } if ($fmt==2) { @plist=split(/\n\n/s,$text); } elsif ($fmt==1) { @plist=split(/\n/s,$text); } else { @plist=split(/(?:\n )|(?:\n(?=\n))/s,$text); } # delete unneeded whitespace remws_inp($_) for @plist; # try to find chapter headings my $max=$fmt ? 50 : 120; my @chlist; my $curch=[]; my $curtitle=""; push(@plist,""); if ($html) { for (my $i=0;$i<@plist-1;++$i) { if ($plist[$i] =~ m=^(.*)$=) { # this seems to be a chapter heading if (@$curch) { push(@chlist,{ 'title' => $curtitle, 'text' => $curch}); $curch=[]; } $curtitle=$1; $curtitle =~ s/<[^>]*>//g; # remove markup from titles } else { push(@$curch,$plist[$i]); } } } else { if ($fmt==2) { for (my $i=0;$i<@plist;++$i) { if ($plist[$i] !~ /^\s*\*\s*\*\s*\*\s*$/ && $plist[$i] =~ /^(chapter|section|book)/i) { if (@$curch) { push(@chlist,{ 'title' => $curtitle, 'text' => $curch}); $curch=[]; } $curtitle=$plist[$i]; } else { push(@$curch,$plist[$i]); } } } else { for (my $i=0;$i<@plist-1;++$i) { if (!length($plist[$i-1]) && !length($plist[$i+1]) && length($plist[$i])>0 && length($plist[$i])<$max && $plist[$i] !~ /^\s*\*\s*\*\s*\*\s*$/) { # this seems to be a chapter heading if (@$curch) { push(@chlist,{ 'title' => $curtitle, 'text' => $curch}); $curch=[]; } $curtitle=$plist[$i]; } else { push(@$curch,$plist[$i]); } } } } push(@chlist,{ 'title' => $curtitle, 'text' => $curch}); # remove empty paragraphs for my $chap (@chlist) { for (my $i=0;$i<@{$chap->{text}};++$i) { if (!length($chap->{text}[$i])) { my $j; for ($j=1;$i+$j<@{$chap->{text}} && !length($chap->{text}[$i+$j]);++$j) { } splice(@{$chap->{text}},$i,$j-1) if $j>1; } } shift(@{$chap->{text}}) if @{$chap->{text}} && !length($chap->{text}[0]); pop(@{$chap->{text}}) if @{$chap->{text}} && !length($chap->{text}[-1]); } # delete empty chapters @chlist=grep { length($_->{title}) || @{$_->{text}} } @chlist; $info.=", CH=" . ($#chlist+1); # return the whole thing now my %book; $book{title}=$title; $book{annotation}=$anno; $book{author}=$author; $book{body}=\@chlist; $book{file}=$infile; $book{html}=$html; $book{info}=$info; $book{fmt}=$fmt; \%book; } ########################################## # charset support my @letter_maps=( ["WIN", [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, ]], ["DOS" , [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]], ["KOI", [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x01, 0x02, 0x17, 0x05, 0x06, 0x15, 0x04, 0x16, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x20, 0x11, 0x12, 0x13, 0x14, 0x07, 0x03, 0x1d, 0x1c, 0x08, 0x19, 0x1e, 0x1a, 0x18, 0x1b, 0x1f, 0x01, 0x02, 0x17, 0x05, 0x06, 0x15, 0x04, 0x16, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x20, 0x11, 0x12, 0x13, 0x14, 0x07, 0x03, 0x1d, 0x1c, 0x08, 0x19, 0x1e, 0x1a, 0x18, 0x1b, ]], ["ISO", [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]]); my @distrib=( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77, 259, 55, 298, 90, 111, 296, 12, 96, 384, 709, 284, 340, 1, 101, 381, 393, 422, 4, 1, 88, 6, 84, 49, 32, 0, 0, 0, 0, 86, 181, 0, 52, 0, 4, 0, 1, 180, 2, 0, 91, 0, 18, 86, 3, 45, 237, 0, 196, 13, 2, 73, 0, 3, 0, 0, 2, 18, 11, 271, 2, 0, 0, 36, 0, 416, 1, 7, 0, 20, 530, 0, 65, 283, 0, 12, 80, 9, 158, 548, 19, 63, 254, 15, 73, 0, 3, 2, 4, 93, 0, 0, 220, 16, 0, 0, 22, 0, 98, 0, 0, 0, 109, 30, 0, 0, 58, 0, 15, 194, 0, 59, 616, 0, 83, 0, 1, 49, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 412, 3, 76, 1, 5, 433, 40, 0, 214, 0, 29, 63, 2, 252, 358, 9, 186, 22, 10, 132, 0, 5, 7, 4, 3, 0, 0, 60, 42, 0, 2, 21, 0, 5, 102, 200, 260, 255, 140, 83, 113, 9, 281, 170, 544, 400, 776, 18, 84, 631, 427, 403, 10, 0, 43, 31, 105, 85, 55, 0, 0, 0, 0, 11, 15, 0, 126, 0, 0, 1, 96, 347, 1, 0, 109, 0, 10, 4, 0, 126, 8, 0, 0, 0, 0, 27, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0, 0, 0, 488, 17, 94, 45, 109, 49, 2, 2, 37, 0, 20, 24, 40, 154, 77, 0, 34, 0, 1, 33, 0, 0, 0, 0, 0, 0, 1, 27, 12, 0, 0, 16, 0, 4, 45, 233, 40, 120, 213, 33, 191, 33, 88, 217, 485, 226, 277, 10, 16, 85, 269, 433, 0, 0, 170, 76, 96, 64, 16, 0, 0, 0, 0, 18, 124, 0, 0, 0, 0, 8, 16, 0, 0, 0, 0, 0, 3, 0, 8, 21, 2, 0, 115, 32, 34, 0, 2, 0, 15, 17, 5, 0, 0, 0, 0, 0, 0, 0, 0, 540, 0, 6, 0, 0, 43, 1, 0, 328, 0, 55, 63, 0, 42, 800, 0, 160, 40, 39, 152, 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 849, 3, 2, 34, 52, 501, 36, 1, 629, 0, 22, 25, 0, 37, 568, 5, 0, 166, 4, 125, 0, 0, 0, 6, 0, 0, 0, 69, 373, 0, 86, 164, 0, 248, 1, 0, 8, 0, 346, 0, 0, 234, 0, 12, 28, 1, 145, 330, 4, 6, 9, 0, 144, 0, 0, 1, 7, 0, 0, 0, 105, 7, 0, 0, 45, 0, 998, 0, 0, 1, 86, 878, 2, 30, 831, 0, 34, 0, 0, 258, 1016, 0, 6, 40, 139, 350, 1, 0, 20, 14, 0, 14, 0, 374, 77, 0, 8, 109, 0, 0, 395, 640, 446, 407, 157, 241, 123, 95, 310, 212, 633, 487, 707, 25, 108, 652, 587, 575, 6, 1, 71, 9, 157, 59, 35, 0, 0, 0, 1, 12, 61, 0, 122, 0, 0, 0, 0, 226, 0, 0, 63, 0, 6, 112, 0, 26, 909, 5, 636, 0, 4, 61, 0, 0, 0, 0, 0, 0, 0, 49, 1, 0, 0, 16, 0, 880, 4, 33, 12, 82, 654, 31, 5, 417, 0, 28, 11, 18, 111, 751, 9, 18, 29, 55, 260, 21, 21, 3, 8, 17, 2, 0, 159, 45, 0, 7, 98, 0, 239, 4, 131, 14, 21, 237, 7, 1, 111, 0, 278, 256, 86, 87, 279, 205, 17, 130, 960, 65, 3, 20, 2, 29, 4, 0, 1, 29, 317, 4, 10, 392, 0, 528, 4, 203, 0, 6, 510, 0, 0, 321, 0, 63, 36, 6, 90, 1026, 15, 248, 119, 3, 97, 0, 0, 1, 5, 1, 2, 0, 173, 553, 0, 1, 70, 0, 15, 51, 81, 80, 144, 13, 102, 32, 0, 9, 65, 227, 82, 26, 0, 65, 36, 117, 146, 0, 0, 39, 0, 77, 50, 24, 0, 0, 0, 1, 105, 6, 0, 50, 0, 0, 0, 0, 20, 0, 0, 29, 0, 0, 1, 0, 0, 27, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 28, 0, 20, 0, 0, 4, 0, 0, 16, 0, 0, 11, 11, 38, 212, 0, 15, 7, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 7, 0, 0, 59, 0, 0, 24, 0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 185, 0, 5, 0, 0, 226, 0, 0, 162, 0, 19, 0, 0, 94, 8, 0, 1, 0, 268, 44, 0, 0, 0, 0, 15, 0, 0, 0, 24, 0, 0, 0, 0, 66, 0, 2, 0, 0, 162, 0, 0, 155, 0, 53, 40, 0, 19, 19, 0, 1, 0, 2, 18, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0, 42, 0, 0, 0, 0, 137, 0, 0, 88, 0, 0, 0, 0, 10, 0, 0, 1, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 33, 75, 14, 10, 117, 6, 9, 0, 157, 11, 167, 130, 9, 0, 12, 25, 57, 76, 0, 0, 109, 0, 16, 34, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 1, 0, 27, 0, 22, 2, 0, 82, 0, 9, 129, 0, 0, 0, 104, 3, 0, 0, 0, 15, 13, 56, 0, 0, 0, 0, 0, 42, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 2, 6, 0, 0, 0, 0, 149, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 3, 46, 0, 26, 1, 0, 0, 0, 0, 2, 10, 0, 0, 1, 11, 43, 0, 0, 0, 0, 5, 0, 47, 0, 0, 0, 0, 2, 0, 0, 0, 0, 44, 14, 69, 10, 17, 23, 4, 19, 7, 80, 55, 57, 0, 0, 11, 63, 100, 0, 0, 21, 1, 6, 2, 39, 0, 0, 0, 0, 12, 9, ); sub detect_charset { my ($msv,$enc)=(0,"WIN"); my @chars=map { ord } split(//,$_[0]); return "utf-16" if $chars[0]==0xff && $chars[1]==0xfe; local $^W=0; for (@letter_maps) { my $map=$_->[1]; my $last=0; my @dt; ++$dt[$last*33+$map->[$_]],$last=$map->[$_] for @chars; my $sum=0; $sum+=$dt[$_]*$distrib[$_] for 0..33*33-1; $enc=$_->[0],$msv=$sum if $sum>$msv; } $enc; } sub conv_charset { my $cs=detect_charset(substr($_[0],0,1024)); if ($cs eq "KOI") { $_[0] =~ tr/\200-\377/\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\250\272\245\263\277\250\251\252\253\254\255\256\257\260\261\262\250\252\265\262\257\270\271\272\273\274\275\276\277\376\340\341\366\344\345\364\343\365\350\351\352\353\354\355\356\357\377\360\361\362\363\346\342\374\373\347\370\375\371\367\372\336\300\301\326\304\305\324\303\325\310\311\312\313\314\315\316\317\337\320\321\322\323\306\302\334\333\307\330\335\331\327\332/; } elsif ($cs eq "DOS") { $_[0] =~ tr/\200-\377/\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377/; } elsif ($cs eq "ISO") { $_[0] =~ tr/\200-\377/\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377/; } elsif ($cs eq "utf-16") { eval { require Unicode::String; $_[0]=Unicode::String::utf16($_[0])->latin1; }; } } 1;