package T2X;
use strict;
sub remws_inp { # remove whitespace
$_[0] =~ tr/\000-\040/ /s;
$_[0] =~ s/^ //;
$_[0] =~ s/ $//;
}
sub remws { # remove whitespace
no warnings;
my $tmp=shift;
$tmp =~ tr/\000-\040/ /s;
$tmp =~ s/^ //;
$tmp =~ s/ $//;
$tmp;
}
sub escape { # xml-escape
no warnings;
my $p=shift;
my $h=shift;
$p =~ s/&/&/g;
if (!$h) {
$p =~ s/</g;
$p =~ s/"/"/g;
$p =~ s/>/>/g;
}
$p;
}
sub conv_quotes { # nicely convert possibly nested quotes
local $^W=0;
my ($l,$r)=("\x93","\x94");
($l,$r)=("\xab","\xbb") if $_[0] =~ tr/\200-\377//;
my $t="";
my $nest=0;
my $last=0;
while ($_[0] =~ /((?<=[^ ])")|(?:"(?=[^ ,.]))/sgc) {
if (length($1)) { # close
return if $nest<=0;
$t.=substr($_[0],$last,pos($_[0])-$last-1);
$t.=$r;
--$nest;
} else { #open
return if $nest>1;
$t.=substr($_[0],$last,pos($_[0])-$last-1);
$t.=$l;
++$nest;
}
$last=pos($_[0]);
}
if ($nest==0) { # properly balanced
$t.=substr($_[0],pos($_[0]));
$_[0]=$t;
}
}
sub beautify { # beautify a paragraph
my $p=shift;
my $h=shift;
$p =~ s/^- /– /;
conv_quotes($p);
$p =~ s/\.\.\./…/g;
if (!$h) {
$p =~ s/&/&/g;
$p =~ s/</g;
$p =~ s/"/"/g;
$p =~ s/>/>/g;
}
$p;
}
sub make_xml { # convert a book into xml
no warnings;
my $book=shift;
my $max=shift;
my $xml="";
my @au=split(' ',escape(remws($book->{author})));
if (1==@au) {
unshift(@au,"","");
} elsif (2==@au) {
splice(@au,1,0,"");
} elsif (3<@au) {
@au=("","",escape(remws($book->{author})));
}
my $title=escape(remws($book->{title}));
$xml .= qq(\n\n);
$xml .= qq(\n);
$xml .= qq( \n);
$xml .= qq( \n);
$xml .= qq( \n);
$xml .= qq( $au[0]\n);
$xml .= qq( $au[1]\n);
$xml .= qq( $au[2]\n);
$xml .= qq( \n);
$xml .= qq( $title\n);
$xml .= qq( );
$xml .= escape(join("",map { "" . remws($_) . "
" }
split (/\n/,$book->{annotation})),1);
$xml .= qq( \n);
$xml .= qq( \n);
$xml .= qq( \n);
$xml .= qq( \n);
for (@{$book->{body}}) {
$xml .= qq( \n);
$xml .= qq( );
$xml .= escape($_->{title});
$xml .= qq("
\n);
for (@{$_->{text}}) {
my $tmp=beautify($_,$book->{html});
if (length($tmp)) {
if ($tmp =~ /^\s*\*\s*\*\s*\*\s*$/) {
$xml .= qq( * * *\n);
} else {
$xml .= qq( );
$xml .= $tmp;
$xml .= qq(
\n);
}
} else {
$xml .= qq( \n);
}
last if defined($max) && length($xml)>=$max;
}
$xml .= qq( \n);
last if defined($max) && length($xml)>=$max;
}
$xml .= qq( \n);
$xml .= qq(\n);
$xml;
}
sub replace_tag {
my ($tag,$end)=@_;
return "<${end}emphasis>" if $tag eq "i";
return "<${end}strong>" if $tag eq "b";
if ($tag =~ /^h[0-9]$/ ) {
return "\n <$tag>" if !length($end);
return "$tag>\n ";
}
return "
" if $tag eq "hr";
""; # delete all other tags
}
my %html_entities=(
"iexcl" => "¡",
"cent" => "¢",
"pound" => "£",
"curren" => "¤",
"yen" => "¥",
"brvbar" => "¦",
"sect" => "§",
"uml" => "¨",
"copy" => "©",
"ordf" => "ª",
"laquo" => "«",
"not" => "¬",
"shy" => "",
"reg" => "®",
"macr" => "¯",
"deg" => "°",
"plusmn" => "±",
"sup2" => "²",
"sup3" => "³",
"acute" => "´",
"micro" => "µ",
"para" => "¶",
"middot" => "·",
"cedil" => "¸",
"sup1" => "¹",
"ordm" => "º",
"raquo" => "»",
"frac14" => "¼",
"frac12" => "½",
"frac34" => "¾",
"iquest" => "¿",
"Agrave" => "À",
"Aacute" => "Á",
"Acirc" => "Â",
"Atilde" => "Ã",
"Auml" => "Ä",
"Aring" => "Å",
"AElig" => "Æ",
"Ccedil" => "Ç",
"Egrave" => "È",
"Eacute" => "É",
"Ecirc" => "Ê",
"Euml" => "Ë",
"Igrave" => "Ì",
"Iacute" => "Í",
"Icirc" => "Î",
"Iuml" => "Ï",
"ETH" => "Ð",
"Ntilde" => "Ñ",
"Ograve" => "Ò",
"Oacute" => "Ó",
"Ocirc" => "Ô",
"Otilde" => "Õ",
"Ouml" => "Ö",
"times" => "×",
"Oslash" => "Ø",
"Ugrave" => "Ù",
"Uacute" => "Ú",
"Ucirc" => "Û",
"Uuml" => "Ü",
"Yacute" => "Ý",
"THORN" => "Þ",
"szlig" => "ß",
"agrave" => "à",
"aacute" => "á",
"acirc" => "â",
"atilde" => "ã",
"auml" => "ä",
"aring" => "å",
"aelig" => "æ",
"ccedil" => "ç",
"egrave" => "è",
"eacute" => "é",
"ecirc" => "ê",
"euml" => "ë",
"igrave" => "ì",
"iacute" => "í",
"icirc" => "î",
"iuml" => "ï",
"eth" => "ð",
"ntilde" => "ñ",
"ograve" => "ò",
"oacute" => "ó",
"ocirc" => "ô",
"otilde" => "õ",
"ouml" => "ö",
"divide" => "÷",
"oslash" => "ø",
"ugrave" => "ù",
"uacute" => "ú",
"ucirc" => "û",
"uuml" => "ü",
"yacute" => "ý",
"thorn" => "þ",
"yuml" => "ÿ",
"fnof" => "ƒ",
"Alpha" => "Α",
"Beta" => "Β",
"Gamma" => "Γ",
"Delta" => "Δ",
"Epsilon" => "Ε",
"Zeta" => "Ζ",
"Eta" => "Η",
"Theta" => "Θ",
"Iota" => "Ι",
"Kappa" => "Κ",
"Lambda" => "Λ",
"Mu" => "Μ",
"Nu" => "Ν",
"Xi" => "Ξ",
"Omicron" => "Ο",
"Pi" => "Π",
"Rho" => "Ρ",
"Sigma" => "Σ",
"Tau" => "Τ",
"Upsilon" => "Υ",
"Phi" => "Φ",
"Chi" => "Χ",
"Psi" => "Ψ",
"Omega" => "Ω",
"alpha" => "α",
"beta" => "β",
"gamma" => "γ",
"delta" => "δ",
"epsilon" => "ε",
"zeta" => "ζ",
"eta" => "η",
"theta" => "θ",
"iota" => "ι",
"kappa" => "κ",
"lambda" => "λ",
"mu" => "μ",
"nu" => "ν",
"xi" => "ξ",
"omicron" => "ο",
"pi" => "π",
"rho" => "ρ",
"sigmaf" => "ς",
"sigma" => "σ",
"tau" => "τ",
"upsilon" => "υ",
"phi" => "φ",
"chi" => "χ",
"psi" => "ψ",
"omega" => "ω",
"thetasym" => "ϑ",
"upsih" => "ϒ",
"piv" => "ϖ",
"bull" => "•",
"hellip" => "…",
"prime" => "′",
"Prime" => "″",
"oline" => "‾",
"frasl" => "⁄",
"weierp" => "℘",
"image" => "ℑ",
"real" => "ℜ",
"trade" => "™",
"alefsym" => "ℵ",
"larr" => "←",
"uarr" => "↑",
"rarr" => "→",
"darr" => "↓",
"harr" => "↔",
"crarr" => "↵",
"lArr" => "⇐",
"uArr" => "⇑",
"rArr" => "⇒",
"dArr" => "⇓",
"hArr" => "⇔",
"forall" => "∀",
"part" => "∂",
"exist" => "∃",
"empty" => "∅",
"nabla" => "∇",
"isin" => "∈",
"notin" => "∉",
"ni" => "∋",
"prod" => "∏",
"sum" => "∑",
"minus" => "−",
"lowast" => "∗",
"radic" => "√",
"prop" => "∝",
"infin" => "∞",
"ang" => "∠",
"and" => "∧",
"or" => "∨",
"cap" => "∩",
"cup" => "∪",
"int" => "∫",
"there4" => "∴",
"sim" => "∼",
"cong" => "≅",
"asymp" => "≈",
"ne" => "≠",
"equiv" => "≡",
"le" => "≤",
"ge" => "≥",
"sub" => "⊂",
"sup" => "⊃",
"nsub" => "⊄",
"sube" => "⊆",
"supe" => "⊇",
"oplus" => "⊕",
"otimes" => "⊗",
"perp" => "⊥",
"sdot" => "⋅",
"lceil" => "⌈",
"rceil" => "⌉",
"lfloor" => "⌊",
"rfloor" => "⌋",
"lang" => "〈",
"rang" => "〉",
"loz" => "◊",
"spades" => "♠",
"clubs" => "♣",
"hearts" => "♥",
"diams" => "♦",
"quot" => "\"",
"amp" => "&",
"lt" => "<",
"gt" => ">",
"OElig" => "Œ",
"oelig" => "œ",
"Scaron" => "Š",
"scaron" => "š",
"Yuml" => "Ÿ",
"circ" => "ˆ",
"tilde" => "˜",
"ensp" => " ",
"emsp" => " ",
"thinsp" => " ",
"zwnj" => "",
"zwj" => "",
"lrm" => "",
"rlm" => "",
"ndash" => "–",
"mdash" => "—",
"lsquo" => "‘",
"rsquo" => "’",
"sbquo" => "‚",
"ldquo" => "“",
"rdquo" => "”",
"bdquo" => "„",
"dagger" => "†",
"Dagger" => "‡",
"permil" => "‰",
"lsaquo" => "‹",
"rsaquo" => "›",
"euro" => "€",
"nbsp" => " "
);
sub replace_ent {
my $ent=shift;
return $html_entities{$ent} if $html_entities{$ent};
return "&$ent;" if $ent =~ /^#\d+$/ || $ent =~ /^#x[\dA-Fa-f]+$/;
return "&" if !length($ent);
"";
}
sub load_file {
my $infile=shift;
my $text;
my $fmt=shift;
my $info='';
# load file
if (defined($infile)) {
open(SRC,"< $infile") || die "Can't open '$infile': $!\n";
binmode(SRC);
{ local $/=undef; $text=; }
close(SRC);
} else {
binmode(STDIN);
{ local $/=undef; $text=; }
}
$info=length($text) . " byte(s)";
conv_charset($text);
my $html=0;
# check moshkow's formatting
if ($text =~ /\x14/) {
$text =~ s|\x14 \*([^\n]*)\* \x15|$1
|sg;
$text =~ s|\x14([^\n]*)\x15|$1
|sg;
$html = 1;
}
# remove msdos garbage and trailing whitespace
$text =~ tr/\r//d;
$text =~ tr/\000-\011\013-\037\xa0/ /;
$text =~ s/ +$//mg;
# check if it is some crappy html
$html||= $text =~ //i;
my $anno="";
my $title="";
my $author="";
if ($html) {
# delete most of the garbage
# all before title
$text =~ s/^.*?(?=)//s;
# all past the end of text
my $tmp=rindex($text,"
]*>=replace_tag($2,$1)=sge;
$text =~ s/&(([^;]{1,15});)?/replace_ent($2)/sge;
$text =~ s=\s+= =g; # remove empty space
$text =~ s=\s+= =g; # remove empty space
# convert runs of --- to hr
$text =~ s/-{5,}/
/sg;
# stuff between first two hr is moved to annotation
$anno=join("\n",map { remws($_) } grep { length } split(/\n/,$1))
if $text =~ s/
(.*?)
//s;
# first becomes the title
$title=$1 if $text =~ s/([^<]*)<\/h[0-9]>//s;
$author=$1 if $title =~ s/(.*\.)(.*)/$2/;
$author =~ s/(?s
$text =~ s/
//sg;
remws_inp($title); remws_inp($author);
$info.=", html";
}
# count total number of lines
my $nlines= $text =~ tr/\n//;
# check if the whole file is indented and remove that extra indentation
my @indent;
++$indent[length($1)] while $text =~ /^(\s+)/mg;
for (my $space=1;$space<@indent;++$space) {
no warnings;
if ($indent[$space]>$nlines*0.87) {
$text =~ s/^\s{1-$space}//mg;
splice(@indent,0,$space+1);
last;
}
}
# try to guess file format
# check remaining indentation
if (!defined($fmt)) {
no warnings;
my $ilines=0;
$ilines+=$_ for @indent;
$info.=", IL=$ilines, NL=$nlines";
if ($ilines<$nlines*0.03) {
$fmt=1;
} elsif ($ilines>$nlines*0.08) {
$fmt=0;
} else { # shold really ask user here
$fmt=1;
}
}
$info.=", FMT=$fmt";
# split the whole thing into paragraphs
my @plist;
if ($fmt==3) {
@plist=split(/\n\s*\n\s*\n\s*\n/s,$text);
} if ($fmt==2) {
@plist=split(/\n\n/s,$text);
} elsif ($fmt==1) {
@plist=split(/\n/s,$text);
} else {
@plist=split(/(?:\n )|(?:\n(?=\n))/s,$text);
}
# delete unneeded whitespace
remws_inp($_) for @plist;
# try to find chapter headings
my $max=$fmt ? 50 : 120;
my @chlist;
my $curch=[];
my $curtitle="";
push(@plist,"");
if ($html) {
for (my $i=0;$i<@plist-1;++$i) {
if ($plist[$i] =~ m=^(.*)$=) {
# this seems to be a chapter heading
if (@$curch) {
push(@chlist,{ 'title' => $curtitle, 'text' => $curch});
$curch=[];
}
$curtitle=$1;
$curtitle =~ s/<[^>]*>//g; # remove markup from titles
} else {
push(@$curch,$plist[$i]);
}
}
} else {
if ($fmt==2) {
for (my $i=0;$i<@plist;++$i) {
if ($plist[$i] !~ /^\s*\*\s*\*\s*\*\s*$/ && $plist[$i] =~ /^(chapter|section|book)/i) {
if (@$curch) {
push(@chlist,{ 'title' => $curtitle, 'text' => $curch});
$curch=[];
}
$curtitle=$plist[$i];
} else {
push(@$curch,$plist[$i]);
}
}
} else {
for (my $i=0;$i<@plist-1;++$i) {
if (!length($plist[$i-1]) && !length($plist[$i+1]) &&
length($plist[$i])>0 && length($plist[$i])<$max && $plist[$i] !~ /^\s*\*\s*\*\s*\*\s*$/)
{
# this seems to be a chapter heading
if (@$curch) {
push(@chlist,{ 'title' => $curtitle, 'text' => $curch});
$curch=[];
}
$curtitle=$plist[$i];
} else {
push(@$curch,$plist[$i]);
}
}
}
}
push(@chlist,{ 'title' => $curtitle, 'text' => $curch});
# remove empty paragraphs
for my $chap (@chlist) {
for (my $i=0;$i<@{$chap->{text}};++$i) {
if (!length($chap->{text}[$i])) {
my $j;
for ($j=1;$i+$j<@{$chap->{text}} && !length($chap->{text}[$i+$j]);++$j) { }
splice(@{$chap->{text}},$i,$j-1) if $j>1;
}
}
shift(@{$chap->{text}}) if @{$chap->{text}} && !length($chap->{text}[0]);
pop(@{$chap->{text}}) if @{$chap->{text}} && !length($chap->{text}[-1]);
}
# delete empty chapters
@chlist=grep { length($_->{title}) || @{$_->{text}} } @chlist;
$info.=", CH=" . ($#chlist+1);
# return the whole thing now
my %book;
$book{title}=$title;
$book{annotation}=$anno;
$book{author}=$author;
$book{body}=\@chlist;
$book{file}=$infile;
$book{html}=$html;
$book{info}=$info;
$book{fmt}=$fmt;
\%book;
}
##########################################
# charset support
my @letter_maps=( ["WIN", [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
0x1e, 0x1f, 0x20, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
]], ["DOS" , [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02,
0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c,
0x1d, 0x1e, 0x1f, 0x20, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a,
0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
]], ["KOI", [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x01, 0x02,
0x17, 0x05, 0x06, 0x15, 0x04, 0x16, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x20, 0x11, 0x12, 0x13, 0x14, 0x07, 0x03, 0x1d, 0x1c, 0x08, 0x19, 0x1e,
0x1a, 0x18, 0x1b, 0x1f, 0x01, 0x02, 0x17, 0x05, 0x06, 0x15, 0x04, 0x16, 0x09,
0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x20, 0x11, 0x12, 0x13, 0x14, 0x07,
0x03, 0x1d, 0x1c, 0x08, 0x19, 0x1e, 0x1a, 0x18, 0x1b,
]], ["ISO", [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]]);
my @distrib=(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 77, 259, 55, 298, 90, 111, 296, 12, 96, 384, 709,
284, 340, 1, 101, 381, 393, 422, 4, 1, 88, 6, 84, 49, 32, 0, 0, 0, 0, 86, 181,
0, 52, 0, 4, 0, 1, 180, 2, 0, 91, 0, 18, 86, 3, 45, 237, 0, 196, 13, 2, 73, 0,
3, 0, 0, 2, 18, 11, 271, 2, 0, 0, 36, 0, 416, 1, 7, 0, 20, 530, 0, 65, 283, 0,
12, 80, 9, 158, 548, 19, 63, 254, 15, 73, 0, 3, 2, 4, 93, 0, 0, 220, 16, 0, 0,
22, 0, 98, 0, 0, 0, 109, 30, 0, 0, 58, 0, 15, 194, 0, 59, 616, 0, 83, 0, 1, 49,
0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 412, 3, 76, 1, 5, 433, 40, 0, 214, 0, 29,
63, 2, 252, 358, 9, 186, 22, 10, 132, 0, 5, 7, 4, 3, 0, 0, 60, 42, 0, 2, 21, 0,
5, 102, 200, 260, 255, 140, 83, 113, 9, 281, 170, 544, 400, 776, 18, 84, 631,
427, 403, 10, 0, 43, 31, 105, 85, 55, 0, 0, 0, 0, 11, 15, 0, 126, 0, 0, 1, 96,
347, 1, 0, 109, 0, 10, 4, 0, 126, 8, 0, 0, 0, 0, 27, 0, 0, 0, 6, 0, 0, 0, 0, 3,
0, 0, 0, 0, 488, 17, 94, 45, 109, 49, 2, 2, 37, 0, 20, 24, 40, 154, 77, 0, 34,
0, 1, 33, 0, 0, 0, 0, 0, 0, 1, 27, 12, 0, 0, 16, 0, 4, 45, 233, 40, 120, 213,
33, 191, 33, 88, 217, 485, 226, 277, 10, 16, 85, 269, 433, 0, 0, 170, 76, 96,
64, 16, 0, 0, 0, 0, 18, 124, 0, 0, 0, 0, 8, 16, 0, 0, 0, 0, 0, 3, 0, 8, 21, 2,
0, 115, 32, 34, 0, 2, 0, 15, 17, 5, 0, 0, 0, 0, 0, 0, 0, 0, 540, 0, 6, 0, 0, 43,
1, 0, 328, 0, 55, 63, 0, 42, 800, 0, 160, 40, 39, 152, 0, 0, 3, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 849, 3, 2, 34, 52, 501, 36, 1, 629, 0, 22, 25, 0, 37, 568, 5, 0,
166, 4, 125, 0, 0, 0, 6, 0, 0, 0, 69, 373, 0, 86, 164, 0, 248, 1, 0, 8, 0, 346,
0, 0, 234, 0, 12, 28, 1, 145, 330, 4, 6, 9, 0, 144, 0, 0, 1, 7, 0, 0, 0, 105,
7, 0, 0, 45, 0, 998, 0, 0, 1, 86, 878, 2, 30, 831, 0, 34, 0, 0, 258, 1016, 0,
6, 40, 139, 350, 1, 0, 20, 14, 0, 14, 0, 374, 77, 0, 8, 109, 0, 0, 395, 640,
446, 407, 157, 241, 123, 95, 310, 212, 633, 487, 707, 25, 108, 652, 587, 575,
6, 1, 71, 9, 157, 59, 35, 0, 0, 0, 1, 12, 61, 0, 122, 0, 0, 0, 0, 226, 0, 0,
63, 0, 6, 112, 0, 26, 909, 5, 636, 0, 4, 61, 0, 0, 0, 0, 0, 0, 0, 49, 1, 0, 0,
16, 0, 880, 4, 33, 12, 82, 654, 31, 5, 417, 0, 28, 11, 18, 111, 751, 9, 18, 29,
55, 260, 21, 21, 3, 8, 17, 2, 0, 159, 45, 0, 7, 98, 0, 239, 4, 131, 14, 21, 237,
7, 1, 111, 0, 278, 256, 86, 87, 279, 205, 17, 130, 960, 65, 3, 20, 2, 29, 4, 0,
1, 29, 317, 4, 10, 392, 0, 528, 4, 203, 0, 6, 510, 0, 0, 321, 0, 63, 36, 6, 90,
1026, 15, 248, 119, 3, 97, 0, 0, 1, 5, 1, 2, 0, 173, 553, 0, 1, 70, 0, 15, 51,
81, 80, 144, 13, 102, 32, 0, 9, 65, 227, 82, 26, 0, 65, 36, 117, 146, 0, 0, 39,
0, 77, 50, 24, 0, 0, 0, 1, 105, 6, 0, 50, 0, 0, 0, 0, 20, 0, 0, 29, 0, 0, 1, 0,
0, 27, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 28, 0, 20, 0, 0,
4, 0, 0, 16, 0, 0, 11, 11, 38, 212, 0, 15, 7, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 50, 0, 7, 0, 0, 59, 0, 0, 24, 0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 9, 0,
0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 185, 0, 5, 0, 0, 226, 0, 0, 162, 0, 19, 0,
0, 94, 8, 0, 1, 0, 268, 44, 0, 0, 0, 0, 15, 0, 0, 0, 24, 0, 0, 0, 0, 66, 0, 2,
0, 0, 162, 0, 0, 155, 0, 53, 40, 0, 19, 19, 0, 1, 0, 2, 18, 0, 0, 0, 0, 0, 0,
0, 0, 58, 0, 0, 0, 0, 42, 0, 0, 0, 0, 137, 0, 0, 88, 0, 0, 0, 0, 10, 0, 0, 1,
0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 33, 75,
14, 10, 117, 6, 9, 0, 157, 11, 167, 130, 9, 0, 12, 25, 57, 76, 0, 0, 109, 0,
16, 34, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 1, 0, 27, 0, 22, 2, 0, 82, 0, 9, 129,
0, 0, 0, 104, 3, 0, 0, 0, 15, 13, 56, 0, 0, 0, 0, 0, 42, 27, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 3, 0, 2, 6, 0, 0, 0, 0, 149, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 25, 0, 3, 46, 0, 26, 1, 0, 0, 0, 0, 2, 10, 0, 0, 1, 11, 43, 0, 0,
0, 0, 5, 0, 47, 0, 0, 0, 0, 2, 0, 0, 0, 0, 44, 14, 69, 10, 17, 23, 4, 19, 7,
80, 55, 57, 0, 0, 11, 63, 100, 0, 0, 21, 1, 6, 2, 39, 0, 0, 0, 0, 12, 9,
);
sub detect_charset {
my ($msv,$enc)=(0,"WIN");
my @chars=map { ord } split(//,$_[0]);
return "utf-16" if $chars[0]==0xff && $chars[1]==0xfe;
local $^W=0;
for (@letter_maps) {
my $map=$_->[1];
my $last=0;
my @dt;
++$dt[$last*33+$map->[$_]],$last=$map->[$_] for @chars;
my $sum=0;
$sum+=$dt[$_]*$distrib[$_] for 0..33*33-1;
$enc=$_->[0],$msv=$sum if $sum>$msv;
}
$enc;
}
sub conv_charset {
my $cs=detect_charset(substr($_[0],0,1024));
if ($cs eq "KOI") {
$_[0] =~ tr/\200-\377/\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\250\272\245\263\277\250\251\252\253\254\255\256\257\260\261\262\250\252\265\262\257\270\271\272\273\274\275\276\277\376\340\341\366\344\345\364\343\365\350\351\352\353\354\355\356\357\377\360\361\362\363\346\342\374\373\347\370\375\371\367\372\336\300\301\326\304\305\324\303\325\310\311\312\313\314\315\316\317\337\320\321\322\323\306\302\334\333\307\330\335\331\327\332/;
} elsif ($cs eq "DOS") {
$_[0] =~ tr/\200-\377/\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377/;
} elsif ($cs eq "ISO") {
$_[0] =~ tr/\200-\377/\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377/;
} elsif ($cs eq "utf-16") {
eval {
require Unicode::String;
$_[0]=Unicode::String::utf16($_[0])->latin1;
};
}
}
1;