diff --git a/Makefile.PL b/Makefile.PL index 8b3932e63b..a207c07b4b 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -59,6 +59,7 @@ WriteMakefile(NAME => 'LaTeXML', PREREQ_PM => { 'Archive::Zip' => 0, 'DB_File' => 0, + 'Encode::Locale' => 0, 'File::Which' => 0, 'Getopt::Long' => 2.37, 'Image::Size' => 0, diff --git a/bin/latexml b/bin/latexml index a7a8ea0a2b..f48bbb23c1 100755 --- a/bin/latexml +++ b/bin/latexml @@ -22,6 +22,10 @@ use LaTeXML::Util::Pathname; use LaTeXML::Common::Error; use Encode; +# decode @ARGV before further processing +use Encode::Locale; +Encode::Locale::decode_argv(Encode::FB_CROAK); + #********************************************************************** # Parse command line my ($verbosity, $strict, $comments, $noparse, $includestyles) = (0, 0, 1, 0, 0); diff --git a/bin/latexmlc b/bin/latexmlc index c3e6232fe0..e0c1b89449 100755 --- a/bin/latexmlc +++ b/bin/latexmlc @@ -41,6 +41,10 @@ use HTTP::Response; use HTTP::Request; use JSON::XS qw(decode_json); +# decode @ARGV before further processing +use Encode::Locale; +Encode::Locale::decode_argv(Encode::FB_CROAK); + # Determine if a socket server is installed locally and obtain its pathname: my $latexmls; $latexmls = catfile($RealBin_safe, 'latexmls') if (-e catfile($RealBin_safe, 'latexmls')); diff --git a/bin/latexmlfind b/bin/latexmlfind index 1ea8a76603..29d522ddf9 100755 --- a/bin/latexmlfind +++ b/bin/latexmlfind @@ -20,6 +20,10 @@ use LaTeXML::Common::XML; use Text::Wrap; use LaTeXML; # Currently, just for version information. +# decode @ARGV before further processing +use Encode::Locale; +Encode::Locale::decode_argv(Encode::FB_CROAK); + #********************************************************************** # Parse command line my ($verbosity) = (0); @@ -125,7 +129,7 @@ sub collect_matches { my ($description, $xpath) = @_; my @nodes = $XPATH->findnodes($xpath, $DOC); print "Query $description appears in " . scalar(@nodes) . " places\n" if $verbosity > -1; - print " [XPath = \"$xpath\"]\n" if $verbosity > 0; + print " [XPath = \"$xpath\"]\n" if $verbosity > 0; foreach my $node (@nodes) { my $object = id_object($node); push(@{ $$object{items} }, $node); } @@ -144,9 +148,9 @@ sub id_object { my $parent_object = id_object($node); my $type = $node->localname; my $labels = $node->getAttribute('labels'); - my ($refnum) = $XPATH->findnodes("child::ltx:tags/ltx:tag[\@refnum]/text()", $node); - my ($title) = $XPATH->findnodes("child::ltx:toctitle | child::ltx:title", $node); - my $desc = ($refnum ? ($title ? "$refnum. " . $title->textContent : $refnum) + my ($refnum) = $XPATH->findnodes("child::ltx:tags/ltx:tag[\@refnum]/text()", $node); + my ($title) = $XPATH->findnodes("child::ltx:toctitle | child::ltx:title", $node); + my $desc = ($refnum ? ($title ? "$refnum. " . $title->textContent : $refnum) : ($title ? $title->textContent : '')); $desc =~ s/\s+/ /g; $OBJECTS{$id} = $object diff --git a/bin/latexmlmath b/bin/latexmlmath index 4d58f76fd1..0c721a5049 100755 --- a/bin/latexmlmath +++ b/bin/latexmlmath @@ -27,6 +27,10 @@ use LaTeXML::Post::CrossRef; use LaTeXML::Util::ObjectDB; use LaTeXML::Common::Error; +# decode @ARGV before further processing +use Encode::Locale; +Encode::Locale::decode_argv(Encode::FB_CROAK); + #********************************************************************** # Parse command line diff --git a/bin/latexmlpost b/bin/latexmlpost index 690fdead42..36eee17a6e 100755 --- a/bin/latexmlpost +++ b/bin/latexmlpost @@ -25,6 +25,10 @@ use LaTeXML::Post::Scan; use LaTeXML::Util::Pathname; use LaTeXML::Util::ObjectDB; +# decode @ARGV before further processing +use Encode::Locale; +Encode::Locale::decode_argv(Encode::FB_CROAK); + #====================================================================== # Parse command line. #====================================================================== diff --git a/lib/LaTeXML/Util/Pathname.pm b/lib/LaTeXML/Util/Pathname.pm index 1c6fe69336..3f9cfe904e 100644 --- a/lib/LaTeXML/Util/Pathname.pm +++ b/lib/LaTeXML/Util/Pathname.pm @@ -32,6 +32,8 @@ use File::Spec; use File::Copy; use File::Which; use Cwd; +use Encode; +use Encode::Locale; use base qw(Exporter); our @EXPORT = qw( &pathname_find &pathname_findall &pathname_kpsewhich &pathname_make &pathname_canonical @@ -51,6 +53,12 @@ my $ISWINDOWS; BEGIN { $ISWINDOWS = $^O =~ /^(MSWin|NetWare|cygwin)/i; require Win32::ShellQuote if $ISWINDOWS; + + # configure perl to call I/O APIs using the current codepage rather than utf-8 + # NOTE: this is only a stopgap measure; creating a file with a name that does + # not fit the codepage may create a file with a similar looking but different name + require POSIX; + POSIX::setlocale(&POSIX::LC_ALL, '.ACP') if $ISWINDOWS; } # NOTE: For absolute pathnames, the directory component starts with @@ -219,17 +227,28 @@ sub pathname_to_url { $relative_pathname = join('/', split(/\Q$SEP\E/, $relative_pathname)); } return $relative_pathname; } +#====================================================================== +# Locale encoding operations, to be used in file system operations. +sub pathname_decode { + my ($pathname) = @_; + return $ISWINDOWS ? $pathname : decode('locale_fs', $pathname, Encode::FB_CROAK); } + +sub pathname_encode { + my ($pathname) = @_; + return $ISWINDOWS ? $pathname : encode('locale_fs', $pathname, Encode::FB_CROAK); } + #====================================================================== # Actual file system operations. + sub pathname_timestamp { - my ($pathname) = @_; + my $pathname = pathname_encode($_[0]); return -f $pathname ? (stat($pathname))[9] : 0; } our $CWD = undef; # DO NOT use pathname_cwd, unless you also use pathname_chdir to change dirs!!! sub pathname_cwd { if (!defined $CWD) { - if (my $cwd = cwd()) { + if (my $cwd = pathname_decode(cwd())) { $CWD = pathname_canonical($cwd); } else { # Fatal not imported @@ -238,7 +257,7 @@ sub pathname_cwd { return $CWD; } sub pathname_chdir { - my ($directory) = @_; + my $directory = pathname_encode($_[0]); chdir($directory); pathname_cwd(); # RE-cache $CWD! return; } @@ -250,7 +269,7 @@ sub pathname_mkdir { my ($volume, $dirs, $last) = File::Spec->splitpath($directory); my (@dirs) = (File::Spec->splitdir($dirs), $last); for (my $i = 0 ; $i <= $#dirs ; $i++) { - my $dir = File::Spec->catpath($volume, File::Spec->catdir(@dirs[0 .. $i]), ''); + my $dir = pathname_encode(File::Spec->catpath($volume, File::Spec->catdir(@dirs[0 .. $i]), '')); if (!-d $dir) { mkdir($dir) or return; } } return $directory; } @@ -262,7 +281,8 @@ sub pathname_copy { # If it _needs_ to be copied: $source = pathname_canonical($source); $destination = pathname_canonical($destination); - if ((!-f $destination) || (pathname_timestamp($source) > pathname_timestamp($destination))) { + my ($enc_source, $enc_destination) = (pathname_encode($source), pathname_encode($destination)); + if ((!-f $enc_destination) || (pathname_timestamp($source) > pathname_timestamp($destination))) { if (my $destdir = pathname_directory($destination)) { pathname_mkdir($destdir) or return; } ### if($^O =~ /^(MSWin32|NetWare)$/){ # Windows @@ -271,9 +291,9 @@ sub pathname_copy { ### else { # Unix ### system("cp --preserve=timestamps $source $destination")==0 or return; } # Hopefully this portably copies, preserving timestamp. - copy($source, $destination) or return; - my ($atime, $mtime) = (stat($source))[8, 9]; - utime $atime, $mtime, $destination; # And set the modification time + copy($enc_source, $enc_destination) or return; + my ($atime, $mtime) = (stat($enc_source))[8, 9]; + utime $atime, $mtime, $enc_destination; # And set the modification time } return $destination; } @@ -373,10 +393,11 @@ sub candidate_pathnames { qr/^\Q$name\E\Q$ext\E$/]); } } # Now, combine; precedence to leading directories. foreach my $dir (@dirs) { - opendir(DIR, $dir) or next; + opendir(DIR, pathname_encode($dir)) or next; my @dir_files = readdir(DIR); closedir(DIR); for my $local_file (@dir_files) { + $local_file = pathname_decode($local_file); for my $regex_pair (@regexes) { my ($i_regex, $regex) = @$regex_pair; if ($local_file =~ m/$i_regex/) { @@ -429,11 +450,11 @@ sub build_kpse_cache { # texpaths: the directories which contain the TeX related files we're interested in #. (but they're typically below where the ls-R indexes are!) my ($texmf,$texpaths) = split("\n", - `"$kpsewhich" --expand-var \'\\\$TEXMF\' --show-path tex $kpse_toolchain`); + `"$kpsewhich" --expand-var \'\\\$TEXMF\' --show-path tex $kpse_toolchain`); my @filters = (); # Really shouldn't end up empty. foreach my $path (split(/$KPATHSEP/, $texpaths)) { $path =~ s/^!!//; $path =~ s|//+$|/|; - push(@filters, $path) if -d $path; } + push(@filters, $path) if -d pathname_encode($path); } my $filterre = scalar(@filters) && '(?:' . join('|', map { "\Q$_\E"; } @filters) . ')'; $texmf =~ s/^["']//; $texmf =~ s/["']$//; $texmf =~ s/^\s*\\\{(.+?)}\s*$/$1/s; @@ -441,12 +462,13 @@ sub build_kpse_cache { my @dirs = split(/,/, $texmf); foreach my $dir (@dirs) { $dir =~ s/^!!//; + my $enc_dir = pathname_encode($dir); # Presumably if no ls-R, we can ignore the directory? - if (-f "$dir/ls-R") { + if (-f "$enc_dir/ls-R") { my $LSR; my $subdir; my $skip = 0; # whether to skip entries in the current subdirectory. - open($LSR, '<', "$dir/ls-R") or die "Cannot read $dir/ls-R: $!"; + open($LSR, '<', "$enc_dir/ls-R") or die "Cannot read $dir/ls-R: $!"; while (<$LSR>) { chop; next if !$_ || (substr($_, 0, 1) eq '%');