#!/usr/bin/perl -w
#
# combine multiple gettext files into one, OR split such a merged file
# into its component languages.
#

=head1 NAME

multimsgstr.pl - manage multiple locale files

=head1 SYNOPSIS

multimsgstr.pl [OPTION] filename.po ...

=head1 DESCRIPTION

multimsgstr.pl combines multiple gettext locale files into a single
file, or generates individual locale files from such a merged
file. This ensures that the individual locale files all have the same
message strings and are in the same order. The merged file is
generated to I<stdout>.

=over

=item --merge

Forces merge mode, in the case where you wish to generate the merged
file from a single locale file.

=item --locales I<locale>

Requests that only the specified locale be generated from the merged
file. May be used multiple times to generate several locales
simultaneously.

In split mode without this parameter, the tool defaults to generating
all locales contained in the merged file. Freshly generated locales
take their msgids and comments from the first locale in the merged
file, and except for the identifying msgstr (msgid "") take their
msgstrs from the msgid. The identifying msgstr is set to that of the
first locale in the merged file.

=item filename.po...

If one file is specified without C<--merge>, the tool operates in
split mode. Otherwise, the tool operates in merge mode.

=back

=head1 SEE ALSO

msgfmt(1), gettext(3), Locale::gettext(3)

=head1 BUGS

Handling of quotes inside message strings is flaky.

The locale files are not tagged with the locale they represent, nor is
the header updated.

In merge mode, the location of the input locale files is used to
determine their locale.

In split mode, the generated files are always named I<locale>.po.

=cut

use strict;
use Getopt::Long;
use Data::Dumper;

my $prog = $0;
$prog =~ s|^.*/||;

my $multi = 0;
my @generate_locales;
my $outfile = '%l.po';

GetOptions( "merge" => \$multi,
            "locales:s" => \@generate_locales,
            "output:s" => \$outfile,
          ) or die "usage: $prog [--merge] [--locale locale...] [--outfile filename] file(s)";

$multi = 1 if scalar( @ARGV ) > 1;

die "usage: $prog [--merge] [--locale locale...] [--outfile filename] file(s)"
  unless @ARGV;

if ( $multi ) {
    my %chref;
    my @caref;
    my @locales;
    while ( @ARGV) {
        print STDERR "Parsing $ARGV[0]...";

        # this is pretty specific...
        my ( $locale ) = grep /^[a-z][a-z]([\@\_].+|)$/,
          split( '/', $ARGV[0] );
        die "can't guess locale for this file!" unless $locale;
        $locale =~ s/\.po$//;
        push @locales, $locale;

        my ( $nhref, $naref ) = parse_msg_file( $ARGV[0] );

        print STDERR "done.\n";

        for my $msgid ( @{$naref} ) {
            my %locales;
            if ( !defined( $chref{$msgid} )) {
                $chref{$msgid} = \%locales;
                push @caref, $msgid;
            }
            $chref{$msgid}->{$locale} = $nhref->{$msgid};
        }
        shift @ARGV;
    }

    for my $msgid ( @caref ) {
        # all the comments should be identical. just print the first
        # one we can find.
        for my $locale ( @locales ) {
            if ( defined( $chref{$msgid}->{$locale})) {
                print join( "\n", @{$chref{$msgid}->{$locale}->[0]} );
                print "\n";
                last;
            }
        }
        print "msgid \"$msgid\"\n";
        for my $locale ( @locales ) {
            print "msgstr[$locale] \"";
            print quotequote( $chref{$msgid}->{$locale}->[1] );
            print "\"";
            print "\n";
        }
        print "\n";
    }

} else {
    # Split into component locales
    my ( $nhref, $naref, $lref ) = parse_msg_file( $ARGV[0] );

    my @locales;
    if ( @generate_locales ) {
        push @locales, @generate_locales;
    } else {
        push @locales, @$lref;
    }

    for my $locale ( @locales ) {
        my $filename = $outfile;
        $filename =~ s/%l/$locale/g;

        print STDERR "$prog: Generating file for locale $locale\n";
        if ( open( LOC, ">$filename" )) {
            for my $msgid ( @$naref ) {
                my $isknown = defined( $nhref->{$msgid}->{$locale});
                if ( $isknown ) {
                    print LOC join( "\n", @{$nhref->{$msgid}->{$locale}->[0]} );
                } else {
                    print STDERR "$prog: warning: no $locale translation for $msgid, faking it...\n";
                    print LOC join( "\n", @{$nhref->{$msgid}->{$lref->[0]}->[0]});
                }
                print LOC "\n";
                print LOC "msgid \"";
                print LOC quotequote( $msgid, 1 );
                print LOC "\"\n";
                print LOC "msgstr \"";
                if ( $isknown ) {
                    print LOC quotequote( $nhref->{$msgid}->{$locale}->[1], 1 );
                } else {
                    # we don't know about this locale, so just default
                    # special case: header
                    if ( $msgid eq "" ) {
                        print LOC quotequote( $nhref->{$msgid}->{$lref->[0]}->[1], 1 );
                    } else {
                        print LOC quotequote( $msgid, 1 );
                    }
                }
                print LOC "\"\n";
                print LOC "\n";
            }
        } else {
            die "$prog: $filename: $!";
        }
    }
}


sub parse_msg_file {
    my $file = shift;

    open( NEW, "<$file" ) or die "$file: $!";

    # basic format of msgstr file:
    # ignore lines starting with comment char (#)
    # msgid "...." can run over multiple lines
    # msgstr "...." can run over multiple lines
    my $instring = "";
    my @comments;
    my $msgid;
    my $msgstr;
    my %msgs;
    my @msgs;
    my @locales;
    my $locale;

    while (<NEW>) {
        chomp;
        if ( /^#/ ) {
            $instring = "";
            push @comments, $_;
            next;
        }
        if ( /^$/ ) {
            $instring = "";
            @comments = ();
            next;
        }
        if ( /^msgid\s+"(.*)"$/ ) {
            $msgid = $1;

            $instring = "msgid";
        } elsif ( /^msgstr(\[.*?\])?\s+"(.*)"$/ ) {
            $locale = $1;
            $locale ||= "";
            $msgstr = $2;

            $locale =~ s/[\[\]]//gs;

            my @com = @comments;

            # this allows me to keep track of the order. Doing it in
            # here makes sure that we get the full msgid (in the case
            # where it's been split across multiple lines)
            my $check = quotemeta( $msgid ); # otherwise bad things
            if ( !grep /^$check$/, @msgs ) {
                push @msgs, $msgid;
            }

            if ( $locale ) {
                my %locales;
                if ( !defined( $msgs{$msgid})) {
                    $msgs{$msgid} = \%locales;
                }

                if ( defined( $msgs{$msgid}->{$locale})) {
                    die "$prog: duplicate msgid $msgid in $ARGV[0], $locale\n";
                }

                $msgs{$msgid}->{$locale} = [ \@com, $msgstr ];
                push @locales, $locale unless grep /^$locale$/, @locales;

            } else {
                if ( defined( $msgs{$msgid})) {
                    die "$prog: duplicate msgid $msgid in $ARGV[0]\n";
                }
                $msgs{$msgid} = [ \@com, $msgstr ];
            }

            $instring = "msgstr";
        } elsif ( /^"/ ) {
            s/^"(.*)"$/$1/;
            if ( $instring eq "msgstr" ) {
                if ( $locale ) {
                    $msgs{$msgid}->{$locale}->[1] .= "$_";
                } else {
                    $msgs{$msgid}->[1] .= "$_";
                }
            } else {
                $msgid .= $_;
            }
        }
    }

    ( \%msgs, \@msgs, \@locales );
}

sub quotequote {
    my $string = shift;
    my $unwrap = shift;
    $unwrap ||= 0;
    $string ||= "";
    if ( $string =~ /"/ ) {
        $string =~ s/(^|[^\\])\"/$1\\"/gs;
    }
    if ( $unwrap and $string =~ /\\n|\r/ ) {
        $string =~ s/\\n/\\n"\n"/gs;
        $string =~ s/"\n"$//gs;
        $string = "\"\n\"$string";
    }
    return $string;
}
