#! /usr/bin/perl

=encoding utf-8
=head1 DESCRIPTION

This script converts XLIFF (both 1.x and 2.x are supported) to the text format expected by ef-import.pl
Always returns segments, not units:
 - in XLIFF 1, use <seg-source> if possible
 - in XLIFF 2, use <segment>, not <unit>
It is written for Elefas Indicus but has no dependency on it, so you can use it independently

=head1 USAGE

    perl xliff2text.pl [-lang]? [input file]? [output file]?

Converts the input file (defaults to STDIN) to the output file (defaults to STDOUT)
If [-lang] is set, also generates source and target language as ef-import.pl would use them
    
=cut

use XML::Parser;

# Optional leading switch: any first argument starting with "-l" (case-insensitive,
# e.g. -lang) is consumed and makes handle_start emit the language header.
my $optLg = 0;
$optLg = shift if (@ARGV and $ARGV[0] =~ /^\-l/i);

# Optional input file: STDIN is rebound to it, decoded as UTF-8.
if (my $IN = shift) {
	close(STDIN);
	open(STDIN, '<:encoding(utf-8)', $IN) or die "Cannot open input file '$IN': $!\n";
}
# Optional output file: STDOUT is rebound to it, encoded as UTF-8.
if (my $OUT = shift) {
	close(STDOUT);
	open(STDOUT, '>:encoding(utf-8)', $OUT) or die "Cannot open output file '$OUT': $!\n";
}
else {
	# Give the default STDOUT the same encoding as a named output file, so that
	# non-ASCII segments are written consistently (and without "Wide character" issues).
	binmode(STDOUT, ':encoding(utf-8)');
}

# State shared by the three parser handlers below.
my $count_tu = 0;      # number of entries printed so far
my $seg_txt = undef;   # text being collected; undef means "not collecting" (see handle_char)
my $level = 0;         # <mrk> nesting depth, XLIFF 1 only
my @segs = ();         # segments collected for the current <trans-unit>, XLIFF 1 only
my $source = undef;    # text of the last closed <source>

my $parser = XML::Parser->new(Handlers => {
		Start => \&handle_start, End => \&handle_end, Char  => \&handle_char
	});
$parser->parse(\*STDIN);

print STDERR "$count_tu entries\n";

# Buffered write errors only surface when the handle is closed.
close(STDOUT) or die "Error while writing output: $!\n";

# Start-tag handler for XML::Parser.
# Arguments: the parser object (unused), the element name, then the element's
# attributes as a flat name => value list.
# Prints the language header when the -lang option is active, and prepares the
# shared collection state ($seg_txt, $level, @segs) for the element being opened.
sub handle_start {
	my (undef, $el, %attr) = @_;

	if ($optLg and (($el eq 'xliff') or ($el eq 'file'))) {
		# XLIFF 1.x declares source-language / target-language on <file>,
		# XLIFF 2.x declares srcLang / trgLang on <xliff>. Both spellings are
		# accepted on both elements, so whichever element carries them is used.
		# 'traLang' is kept only because earlier versions of this script read it.
		my ($src) = grep { defined $_ } @attr{'source-language', 'srcLang'};
		my ($tra) = grep { defined $_ } @attr{'target-language', 'trgLang', 'traLang'};
		print "srcLang=$1\n" if (defined $src and $src =~ /^(\w.+)$/);
		print "traLang=$1\n" if (defined $tra and $tra =~ /^(\w.+)$/);
		# As before, a blank line is printed for each of these elements,
		# whether or not a language was found.
		print "\n";
	}

	elsif (($el eq 'trans-unit') or ($el eq 'segment')) {
		$seg_txt = undef;
		$level = 0; @segs = (); # for xliff 1 only
	}
	elsif ($el eq 'source') {
		$seg_txt = '';    # start collecting the source text
	}
	elsif ($el eq 'seg-source') {
		$seg_txt = undef;   # text outside <mrk mtype="seg"> is not collected
	}
	elsif ($el eq 'mrk') {
		# NOTE(review): the parent element is not checked here, so a
		# <mrk mtype="seg"> found anywhere (e.g. inside <target>) is also
		# collected as a segment - confirm whether that is intended.
		if (defined $attr{mtype} and $attr{mtype} eq 'seg') {
			# New segment: @segs holds a reference to the accumulator until
			# the matching end tag replaces it by the collected text.
			$seg_txt = ''; push(@segs, \$seg_txt); $level = 1;
		} else {
			$level++;   # nested or non-segment marker: only track the depth
		}
	}
}

# End-tag handler for XML::Parser.
# Arguments: the parser object and the name of the element being closed.
# Stores the collected <source> text, finalizes <mrk mtype="seg"> segments and
# prints the entries of a finished <trans-unit> (XLIFF 1) or <segment> (XLIFF 2),
# incrementing $count_tu once per printed entry.
sub handle_end {
	my ($expat, $el) = @_;

	if ($el eq 'source') {
		# Inside an end handler, current_element() names the parent element.
		require Scalar::Util;
		my $parent;
		if (Scalar::Util::blessed($expat) and $expat->can('current_element')) {
			$parent = $expat->current_element;
		}
		# <alt-trans> proposals carry their own <source>; it must not replace
		# the source of the unit itself.
		$source = $seg_txt unless (defined $parent and $parent eq 'alt-trans');
		$seg_txt = undef;
	}
	# XLIFF 1
	elsif ($el eq 'trans-unit') {
		if (@segs) {
			foreach my $seg (@segs) {
				# An entry is still a reference if its <mrk> was never closed at depth 0.
				my $text = ref($seg) ? ${$seg} : $seg;
				# Skip empty segments, but keep a segment whose text is "0".
				next unless (defined $text and length $text);
				print $text, "\n";
				$count_tu++;   # one entry per printed segment
			}
		} elsif (defined $source and length $source) {
			print $source, "\n"; $count_tu++;
		}
		$source = undef;   # do not leak this unit's source into the next one
	}
	elsif ($el eq 'mrk') {
		if ((--$level) == 0) {
			# Only a pending segment (still stored as a reference) is finalized.
			# A non-segment <mrk> closing at depth 0 leaves @segs and the text
			# being collected untouched.
			if (@segs and ref($segs[-1]) eq 'SCALAR') {
				$segs[-1] = ${$segs[-1]}; $seg_txt = undef;
			}
		}
	}
	# XLIFF 2
	elsif ($el eq 'segment') {
		print $source, "\n"; $count_tu++;
		$source = undef;   # do not leak this segment's source into the next one
	}

}

# Character-data handler for XML::Parser.
# Arguments: the parser object (unused) and a chunk of character data.
# The chunk is appended to $seg_txt, but only while a collection is in
# progress, i.e. while $seg_txt is defined.
sub handle_char {
	my ($expat, $chunk) = @_;
	if (defined $seg_txt) {
		$seg_txt = $seg_txt . $chunk;
	}
}

=head1 LICENSE

Copyright 2013 Silvestris Project (L<http://www.silvestris-lab.org/>)

Licensed under the EUPL, Version 1.1 or – as soon they will be approved by the European Commission - subsequent versions of the EUPL (the "Licence");
You may not use this work except in compliance with the Licence.
You may obtain a copy of the Licence at: L<http://ec.europa.eu/idabc/eupl>

Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the Licence for the specific language governing permissions and limitations under the Licence. 

=cut


