#!/usr/bin/perl -w ############################################################################################################ # # gibbs_ibm-model-1_collapsed_v1.00.perl: Collapsed Gibbs sampler for word alignments in IBM Model 1 # # This program was used to generate the alignments in Bayesian inference experiments for the paper: # "Bayesian Word Alignment for Statistical Machine Translation" (ACL-HLT 2011, short papers) # # Accepts two sentence-aligned text files (each sentence on a separate line). # Outputs a GIZA++-style word alignment file. # Optionally, a GIZA++-style word alignment file can be provided as input to initialize the sampler's state. # Other side outputs are: # 1. An "experiment_info" file for later reference # 2. A tabulated "totalentropy" file of total and per-word entropies at each iteration # # The hyperparameter and the sampling settings can be changed on lines 29--35. # # The core of the program (sampling) consists of lines 227--260. # # Author: Coskun Mermer # ############################################################################################################ use strict; $| = 1; die "Usage: $0 fcorpus_in ecorpus_in out [initial_alignments]\n" if (@ARGV < 3); # Dirichlet prior for translation probabilities my $beta = 0.0001; # MCMC parameters my $burninIters = 1000; # skip this many iterations in the beginning, first readout will be in iteration N + 1 (provided overlaps with "lag" -- for now) my $lagIters = 1; # Read out every N iterations; set to 1 to sample at every iteration my $numReadouts = 1000; my $randomseed = 0; # Useful info about how this program was run open(CALL, ">$ARGV[2]_experiment_info"); print CALL "Started at " . localtime() . "\n"; print CALL $0; for my $arg (@ARGV) { print CALL " $arg"; } print CALL "\n"; print CALL "beta = $beta\n"; print CALL "(burn-in, lag, numreads) = ($burninIters, $lagIters, $numReadouts)\n"; print CALL "random-seed = $randomseed\n"; close(CALL); # Data structures my @Ecorpus; my @Fcorpus; my @A; my %Count; my %SumCount; srand($randomseed); # for replicable results # Read data into @Ecorpus and @Fcorpus, and compile E and F vocabularies open(F, $ARGV[0]); open(E, $ARGV[1]); my $s = 0; my %Evcb; my %Fvcb; $Fvcb{"NULL"}++; # Add NULL word to the foreign vocabulary while (my $fline = ) { my $eline = ; $s++; chomp $fline; chomp $eline; my @Fwords = split(/ +/, $fline); my @Ewords = split(/ +/, $eline); $Fcorpus[$s][0] = "NULL"; # only in the source language, i.e., f for my $j (0 .. $#Fwords) { $Fcorpus[$s][$j + 1] = $Fwords[$j]; $Fvcb{ $Fwords[$j] }++; } for my $i (0 .. $#Ewords) { $Ecorpus[$s][$i] = $Ewords[$i]; $Evcb{ $Ewords[$i] }++; } } my $Evcbsize = 0; for my $e (keys %Evcb) { $Evcbsize++; } my $Fvcbsize = 0; for my $f (keys %Fvcb) { $Fvcbsize++; $SumCount{$f} = 0; # initialize } print "Read English vocabulary size as $Evcbsize, Foreign vocabulary size as $Fvcbsize.\n"; # Initialize # EITHER from user-provided file if (defined($ARGV[3])) { open (INIT, $ARGV[3]); for my $s (1 .. $#Fcorpus) { ; my $eline = ; my $fline = ; chomp $eline; chomp $fline; my @Ewords = split(/ +/, $eline); my @Fwords = split(/ +/, $fline); my $j = 0; my $k = 0; while ($k <= $#Fwords) { my $f; while ($Fwords[$k] ne "({") { $f = $Fwords[$k]; $k++; last if ($k > $#Fwords); } $k++; last if ($k > $#Fwords); while ($Fwords[$k] ne "})") { # 1. Alignment $A[$s][ $Fwords[$k] - 1 ] = $j; # 2. Alignment counts $Count{$f}{ $Ewords[$Fwords[$k] - 1] }++; # 3. Sum of alignment counts for each fj $SumCount{$f}++; $k++; } $j++; } } } else { # OR from empirical evidence # Naively assume uniform probability and compute fractional counts my %FractionalCount; for my $s (1 .. $#Fcorpus) { for my $ei (@{$Ecorpus[$s]}) { for my $fj (@{$Fcorpus[$s]}) { $FractionalCount{$ei}{$fj}++; } } } # Utility function in case needed sub printFractionalCount { open (OUT, ">@_"); for my $e (keys %FractionalCount) { for my $f (keys %{$FractionalCount{$e}}) { print OUT "FractionalCount{ $e }{ $f } = $FractionalCount{$e}{$f}\n" } } close(OUT); } for my $s (1 .. $#Fcorpus) { my $i = 0; for my $ei (@{$Ecorpus[$s]}) { # 1. Alignments (choose Viterbi alignments based on fractional counts) my $bestcount = 0; my $bestfj = ""; my $bestj = -1; my $j = 0; for my $fj (@{$Fcorpus[$s]}) { if ($fj eq "NULL") { # don't want to initialize with NULL alignment $j++; next; } if ($FractionalCount{$ei}{$fj} > $bestcount) { $bestcount = $FractionalCount{$ei}{$fj}; $bestfj = $fj; $bestj = $j; } $j++; } $A[$s][$i] = $bestj; # 2. Alignment counts $Count{$bestfj}{$ei}++; # 3. Sum of alignment counts for each fj $SumCount{$bestfj}++; $i++; } } } # Initialization #&printAlignmentsToFile($ARGV[2]); #Some preliminary statistics my $totalentropy; my $loge_to_log2_conversion_factor = 1 / log(2); open(ENT, ">$ARGV[2]_totalentropy.xls"); my $numvariables = 0; my $entropyupperbound = 0; my $ftokens = 0; my $sumchoices = 0; for my $s (1 .. $#Fcorpus) { $ftokens += @{$Fcorpus[$s]}; for my $ei (@{$Ecorpus[$s]}) { $numvariables++; $entropyupperbound += log(@{$Fcorpus[$s]}); $sumchoices += @{$Fcorpus[$s]}; } } $entropyupperbound *= $loge_to_log2_conversion_factor; my $eubpervar = $entropyupperbound / $numvariables; my $pplpervar = exp($eubpervar * log(2)); my $avgchoices = $sumchoices / $numvariables; my $avgsentlength = $ftokens / @Fcorpus; print "Number of variables = $numvariables, entropy upper bound = $entropyupperbound (per variable = $eubpervar, per-word perplexity = $pplpervar)\n"; print "Number of French words (incl. NULL) = $ftokens (avg. = $avgsentlength), avg. choices per English word = $avgchoices\n"; # Take successive samples by Gibbs sampling my @Acollect; my $t = 0; while ($numReadouts) { print "$t"; $totalentropy = 0; # Keep track of readouts my $isReadoutIter = 0; $isReadoutIter = 1 if ($t % $lagIters == 0); $isReadoutIter = 0 if ($t < $burninIters); $numReadouts-- if $isReadoutIter; print "*" if $isReadoutIter; # Sample each variable for my $s (1 .. $#Fcorpus) { print "." if ($s % 1000 == 0); my $i = 0; for my $ei (@{$Ecorpus[$s]}) { # Decrement before sampling $Count{$Fcorpus[$s][ $A[$s][$i] ]}{$ei}--; $SumCount{$Fcorpus[$s][ $A[$s][$i] ]}--; # Calculate relative probabilities and draw a sample from this distribution my @probvector; my $j = 0; for my $fj (@{$Fcorpus[$s]}) { my $p_i_j; if (defined($Count{$fj}{$ei})) { $p_i_j = ($Count{$fj}{$ei} + $beta) / ($SumCount{$fj} + $Evcbsize * $beta); } else { $p_i_j = $beta / ($SumCount{$fj} + $Evcbsize * $beta); } $probvector[$j] = $p_i_j; $j++; } my ($outcome, $entropy) = &do_sampling(@probvector); $A[$s][$i] = $outcome; $Acollect[$s][$i][ $outcome ]++ if $isReadoutIter; $totalentropy += $entropy; # Increment according to the new sample $Count{$Fcorpus[$s][ $A[$s][$i] ]}{$ei}++; $SumCount{$Fcorpus[$s][ $A[$s][$i] ]}++; $i++; } } my $log2entropy = $totalentropy * $loge_to_log2_conversion_factor; printf ENT "%.2f %.6f\n", $log2entropy, $log2entropy / $numvariables; #&printAlignmentsToFileGIZA("$ARGV[2]_iter$t") if $isReadoutIter; $t++; } close(ENT); # Finally, determine final Viterbi alignment from the collected samples # For each variable, select the value that appears the most in the samples for my $s (1 .. $#Fcorpus) { my $i = 0; for my $ei (@{$Ecorpus[$s]}) { my $maxfreq = 0; my $maxindex = -1; my $j = 0; for my $fj (@{$Fcorpus[$s]}) { if (!defined($Acollect[$s][$i][$j])) { $j++; next; } if ($Acollect[$s][$i][$j] > $maxfreq) { $maxfreq = $Acollect[$s][$i][$j]; $maxindex = $j; } $j++; } $A[$s][$i] = $maxindex; $i++; } } #&printAlignmentsToFile($ARGV[2]); &printAlignmentsToFileGIZA("$ARGV[2].A3.final"); open(CALL, ">>$ARGV[2]_experiment_info"); # append print CALL "Finished at " . localtime() . "\n"; close(CALL); print "Done.\n"; ### END OF PROGRAM # Takes as input argument an array of counts (interpreted as proportional to their probabilities) and returns the array index that was chosen sub do_sampling { # Total size of this event space my $sum = 0; my $entropy_sum = 0; for my $i (0 .. $#_) { my $count_i = $_[$i]; $sum += $count_i; $entropy_sum += $count_i * (log($count_i)); } my $entropy = log($sum) - ($entropy_sum / $sum); # Roll the die my $outcome = rand($sum); #Determine the outcome event my $i = -1; while ($outcome > 0) { $i++; $outcome -= $_[$i]; } return ($i, $entropy); } sub printCountsToFile { open (OUT, ">@_"); for my $f (keys %Fvcb) { for my $e (keys %Evcb) { print OUT "Count{ $f }{ $e } = $Count{$f}{$e}\n" if ((defined($Count{$f}{$e})) && ($Count{$f}{$e} > 0)); } } close(OUT); } sub printAlignmentsToFile { open (OUT, ">@_"); for my $s (0 .. $#Fcorpus) { my $i = 0; for my $ei (@{$Ecorpus[$s]}) { print OUT "A[$s][$i:$ei]=$A[$s][$i]:$Fcorpus[$s][ $A[$s][$i] ]\n"; $i++; } } close(OUT); } sub printAlignmentsToFileGIZA { open (OUT, ">@_"); for my $s (1 .. $#Fcorpus) { print OUT "# Sentence pair ($s)\n"; for my $ei (@{$Ecorpus[$s]}) { print OUT "$ei "; } print OUT "\n"; my $j = 0; for my $fj (@{$Fcorpus[$s]}) { print OUT "$fj ({ "; my $i = 1; # to match the GIZA++ output format for my $ei (@{$Ecorpus[$s]}) { print OUT "$i " if ($A[$s][$i - 1] == $j); $i++; } print OUT "}) "; $j++; } print OUT "\n"; } close(OUT); }