quick_lm.pl (8604B)
1 #!/usr/local/bin/perl 2 3 # ==================================================================== 4 # Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University. 5 # All rights reserved. 6 # 7 # Redistribution and use in source and binary forms, with or without 8 # modification, are permitted provided that the following conditions 9 # are met: 10 # 11 # 1. Redistributions of source code must retain the above copyright 12 # notice, this list of conditions and the following disclaimer. 13 # 14 # 2. Redistributions in binary form must reproduce the above copyright 15 # notice, this list of conditions and the following disclaimer in 16 # the documentation and-or other materials provided with the 17 # distribution. 18 # 19 # 3. All copies, used or distributed, must preserve the original wording of 20 # the copyright notice included in the output file. 21 # 22 # This work was supported in part by funding from the Defense Advanced 23 # Research Projects Agency and the CMU Sphinx Speech Consortium. 24 # 25 # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 26 # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 27 # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 29 # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 # 37 # ==================================================================== 38 # 39 # Pretty Good Language Modeler, now with unigram vector augmentation! 40 # 41 # The Pretty Good Language Modeler is intended for quick construction 42 # of small language models, typically as might be needed in 43 # application development. Depending on the version of Perl that you 44 # are running, a practical limitation is a maximum vocabulary size on 45 # the order of 1000-2000 words. The limiting factor is the number of 46 # n-grams observed, since each n-gram is stored as a hash key. (So 47 # smaller vocabularies may turn out to be a problem as well.) 48 # 49 # This package computes a standard back-off language model. It differs 50 # in one significant respect, which is the computation of the 51 # discount. We adopt a "proportional" (or ratio) discount in which a 52 # certain percentage of probability mass is removed (typically 50%) 53 # from observed n-grams and redistributed over unobserved n-grams. 54 # 55 # Conventionally, an absolute discount would be used, however we have 56 # found that the proportional discount appears to be robust for 57 # extremely small languages, as might be prototyped by a developer, 58 # as opposed to based on a collected corpus. We have found that 59 # absolute and proportional discounts produce comparable recognition 60 # results with perhaps a slight advantage for proportional 61 # discounting. A more systematic investigation of this technique would 62 # be desirable. In any case it also has the virtue of using a very 63 # simple computation. 64 65 # NOTE: this is by no means an efficient implementation and performance will 66 # deteriorate rapidly as a function of the corpus size. Larger corpora should be 67 # processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html 68 69 # [2feb96] (air) 70 # cobbles together a language model from a set of exemplar sentences. 71 # features: 1) uniform discounting, 2) no cutoffs 72 # the "+" version allows insertion of extra words into the 1gram vector 73 74 # [27nov97] (air) 75 # bulletproof a bit for use in conjunction with a cgi script 76 77 # [20000711] (air) 78 # made visible the discount parmeter 79 80 # [20011123] (air) 81 # cleaned-up version for distribution 82 83 use Getopt::Std; 84 85 $VERBOSE = 1; 86 87 sub handler { local($sig) = @_; 88 print STDERR "quick_lm caught a SIG$sig -- dying\n"; 89 exit(0); 90 } 91 foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; } 92 93 94 if ($#ARGV < 0) { die("usage: quick_lm -s <sentence_file> [-w <word_file>] [-d discount]\n"); } 95 Getopt::Std::getopts("s:w:d:x"); 96 $sentfile = $opt_s; 97 $wordfile = $opt_w; 98 $discount = $opt_d; 99 100 $| = 1; # always flush buffers 101 102 if ($VERBOSE>0) {print STDERR "Language model started at ",scalar localtime(),"\n";} 103 104 105 open(IN,"$sentfile") or die("can't open $sentfile!\n"); 106 if ($wordfile ne "") { open(WORDS,"$wordfile"); $wflag = 1;} else { $wflag = 0; } 107 108 $log10 = log(10.0); 109 110 if ($discount ne "") { 111 if (($discount<=0.0) or ($discount>=1.0)) { 112 print STDERR "\discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n"; 113 $discount_mass = 0.5; # just use default 114 } else { 115 $discount_mass = $discount; 116 } 117 } else { 118 # Ben and Greg's experiments show that 0.5 is a way better default choice. 119 $discount_mass = 0.5; # Set a nominal discount... 120 } 121 $deflator = 1.0 - $discount_mass; 122 123 # create count tables 124 $sent_cnt = 0; 125 while (<IN>) { 126 s/^\s*//; s/\s*$//; 127 if ( $_ eq "" ) { next; } else { $sent_cnt++; } # skip empty lines 128 @word = split(/\s/); 129 for ($j=0;$j<($#word-1);$j++) { 130 $trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++; 131 $bigram{ join(" ",$word[$j],$word[$j+1])}++; 132 $unigram{$word[$j]}++; 133 } 134 # finish up the bi and uni's at the end of the sentence... 135 $bigram{join(" ",$word[$j],$word[$j+1])}++; 136 $unigram{$word[$j]}++; 137 138 $unigram{$word[$j+1]}++; 139 } 140 close(IN); 141 if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; } 142 143 # add in any words 144 if ($wflag) { 145 $new = 0; $read_in = 0; 146 while (<WORDS>) { 147 s/^\s*//; s/\s*$//; 148 if ( $_ eq "" ) { next; } else { $read_in++; } # skip empty lines 149 if (! $unigram{$_}) { $unigram{$_} = 1; $new++; } 150 } 151 if ($VERBOSE) { print STDERR "tried to add $read_in word; $new were new words\n"; } 152 close (WORDS); 153 } 154 if ( ($sent_cnt==0) && ($new==0) ) { 155 print STDERR "no input?\n"; 156 exit; 157 } 158 159 open(LM,">$sentfile.arpabo") or die("can't open $sentfile.arpabo for output!\n"); 160 161 $preface = ""; 162 $preface .= "Language model created by QuickLM on ".`date`; 163 $preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n"; 164 $preface .= "This model based on a corpus of $sent_cnt sentences and ".scalar (keys %unigram). " words\n"; 165 $preface .= "The (fixed) discount mass is $discount_mass\n\n"; 166 167 168 # compute counts 169 $unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0; 170 foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; } 171 foreach $x (keys(%bigram)) { $bi_count++; } 172 foreach $x (keys(%trigram)) { $tri_count++; } 173 174 print LM $preface; 175 print LM "\\data\\\n"; 176 print LM "ngram 1=$uni_count\n"; 177 if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; } 178 if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; } 179 print LM "\n"; 180 181 # compute uni probs 182 foreach $x (keys(%unigram)) { 183 $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator; 184 } 185 186 # compute alphas 187 foreach $y (keys(%unigram)) { 188 $w1 = $y; 189 $sum_denom = 0.0; 190 foreach $x (keys(%bigram)) { 191 if ( substr($x,0,rindex($x," ")) eq $w1 ) { 192 $w2 = substr($x,index($x," ")+1); 193 $sum_denom += $uniprob{$w2}; 194 } 195 } 196 $alpha{$w1} = $discount_mass / (1.0 - $sum_denom); 197 } 198 199 print LM "\\1-grams:\n"; 200 foreach $x (sort keys(%unigram)) { 201 printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10; 202 } 203 print LM "\n"; 204 205 #compute bi probs 206 foreach $x (keys(%bigram)) { 207 $w1 = substr($x,0,rindex($x," ")); 208 $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1}; 209 } 210 211 #compute bialphas 212 foreach $x (keys(%bigram)) { 213 $w1w2 = $x; 214 $sum_denom = 0.0; 215 foreach $y (keys(%trigram)) { 216 if (substr($y,0,rindex($y," ")) eq $w1w2 ) { 217 $w2w3 = substr($y,index($y," ")+1); 218 $sum_denom += $biprob{$w2w3}; 219 } 220 } 221 $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom); 222 } 223 224 # output the bigrams and trigrams (now that we have the alphas computed). 225 if ( $bi_count > 0 ) { 226 print LM "\\2-grams:\n"; 227 foreach $x (sort keys(%bigram)) { 228 printf LM "%6.4f %s %6.4f\n", 229 log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10; 230 } 231 print LM "\n"; 232 } 233 234 if ($tri_count > 0 ) { 235 print LM "\\3-grams:\n"; 236 foreach $x (sort keys(%trigram)) { 237 $w1w2 = substr($x,0,rindex($x," ")); 238 printf LM "%6.4f %s\n", 239 log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x; 240 } 241 print LM "\n"; 242 } 243 244 print LM "\\end\\\n"; 245 close(LM); 246 247 if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; } 248 249 #