‹ projects

vmc

a voice model creator for CMU Sphinx
Log | Files | Refs | README | LICENSE

quick_lm.pl (8604B)


      1 #!/usr/local/bin/perl
      2 
      3 # ====================================================================
      4 # Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University.
      5 # All rights reserved.
      6 #
      7 # Redistribution and use in source and binary forms, with or without
      8 # modification, are permitted provided that the following conditions
      9 # are met:
     10 #
     11 # 1. Redistributions of source code must retain the above copyright
     12 #    notice, this list of conditions and the following disclaimer. 
     13 #
     14 # 2. Redistributions in binary form must reproduce the above copyright
     15 #    notice, this list of conditions and the following disclaimer in
     16 #    the documentation and-or other materials provided with the
     17 #    distribution.
     18 #
     19 # 3. All copies, used or distributed, must preserve the original wording of
     20 #    the copyright notice included in the output file.
     21 #
     22 # This work was supported in part by funding from the Defense Advanced 
     23 # Research Projects Agency and the CMU Sphinx Speech Consortium.
     24 #
     25 # THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
     26 # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
     27 # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28 # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
     29 # NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     30 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
     31 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
     32 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
     33 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
     34 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
     35 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     36 #
     37 # ====================================================================
     38 #
     39 # Pretty Good Language Modeler, now with unigram vector augmentation!
     40 # 
     41 # The Pretty Good Language Modeler is intended for quick construction 
     42 # of small language models, typically as might be needed in 
     43 # application development. Depending on the version of Perl that you 
     44 # are running, a practical limitation is a maximum vocabulary size on 
     45 # the order of 1000-2000 words. The limiting factor is the number of 
     46 # n-grams observed, since each n-gram is stored as a hash key. (So 
     47 # smaller vocabularies may turn out to be a problem as well.)
     48 # 
     49 # This package computes a standard back-off language model. It differs 
     50 # in one significant respect, which is the computation of the 
     51 # discount. We adopt a "proportional" (or ratio) discount in which a 
     52 # certain percentage of probability mass is removed (typically 50%) 
     53 # from observed n-grams and redistributed over unobserved n-grams.
     54 # 
     55 # Conventionally, an absolute discount would be used, however we have 
     56 # found that the proportional discount appears to be robust for 
     57 # extremely small languages, as might be prototyped by a developer, 
     58 # as opposed to based on a collected corpus. We have found that 
     59 # absolute and proportional discounts produce comparable recognition 
     60 # results with perhaps a slight advantage for proportional 
     61 # discounting. A more systematic investigation of this technique would 
     62 # be desirable. In any case it also has the virtue of using a very
     63 # simple computation.
     64 
     65 # NOTE: this is by no means an efficient implementation and performance will 
     66 # deteriorate rapidly as a function of the corpus size. Larger corpora should be
     67 # processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
     68 
     69 # [2feb96] (air)
     70 # cobbles together a language model from a set of exemplar sentences.
     71 # features: 1) uniform discounting, 2) no cutoffs
     72 # the "+" version allows insertion of extra words into the 1gram vector
     73 
     74 # [27nov97] (air)
     75 # bulletproof a bit for use in conjunction with a cgi script
     76 
     77 # [20000711] (air)
     78 # made visible the discount parmeter
     79 
     80 # [20011123] (air)
     81 # cleaned-up version for distribution
     82 
     83 use Getopt::Std;
     84 
     85 $VERBOSE = 1;
     86 
     87 sub handler { local($sig) = @_;
     88 	      print STDERR "quick_lm caught a SIG$sig -- dying\n";
     89 	      exit(0);
     90 	    }
     91 foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; }
     92 
     93 
     94 if ($#ARGV < 0) { die("usage: quick_lm -s <sentence_file> [-w <word_file>] [-d discount]\n"); }
     95 Getopt::Std::getopts("s:w:d:x");
     96 $sentfile = $opt_s;
     97 $wordfile = $opt_w;
     98 $discount = $opt_d;
     99 
    100 $| = 1;  # always flush buffers
    101 
    102 if ($VERBOSE>0) {print STDERR "Language model started at ",scalar localtime(),"\n";}
    103 
    104 
    105 open(IN,"$sentfile") or die("can't open $sentfile!\n");
    106 if ($wordfile ne "") { open(WORDS,"$wordfile"); $wflag = 1;} else { $wflag = 0; }
    107 
    108 $log10 = log(10.0);
    109 
    110 if ($discount ne "") {
    111   if (($discount<=0.0) or ($discount>=1.0)) {
    112     print STDERR "\discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
    113     $discount_mass = 0.5;  # just use default
    114   } else {
    115     $discount_mass = $discount;
    116   }
    117 } else {
    118   # Ben and Greg's experiments show that 0.5 is a way better default choice.
    119   $discount_mass = 0.5;  # Set a nominal discount...
    120 }
    121 $deflator = 1.0 - $discount_mass;
    122 
    123 # create count tables
    124 $sent_cnt = 0;
    125 while (<IN>) {	 
    126   s/^\s*//; s/\s*$//;
    127   if ( $_ eq "" ) { next; } else { $sent_cnt++; } # skip empty lines
    128   @word = split(/\s/);    
    129   for ($j=0;$j<($#word-1);$j++) {	
    130     $trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++;
    131     $bigram{ join(" ",$word[$j],$word[$j+1])}++;
    132     $unigram{$word[$j]}++;
    133   }
    134   # finish up the bi and uni's at the end of the sentence...
    135   $bigram{join(" ",$word[$j],$word[$j+1])}++;
    136   $unigram{$word[$j]}++;
    137   
    138   $unigram{$word[$j+1]}++;
    139 }
    140 close(IN);
    141 if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
    142 
    143 # add in any words
    144 if ($wflag) {
    145   $new = 0; $read_in = 0;
    146   while (<WORDS>) {
    147     s/^\s*//; s/\s*$//;
    148     if ( $_ eq "" ) { next; }  else { $read_in++; }  # skip empty lines
    149     if (! $unigram{$_}) { $unigram{$_} = 1; $new++; }
    150   }
    151   if ($VERBOSE) { print STDERR "tried to add $read_in word; $new were new words\n"; }
    152   close (WORDS);
    153 }
    154 if ( ($sent_cnt==0) && ($new==0) ) {
    155   print STDERR "no input?\n";
    156   exit;
    157 }
    158 
    159 open(LM,">$sentfile.arpabo") or die("can't open $sentfile.arpabo for output!\n");
    160 
    161 $preface = "";
    162 $preface .= "Language model created by QuickLM on ".`date`;
    163 $preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
    164 $preface .= "This model based on a corpus of $sent_cnt sentences and ".scalar (keys %unigram). " words\n";
    165 $preface .= "The (fixed) discount mass is $discount_mass\n\n";
    166 
    167 
    168 # compute counts
    169 $unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0;
    170 foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; }
    171 foreach $x (keys(%bigram))  { $bi_count++; }
    172 foreach $x (keys(%trigram)) { $tri_count++; }
    173 
    174 print LM $preface;
    175 print LM "\\data\\\n";
    176 print LM "ngram 1=$uni_count\n";
    177 if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; }
    178 if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; }
    179 print LM "\n";
    180 
    181 # compute uni probs
    182 foreach $x (keys(%unigram)) {
    183   $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator;
    184 }
    185 
    186 # compute alphas
    187 foreach $y (keys(%unigram)) {
    188   $w1 = $y;
    189   $sum_denom = 0.0;
    190   foreach $x (keys(%bigram)) {
    191     if ( substr($x,0,rindex($x," ")) eq $w1 ) {
    192       $w2 = substr($x,index($x," ")+1);
    193       $sum_denom += $uniprob{$w2};
    194     }
    195   }
    196   $alpha{$w1} = $discount_mass / (1.0 - $sum_denom);
    197 }
    198 
    199 print LM "\\1-grams:\n";
    200 foreach $x (sort keys(%unigram)) {
    201   printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10;
    202 }
    203 print LM "\n";
    204 
    205 #compute bi probs
    206 foreach $x (keys(%bigram)) {
    207   $w1 = substr($x,0,rindex($x," "));
    208   $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1};
    209 }
    210 
    211 #compute bialphas
    212 foreach $x (keys(%bigram)) {
    213   $w1w2 = $x;
    214   $sum_denom = 0.0;
    215   foreach $y (keys(%trigram)) {
    216     if (substr($y,0,rindex($y," ")) eq $w1w2 ) {
    217       $w2w3 = substr($y,index($y," ")+1);
    218       $sum_denom += $biprob{$w2w3};
    219     }
    220   }
    221   $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom);
    222 }
    223 
    224 # output the bigrams and trigrams (now that we have the alphas computed).
    225 if ( $bi_count > 0 ) {
    226   print LM "\\2-grams:\n";
    227   foreach $x (sort keys(%bigram)) {
    228     printf LM "%6.4f %s %6.4f\n",
    229       log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10;
    230   }
    231   print LM "\n";
    232 }
    233 
    234 if ($tri_count > 0 ) {
    235   print LM "\\3-grams:\n";
    236   foreach $x (sort keys(%trigram)) {
    237     $w1w2 = substr($x,0,rindex($x," "));
    238     printf LM "%6.4f %s\n",
    239       log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x;
    240   }
    241   print LM "\n";
    242 }
    243 
    244 print LM "\\end\\\n";
    245 close(LM);
    246 
    247 if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; }
    248 
    249 #