Delete quick_lm.pl - vmc - a voice model creator for CMU Sphinx

commit e8446c5e27a38c43ff14675f43ed8dc845f8947f
parent d14d174ff25531531f176825e2d43b8b31706ece
Author: umhau <umhau@users.noreply.github.com>
Date:   Tue,  1 Nov 2016 17:28:10 -0400

Delete quick_lm.pl
Diffstat:
D quick_lm.pl  | 249 -------------------------------------------------------------------------------

1 file changed, 0 insertions(+), 249 deletions(-)
diff --git a/quick_lm.pl b/quick_lm.pl
@@ -1,249 +0,0 @@
-#!/usr/local/bin/perl
-
-# ====================================================================
-# Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# 1. Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer. 
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in
-#    the documentation and-or other materials provided with the
-#    distribution.
-#
-# 3. All copies, used or distributed, must preserve the original wording of
-#    the copyright notice included in the output file.
-#
-# This work was supported in part by funding from the Defense Advanced 
-# Research Projects Agency and the CMU Sphinx Speech Consortium.
-#
-# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
-# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
-# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
-# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# ====================================================================
-#
-# Pretty Good Language Modeler, now with unigram vector augmentation!
-# 
-# The Pretty Good Language Modeler is intended for quick construction 
-# of small language models, typically as might be needed in 
-# application development. Depending on the version of Perl that you 
-# are running, a practical limitation is a maximum vocabulary size on 
-# the order of 1000-2000 words. The limiting factor is the number of 
-# n-grams observed, since each n-gram is stored as a hash key. (So 
-# smaller vocabularies may turn out to be a problem as well.)
-# 
-# This package computes a stadard back-off language model. It differs 
-# in one significant respect, which is the computation of the 
-# discount. We adopt a "proportional" (or ratio) discount in which a 
-# certain percentage of probability mass is removed (typically 50%) 
-# from observed n-grams and redistributed over unobserved n-grams.
-# 
-# Conventionally, an absolute discount would be used, however we have 
-# found that the proportional discount appears to be robust for 
-# extremely small languages, as might be prototyped by a developer, 
-# as opposed to based on a collected corpus. We have found that 
-# absolute and proportional discounts produce comparable recognition 
-# results with perhaps a slight advantage for proportional 
-# discounting. A more systematic investigation of this technique would 
-# be desirable. In any case it also has the virtue of using a very
-# simple computation.
-
-# NOTE: this is by no means an efficient implementation and performance will 
-# deteriorate rapidly as a function of the corpus size. Larger corpora should be
-# processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
-
-# [2feb96] (air)
-# cobbles together a language model from a set of exemplar sentences.
-# features: 1) uniform discounting, 2) no cutoffs
-# the "+" version allows insertion of extra words into the 1gram vector
-
-# [27nov97] (air)
-# bulletproof a bit for use in conjunction with a cgi script
-
-# [20000711] (air)
-# made visible the discount parmeter
-
-# [20011123] (air)
-# cleaned-up version for distribution
-
-use Getopt::Std;
-
-$VERBOSE = 1;
-
-sub handler { local($sig) = @_;
-	      print STDERR "quick_lm caught a SIG$sig -- dying\n";
-	      exit(0);
-	    }
-foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; }
-
-
-if ($#ARGV < 0) { die("usage: quick_lm -s <sentence_file> [-w <word_file>] [-d discount]\n"); }
-Getopt::Std::getopts("s:w:d:x");
-$sentfile = $opt_s;
-$wordfile = $opt_w;
-$discount = $opt_d;
-
-$| = 1;  # always flush buffers
-
-if ($VERBOSE>0) {print STDERR "Language model started at ",scalar localtime(),"\n";}
-
-
-open(IN,"$sentfile") or die("can't open $sentfile!\n");
-if ($wordfile ne "") { open(WORDS,"$wordfile"); $wflag = 1;} else { $wflag = 0; }
-
-$log10 = log(10.0);
-
-if ($discount ne "") {
-  if (($discount<=0.0) or ($discount>=1.0)) {
-    print STDERR "\discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
-    $discount_mass = 0.5;  # just use default
-  } else {
-    $discount_mass = $discount;
-  }
-} else {
-  # Ben and Greg's experiments show that 0.5 is a way better default choice.
-  $discount_mass = 0.5;  # Set a nominal discount...
-}
-$deflator = 1.0 - $discount_mass;
-
-# create count tables
-$sent_cnt = 0;
-while (<IN>) {	 
-  s/^\s*//; s/\s*$//;
-  if ( $_ eq "" ) { next; } else { $sent_cnt++; } # skip empty lines
-  @word = split(/\s/);    
-  for ($j=0;$j<($#word-1);$j++) {	
-    $trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++;
-    $bigram{ join(" ",$word[$j],$word[$j+1])}++;
-    $unigram{$word[$j]}++;
-  }
-  # finish up the bi and uni's at the end of the sentence...
-  $bigram{join(" ",$word[$j],$word[$j+1])}++;
-  $unigram{$word[$j]}++;
-  
-  $unigram{$word[$j+1]}++;
-}
-close(IN);
-if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
-
-# add in any words
-if ($wflag) {
-  $new = 0; $read_in = 0;
-  while (<WORDS>) {
-    s/^\s*//; s/\s*$//;
-    if ( $_ eq "" ) { next; }  else { $read_in++; }  # skip empty lines
-    if (! $unigram{$_}) { $unigram{$_} = 1; $new++; }
-  }
-  if ($VERBOSE) { print STDERR "tried to add $read_in word; $new were new words\n"; }
-  close (WORDS);
-}
-if ( ($sent_cnt==0) && ($new==0) ) {
-  print STDERR "no input?\n";
-  exit;
-}
-
-open(LM,">$sentfile.arpabo") or die("can't open $sentfile.arpabo for output!\n");
-
-$preface = "";
-$preface .= "Language model created by QuickLM on ".`date`;
-$preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
-$preface .= "This model based on a corpus of $sent_cnt sentences and ".scalar (keys %unigram). " words\n";
-$preface .= "The (fixed) discount mass is $discount_mass\n\n";
-
-
-# compute counts
-$unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0;
-foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; }
-foreach $x (keys(%bigram))  { $bi_count++; }
-foreach $x (keys(%trigram)) { $tri_count++; }
-
-print LM $preface;
-print LM "\\data\\\n";
-print LM "ngram 1=$uni_count\n";
-if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; }
-if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; }
-print LM "\n";
-
-# compute uni probs
-foreach $x (keys(%unigram)) {
-  $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator;
-}
-
-# compute alphas
-foreach $y (keys(%unigram)) {
-  $w1 = $y;
-  $sum_denom = 0.0;
-  foreach $x (keys(%bigram)) {
-    if ( substr($x,0,rindex($x," ")) eq $w1 ) {
-      $w2 = substr($x,index($x," ")+1);
-      $sum_denom += $uniprob{$w2};
-    }
-  }
-  $alpha{$w1} = $discount_mass / (1.0 - $sum_denom);
-}
-
-print LM "\\1-grams:\n";
-foreach $x (sort keys(%unigram)) {
-  printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10;
-}
-print LM "\n";
-
-#compute bi probs
-foreach $x (keys(%bigram)) {
-  $w1 = substr($x,0,rindex($x," "));
-  $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1};
-}
-
-#compute bialphas
-foreach $x (keys(%bigram)) {
-  $w1w2 = $x;
-  $sum_denom = 0.0;
-  foreach $y (keys(%trigram)) {
-    if (substr($y,0,rindex($y," ")) eq $w1w2 ) {
-      $w2w3 = substr($y,index($y," ")+1);
-      $sum_denom += $biprob{$w2w3};
-    }
-  }
-  $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom);
-}
-
-# output the bigrams and trigrams (now that we have the alphas computed).
-if ( $bi_count > 0 ) {
-  print LM "\\2-grams:\n";
-  foreach $x (sort keys(%bigram)) {
-    printf LM "%6.4f %s %6.4f\n",
-      log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10;
-  }
-  print LM "\n";
-}
-
-if ($tri_count > 0 ) {
-  print LM "\\3-grams:\n";
-  foreach $x (sort keys(%trigram)) {
-    $w1w2 = substr($x,0,rindex($x," "));
-    printf LM "%6.4f %s\n",
-      log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x;
-  }
-  print LM "\n";
-}
-
-print LM "\\end\\\n";
-close(LM);
-
-if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; }
-
-#

‹ projects	vmc a voice model creator for CMU Sphinx
	Log \| Files \| Refs \| README \| LICENSE