commit e8446c5e27a38c43ff14675f43ed8dc845f8947f
parent d14d174ff25531531f176825e2d43b8b31706ece
Author: umhau <umhau@users.noreply.github.com>
Date: Tue, 1 Nov 2016 17:28:10 -0400
Delete quick_lm.pl
Diffstat:
| D | quick_lm.pl | | | 249 | ------------------------------------------------------------------------------- |
1 file changed, 0 insertions(+), 249 deletions(-)
diff --git a/quick_lm.pl b/quick_lm.pl
@@ -1,249 +0,0 @@
-#!/usr/local/bin/perl
-
-# ====================================================================
-# Copyright (c) 1996-2002 Alexander I. Rudnicky and Carnegie Mellon University.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in
-# the documentation and-or other materials provided with the
-# distribution.
-#
-# 3. All copies, used or distributed, must preserve the original wording of
-# the copyright notice included in the output file.
-#
-# This work was supported in part by funding from the Defense Advanced
-# Research Projects Agency and the CMU Sphinx Speech Consortium.
-#
-# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
-# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
-# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# ====================================================================
-#
-# Pretty Good Language Modeler, now with unigram vector augmentation!
-#
-# The Pretty Good Language Modeler is intended for quick construction
-# of small language models, typically as might be needed in
-# application development. Depending on the version of Perl that you
-# are running, a practical limitation is a maximum vocabulary size on
-# the order of 1000-2000 words. The limiting factor is the number of
-# n-grams observed, since each n-gram is stored as a hash key. (So
-# smaller vocabularies may turn out to be a problem as well.)
-#
-# This package computes a stadard back-off language model. It differs
-# in one significant respect, which is the computation of the
-# discount. We adopt a "proportional" (or ratio) discount in which a
-# certain percentage of probability mass is removed (typically 50%)
-# from observed n-grams and redistributed over unobserved n-grams.
-#
-# Conventionally, an absolute discount would be used, however we have
-# found that the proportional discount appears to be robust for
-# extremely small languages, as might be prototyped by a developer,
-# as opposed to based on a collected corpus. We have found that
-# absolute and proportional discounts produce comparable recognition
-# results with perhaps a slight advantage for proportional
-# discounting. A more systematic investigation of this technique would
-# be desirable. In any case it also has the virtue of using a very
-# simple computation.
-
-# NOTE: this is by no means an efficient implementation and performance will
-# deteriorate rapidly as a function of the corpus size. Larger corpora should be
-# processed using the toolkit available at http://www.speech.cs.cmu.edu/SLM_info.html
-
-# [2feb96] (air)
-# cobbles together a language model from a set of exemplar sentences.
-# features: 1) uniform discounting, 2) no cutoffs
-# the "+" version allows insertion of extra words into the 1gram vector
-
-# [27nov97] (air)
-# bulletproof a bit for use in conjunction with a cgi script
-
-# [20000711] (air)
-# made visible the discount parmeter
-
-# [20011123] (air)
-# cleaned-up version for distribution
-
-use Getopt::Std;
-
-$VERBOSE = 1;
-
-sub handler { local($sig) = @_;
- print STDERR "quick_lm caught a SIG$sig -- dying\n";
- exit(0);
- }
-foreach (qw(XCPU KILL TERM STOP)) { $SIG{$_} = \&handler; }
-
-
-if ($#ARGV < 0) { die("usage: quick_lm -s <sentence_file> [-w <word_file>] [-d discount]\n"); }
-Getopt::Std::getopts("s:w:d:x");
-$sentfile = $opt_s;
-$wordfile = $opt_w;
-$discount = $opt_d;
-
-$| = 1; # always flush buffers
-
-if ($VERBOSE>0) {print STDERR "Language model started at ",scalar localtime(),"\n";}
-
-
-open(IN,"$sentfile") or die("can't open $sentfile!\n");
-if ($wordfile ne "") { open(WORDS,"$wordfile"); $wflag = 1;} else { $wflag = 0; }
-
-$log10 = log(10.0);
-
-if ($discount ne "") {
- if (($discount<=0.0) or ($discount>=1.0)) {
- print STDERR "\discount value out of range: must be 0.0 < x < 1.0! ...using 0.5\n";
- $discount_mass = 0.5; # just use default
- } else {
- $discount_mass = $discount;
- }
-} else {
- # Ben and Greg's experiments show that 0.5 is a way better default choice.
- $discount_mass = 0.5; # Set a nominal discount...
-}
-$deflator = 1.0 - $discount_mass;
-
-# create count tables
-$sent_cnt = 0;
-while (<IN>) {
- s/^\s*//; s/\s*$//;
- if ( $_ eq "" ) { next; } else { $sent_cnt++; } # skip empty lines
- @word = split(/\s/);
- for ($j=0;$j<($#word-1);$j++) {
- $trigram{join(" ",$word[$j],$word[$j+1],$word[$j+2])}++;
- $bigram{ join(" ",$word[$j],$word[$j+1])}++;
- $unigram{$word[$j]}++;
- }
- # finish up the bi and uni's at the end of the sentence...
- $bigram{join(" ",$word[$j],$word[$j+1])}++;
- $unigram{$word[$j]}++;
-
- $unigram{$word[$j+1]}++;
-}
-close(IN);
-if ($VERBOSE) { print STDERR "$sent_cnt sentences found.\n"; }
-
-# add in any words
-if ($wflag) {
- $new = 0; $read_in = 0;
- while (<WORDS>) {
- s/^\s*//; s/\s*$//;
- if ( $_ eq "" ) { next; } else { $read_in++; } # skip empty lines
- if (! $unigram{$_}) { $unigram{$_} = 1; $new++; }
- }
- if ($VERBOSE) { print STDERR "tried to add $read_in word; $new were new words\n"; }
- close (WORDS);
-}
-if ( ($sent_cnt==0) && ($new==0) ) {
- print STDERR "no input?\n";
- exit;
-}
-
-open(LM,">$sentfile.arpabo") or die("can't open $sentfile.arpabo for output!\n");
-
-$preface = "";
-$preface .= "Language model created by QuickLM on ".`date`;
-$preface .= "Copyright (c) 1996-2002\nCarnegie Mellon University and Alexander I. Rudnicky\n\n";
-$preface .= "This model based on a corpus of $sent_cnt sentences and ".scalar (keys %unigram). " words\n";
-$preface .= "The (fixed) discount mass is $discount_mass\n\n";
-
-
-# compute counts
-$unisum = 0; $uni_count = 0; $bi_count = 0; $tri_count = 0;
-foreach $x (keys(%unigram)) { $uni_count++; $unisum += $unigram{$x}; }
-foreach $x (keys(%bigram)) { $bi_count++; }
-foreach $x (keys(%trigram)) { $tri_count++; }
-
-print LM $preface;
-print LM "\\data\\\n";
-print LM "ngram 1=$uni_count\n";
-if ( $bi_count > 0 ) { print LM "ngram 2=$bi_count\n"; }
-if ( $tri_count > 0 ) { print LM "ngram 3=$tri_count\n"; }
-print LM "\n";
-
-# compute uni probs
-foreach $x (keys(%unigram)) {
- $uniprob{$x} = ($unigram{$x}/$unisum) * $deflator;
-}
-
-# compute alphas
-foreach $y (keys(%unigram)) {
- $w1 = $y;
- $sum_denom = 0.0;
- foreach $x (keys(%bigram)) {
- if ( substr($x,0,rindex($x," ")) eq $w1 ) {
- $w2 = substr($x,index($x," ")+1);
- $sum_denom += $uniprob{$w2};
- }
- }
- $alpha{$w1} = $discount_mass / (1.0 - $sum_denom);
-}
-
-print LM "\\1-grams:\n";
-foreach $x (sort keys(%unigram)) {
- printf LM "%6.4f %s %6.4f\n", log($uniprob{$x})/$log10, $x, log($alpha{$x})/$log10;
-}
-print LM "\n";
-
-#compute bi probs
-foreach $x (keys(%bigram)) {
- $w1 = substr($x,0,rindex($x," "));
- $biprob{$x} = ($bigram{$x}*$deflator)/$unigram{$w1};
-}
-
-#compute bialphas
-foreach $x (keys(%bigram)) {
- $w1w2 = $x;
- $sum_denom = 0.0;
- foreach $y (keys(%trigram)) {
- if (substr($y,0,rindex($y," ")) eq $w1w2 ) {
- $w2w3 = substr($y,index($y," ")+1);
- $sum_denom += $biprob{$w2w3};
- }
- }
- $bialpha{$w1w2} = $discount_mass / (1.0 - $sum_denom);
-}
-
-# output the bigrams and trigrams (now that we have the alphas computed).
-if ( $bi_count > 0 ) {
- print LM "\\2-grams:\n";
- foreach $x (sort keys(%bigram)) {
- printf LM "%6.4f %s %6.4f\n",
- log($biprob{$x})/$log10, $x, log($bialpha{$x})/$log10;
- }
- print LM "\n";
-}
-
-if ($tri_count > 0 ) {
- print LM "\\3-grams:\n";
- foreach $x (sort keys(%trigram)) {
- $w1w2 = substr($x,0,rindex($x," "));
- printf LM "%6.4f %s\n",
- log(($trigram{$x}*$deflator)/$bigram{$w1w2})/$log10, $x;
- }
- print LM "\n";
-}
-
-print LM "\\end\\\n";
-close(LM);
-
-if ($VERBOSE>0) { print STDERR "Language model completed at ",scalar localtime(),"\n"; }
-
-#