file import now works without errors - vmc - a voice model creator for CMU Sphinx

commit b49a6bc32f432d7e7c9bf1db7fc0a87a7a887c1b
parent c4dc4d886efbf7ed0bf735178edb7151a672c6ae
Author: umhau <umhau@users.noreply.github.com>
Date:   Tue,  1 Nov 2016 19:08:35 -0400

file import now works without errors
Diffstat:
A acousticfiles.sh  | 37 +++++++++++++++++++++++++++++++++++++
A buildLM.sh  | 43 +++++++++++++++++++++++++++++++++++++++++++
A format_text.py  | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A getaudio.py  | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M vmc.sh  | 23 +++++++++++++++++++----
A voicemodel.sh  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

6 files changed, 501 insertions(+), 4 deletions(-)
diff --git a/acousticfiles.sh b/acousticfiles.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# 
+# DESCRIPTION
+# 
+#       Produce acoustic feature files from user-supplied voice recordings. These are stored with 
+#       the associated audio files, and named similarly with an .mfc extension.
+# 
+# USAGE
+# 
+#       bash acousticfiles.sh /audio/folder/path /path/to/model-name.fileids
+# 
+# EXAMPLE
+# 
+#       bash /opt/vmc/functions/acousticfiles.sh ~/audio ~/audio/newmodel.fileids
+#
+# DEPENDENCIES
+# 
+#       CMU Sphinx
+# 
+
+# VARIABLES DEFINITIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+folderpath=${1%/}
+
+fid_filepath=$2 # filename format: model-name.fileids
+
+
+# FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# generate some acoustic feature files
+echo "Generating acoustic feature files..."
+cd $folderpath # sphinx_fe likes to have a consistent working directory
+sphinx_fe -argfile /opt/vmc/tools/en-us/feat.params -samprate 16000 -c $fid_filepath -di . -do . -ei wav -eo mfc -mswav yes &> /dev/null
+
+
+
+
diff --git a/buildLM.sh b/buildLM.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# 
+# DESCRIPTION
+# 
+#       Produce binary language model from plain sentence list.  Invokes CMU-created perl script
+#       located in /opt/vmc/tools.  Saves the output next to the sentence list; the save-directory
+#       argument is accepted but not currently used.
+# 
+# USAGE
+# 
+#       bash buildLM.sh sentence-list model-name save-directory
+#
+# DEPENDENCIES
+# 
+#       CMU Sphinx
+# 
+# VARIABLES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+sentence_list_path=$1
+
+model_name=$2
+
+save_directory=$3
+
+tools_dir=/opt/vmc/tools
+
+# COMMANDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# run perl script to create language model; its output is picked up below as
+# <sentence list>.arpabo
+perl "$tools_dir/quick_lm.pl" -s "$sentence_list_path" #&> /dev/null
+
+# the language model is kept in the same directory as the sentence list
+sentence_list_dir=$(dirname "$sentence_list_path")
+
+# rename output
+src="$sentence_list_path.arpabo"
+dst="$sentence_list_dir/$model_name.lm"
+
+# stop here if the perl script produced nothing, rather than converting a missing file
+mv "$src" "$dst" || { echo "Language model was not produced: $src" >&2; exit 1; }
+
+# convert lm to binary (bin) format (command was too complex for python to handle)
+sphinx_lm_convert -i "$dst" -o "$dst.bin" &> /dev/null
+
diff --git a/format_text.py b/format_text.py
@@ -0,0 +1,174 @@
+#!/usr/bin/python3
+# 
+# DESCRIPTION
+# 
+#       Creates a number of text files dependent on a sentence file which are required for building
+#       a CMU Sphinx voice model.  Also uses the extended PocketSphinx pronunciation dictionary.
+# 
+#       Note that the target directory is the directory where the files should be saved into. This 
+#       should be similar to the directory the initial command was given from within.
+# 
+# USAGE
+# 
+#       python3 format_text.py /path/to/sentence-file.txt model-name target-directory
+# 
+# EXAMPLE
+# 
+#       python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory
+#
+# DEPENDENCIES
+# 
+#       python3
+# 
+# IMPORTS =========================================================================================
+
+import pathlib, re, sys, os, string
+
+# VARIABLE DEFINITIONS ============================================================================
+
+sentence_file = sys.argv[1] # os.path.basename() to get just the filename
+
+model_name = sys.argv[2]
+
+target_directory = sys.argv[3].rstrip(os.sep)
+
+pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict'
+
+# FUNCTION DEFINITION =============================================================================
+
+# Print iterations progress
+def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
+    """
+    Call in a loop to create terminal progress bar
+    @params:
+        iteration   - Required  : current iteration (Int)
+        total       - Required  : total iterations (Int)
+        prefix      - Optional  : prefix string (Str)
+        suffix      - Optional  : suffix string (Str)
+        decimals    - Optional  : positive number of decimals in percent complete (Int)
+        barLength   - Optional  : character length of bar (Int)
+    """
+    formatStr       = "{0:." + str(decimals) + "f}"
+    percents        = formatStr.format(100 * (iteration / float(total)))
+    filledLength    = int(round(barLength * iteration / float(total)))
+    bar             = '█' * filledLength + '-' * (barLength - filledLength)
+    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)),
+    if iteration == total:
+        sys.stdout.write('\n')
+    sys.stdout.flush()
+
+# LOGIC ===========================================================================================
+
+sentences_text = ""
+
+j=0
+
+with open(sentence_file) as f:
+
+    for line in f:
+
+        sentences_text = sentences_text+' '+line
+
+        j+=1
+        afno = str('%04d'%j)
+
+        # get rid of punctuation
+        exclude = set(string.punctuation)
+        sentence = ''.join(ch for ch in line if ch not in exclude)
+
+        nice_text = sentence.lower().rstrip()
+        formatted_text = "</s> " +  nice_text +  " </s> (" + model_name + "_" + afno + ")\n"
+        # formatted_text = nice_text+"\n"
+        formatted_filename = target_directory + "/" +model_name + '.transcription'             
+        hs = open(formatted_filename,"a")
+        hs.write(formatted_text)
+        hs.close() 
+
+        #fileid
+        formatted_text = model_name + "_" + afno + "\n"
+        formatted_filename = target_directory + "/" +model_name + '.fileids'
+        hs = open(formatted_filename,"a")
+        hs.write(formatted_text)
+        hs.close() 
+
+        sentences_text = sentences_text+' '+line
+
+# def sentence_parsing(sentences_text, model_name, sentence_file, pronunciation_dictionary):
+
+# create unique, sorted word list from sentence list
+words = []
+print("Creating unique, sorted word list...")
+[words.append(word.strip(string.punctuation).upper()) for word in sentences_text.split()]
+# set() uniques the list, sorted() puts them a-z.
+uwords = sorted(list(set(words))) 
+
+# save word list to file
+print("Saving word list to file...")
+uwordsfilename = str(target_directory+'/'+model_name+'.vocab') # correct extension
+uwordsfile = open(uwordsfilename, 'w')
+for word in uwords:
+    uwordsfile.write("%s\n" % word)
+
+# create pronunciation dictionary from word list
+cmudict = []
+print("Opening pronunciation dictionary...")
+with open(pronunciation_dictionary) as f:
+    for line in f:
+        cmudict.append(line)
+
+pdict = []
+missing_words = []
+l = len(uwords)
+i = 0
+print("Extracting entries corresponding to word list...")
+printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
+
+curr_line = 0
+
+for word in uwords:
+    
+    wordmatch=False # a counter to help with efficiency
+    for line in cmudict[curr_line:]:        
+        
+        regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
+
+        if re.match(regex_string, line):
+            # print("match!")
+            ms = re.search(regex_string, line)
+            pdict.append(str(ms.group('text')+' '+ms.group('phones')))
+            wordmatch=True
+        
+        # if I already made a match and I'm not now, time to break. this allows for finding 
+        # alternate pronunciations
+        elif wordmatch: 
+            # curr_line +=1
+            break
+            
+        # curr_line +=1
+    
+    # check for words the pronunciation dictionary doesn't have & save
+    if not wordmatch:
+        missing_words.append(word)
+
+    i +=1
+    printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
+
+# save missing words list to file
+if missing_words:
+    missing_words_filename = str(target_directory+'/'+model_name+'.missing')
+    print("\nWord(s) missing from pronunciation dictionary. See ")
+    print(missing_words_filename+" for list.")
+    mwordsfile = open(missing_words_filename, 'w')
+    for word in missing_words:
+        mwordsfile.write("%s\n" % word)
+            
+# save pronunciation dictionary to file
+print("Saving pronunciation dictionary to file...")
+pdictfilename = str(target_directory+'/'+model_name+'.dic')
+pdictfile = open(pdictfilename, 'w')
+for word_entry in pdict:
+    pdictfile.write("%s\n" % word_entry)
+
+# final instructions
+print("Data files created.")
+
diff --git a/getaudio.py b/getaudio.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python3
+# 
+# DESCRIPTION
+# 
+#       getaudio is used to sequentially prompt the user for dictations of displayed sentences.
+#
+# DEPENDENCIES 
+# 
+#       python3-pyaudio, python3
+#
+# USAGE
+# 
+#       python3 getaudio.py sentence-file /output/folder recording-repetitions model-name
+#
+# LIBRARY IMPORTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+import sys, os, _thread, pyaudio, wave, contextlib
+
+# VARIABLE DEFINITIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+pronunciation_dictionary="cmudict-en-us.dict"
+
+chunk = 1024
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000
+
+sentence_file = sys.argv[1]
+
+output_folder = sys.argv[2].rstrip(os.sep)
+
+reps = int(sys.argv[3])
+
+model_name = sys.argv[4]
+
+
+# FUNCTION DEFINITIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# ignore sdterr messages: as from pyaudio
+@contextlib.contextmanager
+def ignore_stderr():
+    devnull = os.open(os.devnull, os.O_WRONLY)
+    old_stderr = os.dup(2)
+    sys.stderr.flush()
+    os.dup2(devnull, 2)
+    os.close(devnull)
+    try:
+        yield
+    finally:
+        os.dup2(old_stderr, 2)
+        os.close(old_stderr)
+
+def record_until_keypress(audio_filepath):
+
+    # detect keypress [enter]
+    def input_thread(L):
+        input()
+        L.append(None)
+
+    # initialize audio stream - and keep it quiet
+    with ignore_stderr():
+        p = pyaudio.PyAudio()
+        stream = p.open(format = FORMAT,
+                channels = CHANNELS,
+                rate = RATE,
+                input = True,
+                frames_per_buffer = chunk)
+
+    # create interrupt thread
+    L = []
+    _thread.start_new_thread(input_thread, (L,))
+    
+    # record data during loop
+    frames = []
+    while True:
+        data = stream.read(chunk)
+        frames.append(data)
+        if L: 
+            stream.stop_stream()
+            break
+    
+    # exit cleanly after break
+    stream.close()
+    p.terminate()
+    
+    # write data to WAVE file
+    data = b''.join(frames)
+    wf = wave.open(audio_filepath, 'wb')
+    wf.setnchannels(CHANNELS)
+    wf.setsampwidth(p.get_sample_size(FORMAT))
+    wf.setframerate(RATE)
+    wf.writeframes(data)
+    wf.close()
+
+
+# LOGIC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+if not os.path.exists(output_folder):
+    os.makedirs(output_folder)
+
+# create list of sentences for prompt
+sentence_list = []
+with open(sentence_file) as f:
+    for line in f:
+        sentence_list.append(line)
+
+num_recs = len(sentence_list)*reps
+
+# collect audio files
+try:
+
+    input("Press [enter], read text, & press [enter].")
+
+    j=0
+
+    for sentence in sentence_list*reps: 
+        #recording number
+        j+=1
+
+        # record audio with visual
+        print("Recording no. %04d of %04d: \n\n\t%s" % (j, num_recs, sentence), end='\r')
+
+        # recording file should look like this (e.g.): ./bespoke_training_data/audio/arctic_0001.wav        
+        record_until_keypress(str(output_folder + os.sep + model_name + "_%04d.wav" % j))
+
+except KeyboardInterrupt:
+    pass
+
+
diff --git a/vmc.sh b/vmc.sh
@@ -68,18 +68,21 @@ fdir=/opt/vmc/functions
 
 # OBTAIN REQUISITE FILES --------------------------------------------------------------------------
 
-# get audio files
-if [ $1 = '-record' ]; then 
+echo
+echo "Collecting required files..."
+
+# get audio files and put them where they go
+if [[ $2 = '-record' ]]; then 
 
     mkdir -p $audio_folder
 
     python3 $fdir/getaudio.py $sentence_file $audio_folder $iterations $model_name
 
-elif [ $1 = '-import' ]; then
+elif [[ $2 = '-import' ]]; then
 
     mkdir -p $audio_folder
 
-    cp -r $audio_file_directory $audio_folder
+    cp -a $audio_file_directory/*.wav $audio_folder/
 
 fi
 
@@ -88,14 +91,23 @@ cp -r $tdir/en-us $output_folder
 
 # PRODUCE DERIVATIVE FILES ------------------------------------------------------------------------
 
+echo
+echo "Producing sentence file derivatives..."
+
 # get derivatives of sentence file
 python3 $fdir/format_text.py $sentence_file $model_name $output_folder
 
+echo 
+echo "Producing audio file derivatives..."
+
 # get derivatives of audio files
 bash $fdir/acousticfiles.sh $audio_folder $output_folder/$model_name.fileids
 
 # CREATE MODELS -----------------------------------------------------------------------------------
 
+echo
+echo "Creating models..."
+
 # build language model
 bash $fdir/buildLM.sh $sentence_file $model_name $output_folder
 
@@ -103,3 +115,5 @@ bash $fdir/buildLM.sh $sentence_file $model_name $output_folder
 bash $fdir/voicemodel.sh $model_name $output_folder $audio_folder $output_folder
 
 
+echo 
+echo "Process complete."
+\ No newline at end of file
diff --git a/voicemodel.sh b/voicemodel.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# 
+# DESCRIPTION
+# 
+#       Given acoustic feature files and sentence file derivatives, produce voice model. 
+# 
+# USAGE
+# 
+#       bash voicemodel.sh model-name model-dir acoustic-files-dir sentence-file-derivatives-dir
+# 
+# EXAMPLE
+# 
+#       bash voicemodel.sh new_model ~/tools/new_model ~/tools/new_model/audio ~/tools/new_model
+#
+# DEPENDENCIES
+# 
+#       CMU Sphinx
+# 
+# NOTES
+# 
+#       This script is primarily using a copy of en-us that is being actively edited as it is 
+#       adapted to become a custom voice model.  
+# 
+#       Binaries are located in /opt/vmc/tools.
+# 
+# VARIABLES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+model_name=$1
+model_dir=$2        # location of adapted voice model files: copy of en-us, audio files, etc.
+af_dir=$3           # directory containing audio files and audio feature files
+sf_dir=$4           # directory containing sentence file derivatives
+
+tools_dir=/opt/vmc/tools
+
+pronunciation_dictionary=$tools_dir/cmudict-en-us.dict
+
+# COMMANDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# convert binary mdef file to .txt
+cd $model_dir
+pocketsphinx_mdef_convert -text $model_dir/en-us/mdef $model_dir/en-us/mdef.txt &> /dev/null
+
+# run tools to create voice model
+cd $af_dir
+
+# sphinx_fe
+sphinx_fe \
+ -argfile $model_dir/en-us/feat.params \
+ -samprate 16000 \
+ -c $sf_dir/$model_name.fileids \
+ -di . \
+ -do . \
+ -ei wav \
+ -eo mfc \
+ -mswav yes \
+ &> /dev/null
+
+$tools_dir/bw \
+ -hmmdir $model_dir/en-us \
+ -moddeffn $model_dir/en-us/mdef.txt  \
+ -ts2cbfn .ptm. \
+ -feat 1s_c_d_dd \
+ -svspec 0-12/13-25/26-38 \
+ -cmn current \
+ -agc none \
+ -dictfn $pronunciation_dictionary  \
+ -ctlfn $sf_dir/$model_name.fileids \
+ -lsnfn $sf_dir/$model_name.transcription \
+ -accumdir . \
+ &> /dev/null
+
+$tools_dir/mllr_solve \
+ -meanfn $model_dir/en-us/means  \
+ -varfn $model_dir/en-us/variances \
+ -outmllrfn mllr_matrix \
+ -accumdir . \
+ &> /dev/null
+
+$tools_dir/map_adapt \
+ -moddeffn $model_dir/en-us/mdef.txt \
+ -ts2cbfn .ptm. \
+ -meanfn $model_dir/en-us/means \
+ -varfn $model_dir/en-us/variances \
+ -mixwfn $model_dir/en-us/mixture_weights \
+ -tmatfn $model_dir/en-us/transition_matrices \
+ -accumdir . \
+ -mapmeanfn $model_dir/en-us/means \
+ -mapvarfn $model_dir/en-us/variances \
+ -mapmixwfn $model_dir/en-us/mixture_weights \
+ -maptmatfn $model_dir/en-us/transition_matrices\
+ &> /dev/null
+
+$tools_dir/mk_s2sendump \
+ -pocketsphinx yes \
+ -moddeffn $model_dir/en-us/mdef.txt \
+ -mixwfn $model_dir/en-us/mixture_weights \
+ -sendumpfn $model_dir/en-us/sendump \
+ &> /dev/null
+\ No newline at end of file

‹ projects	vmc a voice model creator for CMU Sphinx
	Log \| Files \| Refs \| README \| LICENSE

A	acousticfiles.sh	\|	37	+++++++++++++++++++++++++++++++++++++
A	buildLM.sh	\|	43	+++++++++++++++++++++++++++++++++++++++++++
A	format_text.py	\|	174	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	getaudio.py	\|	129	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	vmc.sh	\|	23	+++++++++++++++++++----
A	voicemodel.sh	\|	99	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++