multiple recording iterations now used in training - vmc - a voice model creator for CMU Sphinx

commit 437e97e470561bc8beaed55e125418e2919b1206
parent af2f5a5dd7dde1f8be7bc03f9c998bf97a6fad47
Author: umhau <umhau@users.noreply.github.com>
Date:   Wed,  2 Nov 2016 17:58:51 -0400

multiple recording iterations now used in training

Previously, the extra iterations of voice recordings weren't being used in training the voice model. Now they should be.
Diffstat:
M functions/format_text.py  | 39 +++++++++++++++++++++++----------------

1 file changed, 23 insertions(+), 16 deletions(-)
diff --git a/functions/format_text.py b/functions/format_text.py
@@ -10,11 +10,11 @@
 # 
 # USAGE
 # 
-#       python3 format_text.py /path/to/sentence-file.txt model-name target-directory
+#       python3 format_text.py /path/to/sentence-file.txt model-name target-directory iterations
 # 
 # EXAMPLE
 # 
-#       python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory
+#       python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory 2
 #
 # DEPENDENCIES
 # 
@@ -32,6 +32,8 @@ model_name = sys.argv[2]
 
 target_directory = sys.argv[3].rstrip(os.sep)
 
+iterations = int(sys.argv[4])
+
 pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict'
 
 # FUNCTION DEFINITION =============================================================================
@@ -59,32 +61,34 @@ def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, bar
 
 # LOGIC ===========================================================================================
 
+
+# FILEID AND TRANSCRIPTION FILES ------------------------------------------------------------------
+
 sentences_text = ""
 
 j=0
 
 with open(sentence_file) as f:
 
-    for line in f:
+    for line in f*iterations:
 
         sentences_text = sentences_text+' '+line
 
         j+=1
         afno = str('%04d'%j)
 
-        # get rid of punctuation
+        # create transcription file
         exclude = set(string.punctuation)
         sentence = ''.join(ch for ch in line if ch not in exclude)
 
         nice_text = sentence.lower().rstrip()
         formatted_text = "</s> " +  nice_text +  " </s> (" + model_name + "_" + afno + ")\n"
-        # formatted_text = nice_text+"\n"
         formatted_filename = target_directory + "/" +model_name + '.transcription'             
         hs = open(formatted_filename,"a")
         hs.write(formatted_text)
         hs.close() 
-
-        #fileid
+        
+        #create fileid file
         formatted_text = model_name + "_" + afno + "\n"
         formatted_filename = target_directory + "/" +model_name + '.fileids'
         hs = open(formatted_filename,"a")
@@ -93,7 +97,7 @@ with open(sentence_file) as f:
 
         sentences_text = sentences_text+' '+line
 
-# def sentence_parsing(sentences_text, model_name, sentence_file, pronunciation_dictionary):
+# CREATE PRONUNCIATION DICTIONARY -----------------------------------------------------------------
 
 # create unique, sorted word list from sentence list
 words = []
@@ -151,9 +155,18 @@ for word in uwords:
         missing_words.append(word)
 
     i +=1
+    
     printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
 
-# save missing words list to file
+# save pronunciation dictionary to file
+print("Saving pronunciation dictionary to file...")
+pdictfilename = str(target_directory+'/'+model_name+'.dic')
+pdictfile = open(pdictfilename, 'w')
+for word_entry in pdict:
+    pdictfile.write("%s\n" % word_entry)
+
+# RECORD LIST OF MISSING WORDS --------------------------------------------------------------------
+
 if missing_words:
     missing_words_filename = str(target_directory+'/'+model_name+'.missing')
     print("\nWord(s) missing from pronunciation dictionary. See ")
@@ -162,13 +175,7 @@ if missing_words:
     for word in missing_words:
         mwordsfile.write("%s\n" % word)
             
-# save pronunciation dictionary to file
-print("Saving pronunciation dictionary to file...")
-pdictfilename = str(target_directory+'/'+model_name+'.dic')
-pdictfile = open(pdictfilename, 'w')
-for word_entry in pdict:
-    pdictfile.write("%s\n" % word_entry)
+# DONE --------------------------------------------------------------------------------------------
 
-# final instructions
 print("Data files created.")

‹ projects	vmc a voice model creator for CMU Sphinx
	Log \| Files \| Refs \| README \| LICENSE