cleaned; made able to recognize preexisting audio files and work with them. - vmc

commit 46217e670560561c07474b702c6d1216ee9c6e87
parent 0063f1a59cbf7f822ea5b6fb5146193a6936e4f1
Author: umhau <umhau@alum.gcc.edu>
Date:   Wed, 14 Jun 2017 23:44:09 -0400

cleaned; made able to recognize preexisting audio files and work with them.

Diffstat:
M lib/format_text.py  | 252 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------

1 file changed, 133 insertions(+), 119 deletions(-)
diff --git a/lib/format_text.py b/lib/format_text.py
@@ -1,55 +1,73 @@
 #!/usr/bin/python3
 # 
-# DESCRIPTION
+# DESCRIPTION -----------------------------------------------------------------
 # 
-#       Creates a number of text files dependent on a sentence file which are required for building
-#       a CMU Sphinx voice model.  Also uses the extended PocketSphinx pronunciation dictionary.
+#       Creates a number of text files dependent on a sentence file which are 
+#       required for building a CMU Sphinx voice model.  Also uses the extended
+#       PocketSphinx pronunciation dictionary.
 # 
-#       Note that the target directory is the directory where the files should be saved into. This 
-#       should be similar to the directory the initial command was given from within.
-# 
-# USAGE
-# 
-#       python3 format_text.py /path/to/sentence-file.txt model-name target-directory iterations
+#       Note that the target directory is the directory where the files should 
+#       be saved into. This should be similar to the directory the initial 
+#       command was given from within.
+#
+#       The last two options are optional. The 'number of preexisting audio
+#       recordings' option is used when adding to an existing collection of
+#       audio recordings - it controls how the new .wav files are numbered
+#       (starting from zero, or from something higher).  The last one, 'the
+#       fancy sentence list', is only looked for if the other is present.  It
+#       is the absolute path of the sentence list that vmc edited the last
+#       time it created audio files in the given folder.  Instead of starting
+#       from scratch, vmc can simply append the new items to that list.
+#       NOTE: the usage example below shows only the first of these options.
+#
+#       The integration of the new functions should be seamless, though I 
+#       anticipate that it will be a royal pain to get it working.
 # 
-# EXAMPLE
+# USAGE | EXAMPLE--------------------------------------------------------------
 # 
-#       python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory 2
-#
-# DEPENDENCIES
+#       python3 format_text.py /path/to/sentence-file.txt \
+#                              model-name \
+#                              audio_folder \
+#                              iterations \
+#                              num_of_preexisting_audio_recordings 
 # 
-#       python3
+# FILES CREATED ---------------------------------------------------------------
+#       The .transcription and .fileids files are opened for appending (the
+#       .vocab and .dic files are overwritten). Therefore, I don't have to
+#       provide the old names of any files - just make sure the model names
+#       are the same.  Since none of these come with a model name except the
+#       .dic file, I should be fine with the current configuration.
+#   
+#       audio_folder + "/" +model_name + '.transcription'
+#       audio_folder + "/" +model_name + '.fileids'
+#       
+#       audio_folder+'/'+model_name+'.vocab'
+#       audio_folder+'/'+model_name+'.dic'       
 # 
-# IMPORTS =========================================================================================
+# IMPORTS =====================================================================
 
 import pathlib, re, sys, os, string
 
-# VARIABLE DEFINITIONS ============================================================================
+# VARIABLE DEFINITIONS ========================================================
 
-sentence_file = sys.argv[1] # os.path.basename() to get just the filename
+create_pronunciation_dictionary = False # This may not be desirable
 
+sentence_file = sys.argv[1] # os.path.basename() to get just the filename
 model_name = sys.argv[2]
-
-target_directory = sys.argv[3].rstrip(os.sep)
-
+audio_folder = sys.argv[3].rstrip(os.sep)
 iterations = int(sys.argv[4])
+pronunciation_dictionary = '/opt/vmc/lib/cmudict-en-us.dict'
 
-pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict'
+try:
+    recording_count = int(sys.argv[5]) # how many audio files already exist
+except IndexError:
+    recording_count = 0
 
-# FUNCTION DEFINITION =============================================================================
+# LOGIC =======================================================================
 
-# Print iterations progress
+# Print iterations progress ---------------------------------------------------
 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
-    """
-    Call in a loop to create terminal progress bar
-    @params:
-        iteration   - Required  : current iteration (Int)
-        total       - Required  : total iterations (Int)
-        prefix      - Optional  : prefix string (Str)
-        suffix      - Optional  : suffix string (Str)
-        decimals    - Optional  : positive number of decimals in percent complete (Int)
-        barLength   - Optional  : character length of bar (Int)
-    """
+
     formatStr       = "{0:." + str(decimals) + "f}"
     percents        = formatStr.format(100 * (iteration / float(total)))
     filledLength    = int(round(barLength * iteration / float(total)))
@@ -59,51 +77,46 @@ def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, bar
         sys.stdout.write('\n')
     sys.stdout.flush()
 
-# LOGIC ===========================================================================================
-
-
-# FILEID AND TRANSCRIPTION FILES ------------------------------------------------------------------
+# create files per-audio recording --------------------------------------------
 
-sentences_text = ""
-
-j=0
-lines = []
+sentences_text = ""; lines = []; 
 
 with open(sentence_file) as f:
+    for line in f: lines.append(line)
 
-    for line in f:
-        lines.append(line)
-
-# this was separated out so I can easily multiply the lines by the iterations...doesn't work to do 
-# f*iterations (in above loop).
 for line in lines*iterations:
 
-    sentences_text = sentences_text+' '+line
+    def append_to_file(formatted_filename, formatted_text):
+        hs = open(formatted_filename,"a")
+        hs.write(formatted_text)
+        hs.close() 
 
-    j+=1
-    afno = str('%04d'%j)
+    sentences_text = sentences_text+' '+line
+    recording_count+=1
+    formatted_audio_file_number = str('%04d'%recording_count)
 
-    # create transcription file
+    # create transcription file -----------------------------------------------
     exclude = set(string.punctuation)
     sentence = ''.join(ch for ch in line if ch not in exclude)
 
     nice_text = sentence.lower().rstrip()
-    formatted_text = "</s> " +  nice_text +  " </s> (" + model_name + "_" + afno + ")\n"
-    formatted_filename = target_directory + "/" +model_name + '.transcription'             
-    hs = open(formatted_filename,"a")
-    hs.write(formatted_text)
-    hs.close() 
+    formatted_text = "</s> "+nice_text+" </s> ("+model_name+"_"+formatted_audio_file_number+")\n"
+    formatted_filename = audio_folder + "/" +model_name + '.transcription'
+
+    append_to_file(formatted_filename, formatted_text)
     
-    #create fileid file
-    formatted_text = model_name + "_" + afno + "\n"
-    formatted_filename = target_directory + "/" +model_name + '.fileids'
-    hs = open(formatted_filename,"a")
-    hs.write(formatted_text)
-    hs.close() 
+    #create fileid file -------------------------------------------------------
+    formatted_text = model_name + "_" + formatted_audio_file_number + "\n"
+    formatted_filename = audio_folder + "/" +model_name + '.fileids'
+
+    append_to_file(formatted_filename, formatted_text)
 
-    sentences_text = sentences_text+' '+line # why twice? I have no memory of why I did this.
+    sentences_text = sentences_text+' '+line # why twice? I have no memory of 
+                                             # why I did this.  I don't think 
+                                             # it does anything, either. TODO:
+                                             # remove and see what happens.
 
-# CREATE PRONUNCIATION DICTIONARY -----------------------------------------------------------------
+# CREATE PRONUNCIATION DICTIONARY ---------------------------------------------
 
 # create unique, sorted word list from sentence list
 words = []
@@ -114,74 +127,75 @@ uwords = sorted(list(set(words)))
 
 # save word list to file
 print("Saving word list to file...")
-uwordsfilename = str(target_directory+'/'+model_name+'.vocab') # correct extension
+uwordsfilename = str(audio_folder+'/'+model_name+'.vocab') # correct extension
 uwordsfile = open(uwordsfilename, 'w')
 for word in uwords:
     uwordsfile.write("%s\n" % word)
 
 # create pronunciation dictionary from word list
-cmudict = []
-print("Opening pronunciation dictionary...")
-with open(pronunciation_dictionary) as f:
-    for line in f:
-        cmudict.append(line)
-
-pdict = []
-missing_words = []
-l = len(uwords)
-i = 0
-print("Extracting entries corresponding to word list...")
-printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
-
-curr_line = 0
+if create_pronunciation_dictionary:
+    cmudict = []
+    print("Opening pronunciation dictionary...")
+    with open(pronunciation_dictionary) as f:
+        for line in f:
+            cmudict.append(line)
+
+    pdict = []
+    missing_words = []
+    l = len(uwords)
+    i = 0
+    print("Extracting entries corresponding to word list...")
+    printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
 
-for word in uwords:
-    
-    wordmatch=False # a counter to help with efficiency
-    for line in cmudict[curr_line:]:        
-        
-        regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
+    curr_line = 0
 
-        if re.match(regex_string, line):
-            # print("match!")
-            ms = re.search(regex_string, line)
-            pdict.append(str(ms.group('text')+' '+ms.group('phones')))
-            wordmatch=True
+    for word in uwords:
         
-        # if I already made a match and I'm not now, time to break. this allows for finding 
-        # alternate pronunciations
-        elif wordmatch: 
-            # curr_line +=1
-            break
+        wordmatch=False # a counter to help with efficiency
+        for line in cmudict[curr_line:]:        
             
-        # curr_line +=1
-    
-    # check for words the pronunciation dictionary doesn't have & save
-    if not wordmatch:
-        missing_words.append(word)
+            regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
 
-    i +=1
-    
-    printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
+            if re.match(regex_string, line):
+                # print("match!")
+                ms = re.search(regex_string, line)
+                pdict.append(str(ms.group('text')+' '+ms.group('phones')))
+                wordmatch=True
+            
+            # if I already made a match and I'm not now, time to break. this allows
+            # for finding alternate pronunciations
+            elif wordmatch: 
+                # curr_line +=1
+                break
+                
+            # curr_line +=1
+        
+        # check for words the pronunciation dictionary doesn't have & save
+        if not wordmatch:
+            missing_words.append(word)
 
-# save pronunciation dictionary to file
-print("Saving pronunciation dictionary to file...")
-pdictfilename = str(target_directory+'/'+model_name+'.dic')
-pdictfile = open(pdictfilename, 'w')
-for word_entry in pdict:
-    pdictfile.write("%s\n" % word_entry)
-
-# RECORD LIST OF MISSING WORDS --------------------------------------------------------------------
-
-if missing_words:
-    missing_words_filename = str(target_directory+'/'+model_name+'.missing')
-    print("\nWord(s) missing from pronunciation dictionary. See ")
-    print(missing_words_filename+" for list.")
-    mwordsfile = open(missing_words_filename, 'w')
-    for word in missing_words:
-        mwordsfile.write("%s\n" % word)
+        i +=1
+        
+        printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
+
+    # save pronunciation dictionary to file
+    print("Saving pronunciation dictionary to file...")
+    pdictfilename = str(audio_folder+'/'+model_name+'.dic')
+    pdictfile = open(pdictfilename, 'w')
+    for word_entry in pdict:
+        pdictfile.write("%s\n" % word_entry)
+
+# RECORD LIST OF MISSING WORDS ------------------------------------------------
+
+    if missing_words:
+        missing_words_filename = str(audio_folder+'/'+model_name+'.missing')
+        print("\nWord(s) missing from pronunciation dictionary. See ")
+        print(missing_words_filename+" for list.")
+        mwordsfile = open(missing_words_filename, 'w')
+        for word in missing_words:
+            mwordsfile.write("%s\n" % word)
             
-# DONE --------------------------------------------------------------------------------------------
+# DONE ------------------------------------------------------------------------
 
 print("Data files created.")

‹ projects	vmc a voice model creator for CMU Sphinx
	Log \| Files \| Refs \| README \| LICENSE