commit 46217e670560561c07474b702c6d1216ee9c6e87
parent 0063f1a59cbf7f822ea5b6fb5146193a6936e4f1
Author: umhau <umhau@alum.gcc.edu>
Date: Wed, 14 Jun 2017 23:44:09 -0400
cleaned; made able to recognize preexisting audio files and work with them.
Diffstat:
| M | lib/format_text.py | | | 252 | ++++++++++++++++++++++++++++++++++++++++++------------------------------------- |
1 file changed, 133 insertions(+), 119 deletions(-)
diff --git a/lib/format_text.py b/lib/format_text.py
@@ -1,55 +1,73 @@
#!/usr/bin/python3
#
-# DESCRIPTION
+# DESCRIPTION -----------------------------------------------------------------
#
-# Creates a number of text files dependent on a sentence file which are required for building
-# a CMU Sphinx voice model. Also uses the extended PocketSphinx pronunciation dictionary.
+# Creates a number of text files dependent on a sentence file which are
+# required for building a CMU Sphinx voice model. Also uses the extended
+# PocketSphinx pronunciation dictionary.
#
-# Note that the target directory is the directory where the files should be saved into. This
-# should be similar to the directory the initial command was given from within.
-#
-# USAGE
-#
-# python3 format_text.py /path/to/sentence-file.txt model-name target-directory iterations
+# Note that the target directory is the directory where the files should
+# be saved into. This should be similar to the directory the initial
+# command was given from within.
+#
+# The last option in the usage example is optional. The 'number of
+# preexisting audio recordings' variable is used for adding to an
+# existing collection of audio recordings - it controls how the new
+# .wav files are named (starting from zero, or something higher). A
+# further option, 'the fancy sentence list', is not parsed by this script
+# yet (only the five arguments shown in the usage example are read); it
+# would only be looked for if the other is present. It is the absolute
+# path of the sentence list that vmc edited the last time it was creating
+# audio files in the given folder. Instead of starting from scratch, vmc
+# could simply append the new items into that list.
+#
+# The integration of the new functions should be seamless, though I
+# anticipate that it will be a royal pain to get it working.
#
-# EXAMPLE
+# USAGE | EXAMPLE--------------------------------------------------------------
#
-# python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory 2
-#
-# DEPENDENCIES
+# python3 format_text.py /path/to/sentence-file.txt \
+# model-name \
+# audio_folder \
+# iterations \
+# num_of_preexisting_audio_recordings
#
-# python3
+# FILES CREATED ---------------------------------------------------------------
+# The .transcription and .fileids files are opened for appending, so new
+# information is added to them; the .vocab and .dic files are rewritten
+# on each run. Therefore, I don't have to provide the old names of any
+# files - just make sure the model names are the same. Since none of
+# these come with a model name except the .dic file, I should be fine
+# with the current configuration.
+#
+# audio_folder + "/" +model_name + '.transcription'
+# audio_folder + "/" +model_name + '.fileids'
+#
+# audio_folder+'/'+model_name+'.vocab'
+# audio_folder+'/'+model_name+'.dic'
#
-# IMPORTS =========================================================================================
+# IMPORTS =====================================================================
import pathlib, re, sys, os, string
-# VARIABLE DEFINITIONS ============================================================================
+# VARIABLE DEFINITIONS ========================================================
-sentence_file = sys.argv[1] # os.path.basename() to get just the filename
+create_pronunciation_dictionary = False # This may not be desirable
+sentence_file = sys.argv[1] # os.path.basename() to get just the filename
model_name = sys.argv[2]
-
-target_directory = sys.argv[3].rstrip(os.sep)
-
+audio_folder = sys.argv[3].rstrip(os.sep)
iterations = int(sys.argv[4])
+pronunciation_dictionary = '/opt/vmc/lib/cmudict-en-us.dict'
-pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict'
+try:
+ recording_count = int(sys.argv[5]) # how many audio files already exist
+except IndexError:
+ recording_count = 0
-# FUNCTION DEFINITION =============================================================================
+# LOGIC =======================================================================
-# Print iterations progress
+# Print iterations progress ---------------------------------------------------
def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
- """
- Call in a loop to create terminal progress bar
- @params:
- iteration - Required : current iteration (Int)
- total - Required : total iterations (Int)
- prefix - Optional : prefix string (Str)
- suffix - Optional : suffix string (Str)
- decimals - Optional : positive number of decimals in percent complete (Int)
- barLength - Optional : character length of bar (Int)
- """
+
formatStr = "{0:." + str(decimals) + "f}"
percents = formatStr.format(100 * (iteration / float(total)))
filledLength = int(round(barLength * iteration / float(total)))
@@ -59,51 +77,46 @@ def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, bar
sys.stdout.write('\n')
sys.stdout.flush()
-# LOGIC ===========================================================================================
-
-
-# FILEID AND TRANSCRIPTION FILES ------------------------------------------------------------------
+# create files per-audio recording --------------------------------------------
-sentences_text = ""
-
-j=0
-lines = []
+sentences_text = ""; lines = [];
with open(sentence_file) as f:
+ for line in f: lines.append(line)
- for line in f:
- lines.append(line)
-
-# this was separated out so I can easily multiply the lines by the iterations...doesn't work to do
-# f*iterations (in above loop).
for line in lines*iterations:
- sentences_text = sentences_text+' '+line
+ def append_to_file(formatted_filename, formatted_text):
+ hs = open(formatted_filename,"a")
+ hs.write(formatted_text)
+ hs.close()
- j+=1
- afno = str('%04d'%j)
+ sentences_text = sentences_text+' '+line
+ recording_count+=1
+ formatted_audio_file_number = str('%04d'%recording_count)
- # create transcription file
+ # create transcription file -----------------------------------------------
exclude = set(string.punctuation)
sentence = ''.join(ch for ch in line if ch not in exclude)
nice_text = sentence.lower().rstrip()
- formatted_text = "</s> " + nice_text + " </s> (" + model_name + "_" + afno + ")\n"
- formatted_filename = target_directory + "/" +model_name + '.transcription'
- hs = open(formatted_filename,"a")
- hs.write(formatted_text)
- hs.close()
+ formatted_text = "</s> "+nice_text+" </s> ("+model_name+"_"+formatted_audio_file_number+")\n"
+ formatted_filename = audio_folder + "/" +model_name + '.transcription'
+
+ append_to_file(formatted_filename, formatted_text)
- #create fileid file
- formatted_text = model_name + "_" + afno + "\n"
- formatted_filename = target_directory + "/" +model_name + '.fileids'
- hs = open(formatted_filename,"a")
- hs.write(formatted_text)
- hs.close()
+ #create fileid file -------------------------------------------------------
+ formatted_text = model_name + "_" + formatted_audio_file_number + "\n"
+ formatted_filename = audio_folder + "/" +model_name + '.fileids'
+
+ append_to_file(formatted_filename, formatted_text)
- sentences_text = sentences_text+' '+line # why twice? I have no memory of why I did this.
+ sentences_text = sentences_text+' '+line # second append of this line in
+ # the same loop pass, so each
+ # sentence is added twice. I have
+ # no memory of why I did this, and
+ # I don't think it does anything.
+ # TODO: remove and see what happens.
-# CREATE PRONUNCIATION DICTIONARY -----------------------------------------------------------------
+# CREATE PRONUNCIATION DICTIONARY ---------------------------------------------
# create unique, sorted word list from sentence list
words = []
@@ -114,74 +127,75 @@ uwords = sorted(list(set(words)))
# save word list to file
print("Saving word list to file...")
-uwordsfilename = str(target_directory+'/'+model_name+'.vocab') # correct extension
+uwordsfilename = str(audio_folder+'/'+model_name+'.vocab') # correct extension
uwordsfile = open(uwordsfilename, 'w')
for word in uwords:
uwordsfile.write("%s\n" % word)
# create pronunciation dictionary from word list
-cmudict = []
-print("Opening pronunciation dictionary...")
-with open(pronunciation_dictionary) as f:
- for line in f:
- cmudict.append(line)
-
-pdict = []
-missing_words = []
-l = len(uwords)
-i = 0
-print("Extracting entries corresponding to word list...")
-printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
-
-curr_line = 0
+if create_pronunciation_dictionary:
+ cmudict = []
+ print("Opening pronunciation dictionary...")
+ with open(pronunciation_dictionary) as f:
+ for line in f:
+ cmudict.append(line)
+
+ pdict = []
+ missing_words = []
+ l = len(uwords)
+ i = 0
+ print("Extracting entries corresponding to word list...")
+ printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
-for word in uwords:
-
- wordmatch=False # a counter to help with efficiency
- for line in cmudict[curr_line:]:
-
- regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
+ curr_line = 0
- if re.match(regex_string, line):
- # print("match!")
- ms = re.search(regex_string, line)
- pdict.append(str(ms.group('text')+' '+ms.group('phones')))
- wordmatch=True
+ for word in uwords:
- # if I already made a match and I'm not now, time to break. this allows for finding
- # alternate pronunciations
- elif wordmatch:
- # curr_line +=1
- break
+ wordmatch=False # flag: becomes True once this word has matched a dictionary line
+ for line in cmudict[curr_line:]:
- # curr_line +=1
-
- # check for words the pronunciation dictionary doesn't have & save
- if not wordmatch:
- missing_words.append(word)
+ regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
- i +=1
-
- printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
+ if re.match(regex_string, line):
+ # print("match!")
+ ms = re.search(regex_string, line)
+ pdict.append(str(ms.group('text')+' '+ms.group('phones')))
+ wordmatch=True
+
+ # if I already made a match and I'm not now, time to break. this allows
+ # for finding alternate pronunciations
+ elif wordmatch:
+ # curr_line +=1
+ break
+
+ # curr_line +=1
+
+ # check for words the pronunciation dictionary doesn't have & save
+ if not wordmatch:
+ missing_words.append(word)
-# save pronunciation dictionary to file
-print("Saving pronunciation dictionary to file...")
-pdictfilename = str(target_directory+'/'+model_name+'.dic')
-pdictfile = open(pdictfilename, 'w')
-for word_entry in pdict:
- pdictfile.write("%s\n" % word_entry)
-
-# RECORD LIST OF MISSING WORDS --------------------------------------------------------------------
-
-if missing_words:
- missing_words_filename = str(target_directory+'/'+model_name+'.missing')
- print("\nWord(s) missing from pronunciation dictionary. See ")
- print(missing_words_filename+" for list.")
- mwordsfile = open(missing_words_filename, 'w')
- for word in missing_words:
- mwordsfile.write("%s\n" % word)
+ i +=1
+
+ printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
+
+ # save pronunciation dictionary to file
+ print("Saving pronunciation dictionary to file...")
+ pdictfilename = str(audio_folder+'/'+model_name+'.dic')
+ pdictfile = open(pdictfilename, 'w')
+ for word_entry in pdict:
+ pdictfile.write("%s\n" % word_entry)
+
+# RECORD LIST OF MISSING WORDS ------------------------------------------------
+
+ if missing_words:
+ missing_words_filename = str(audio_folder+'/'+model_name+'.missing')
+ print("\nWord(s) missing from pronunciation dictionary. See ")
+ print(missing_words_filename+" for list.")
+ mwordsfile = open(missing_words_filename, 'w')
+ for word in missing_words:
+ mwordsfile.write("%s\n" % word)
-# DONE --------------------------------------------------------------------------------------------
+# DONE ------------------------------------------------------------------------
print("Data files created.")