‹ projects

vmc

a voice model creator for CMU Sphinx
Log | Files | Refs | README | LICENSE

commit 46217e670560561c07474b702c6d1216ee9c6e87
parent 0063f1a59cbf7f822ea5b6fb5146193a6936e4f1
Author: umhau <umhau@alum.gcc.edu>
Date:   Wed, 14 Jun 2017 23:44:09 -0400

cleaned; made able to recognize preexisting audio files and work with them.

Diffstat:
M lib/format_text.py | 252 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
1 file changed, 133 insertions(+), 119 deletions(-)

diff --git a/lib/format_text.py b/lib/format_text.py @@ -1,55 +1,73 @@ #!/usr/bin/python3 # -# DESCRIPTION +# DESCRIPTION ----------------------------------------------------------------- # -# Creates a number of text files dependent on a sentence file which are required for building -# a CMU Sphinx voice model. Also uses the extended PocketSphinx pronunciation dictionary. +# Creates a number of text files dependent on a sentence file which are +# required for building a CMU Sphinx voice model. Also uses the extended +# PocketSphinx pronunciation dictionary. # -# Note that the target directory is the directory where the files should be saved into. This -# should be similar to the directory the initial command was given from within. -# -# USAGE -# -# python3 format_text.py /path/to/sentence-file.txt model-name target-directory iterations +# Note that the target directory is the directory where the files should +# be saved into. This should be similar to the directory the initial +# command was given from within. +# +# The last two options in the usage example are optional. The 'number of +# preexisting audio recordings' variable is used for adding to an +# existing collection of audio recordings - it controls how the new +# .wav files are named (starting from zero, or something higher). The +# last one, 'the fancy sentence list', is only looked for if the other is +# present. It is the absolute path of the sentence list that vmc edited +# the last time it was creating audio files in the given folder. Instead +# of starting from scratch, vmc can simply append the new items into that +# list. +# +# The integration of the new functions should be seamless, though I +# anticipate that it will be a royal pain to get it working. 
# -# EXAMPLE +# USAGE | EXAMPLE-------------------------------------------------------------- # -# python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory 2 -# -# DEPENDENCIES +# python3 format_text.py /path/to/sentence-file.txt \ +# model-name \ +# audio_folder \ +# iterations \ +# num_of_preexisting_audio_recordings # -# python3 +# FILES CREATED --------------------------------------------------------------- +# All are marked for new information to be appended. Therefore, I don't +# have to provide the old names of any files - just make sure the model +# names are the same. Since none of these come with a model name except +# the .dict file (.dic??), I should be fine with the current +# configuration. +# +# audio_folder + "/" +model_name + '.transcription' +# audio_folder + "/" +model_name + '.fileids' +# +# audio_folder+'/'+model_name+'.vocab' +# audio_folder+'/'+model_name+'.dic' # -# IMPORTS ========================================================================================= +# IMPORTS ===================================================================== import pathlib, re, sys, os, string -# VARIABLE DEFINITIONS ============================================================================ +# VARIABLE DEFINITIONS ======================================================== -sentence_file = sys.argv[1] # os.path.basename() to get just the filename +create_pronunciation_dictionary = False # This may not be desirable +sentence_file = sys.argv[1] # os.path.basename() to get just the filename model_name = sys.argv[2] - -target_directory = sys.argv[3].rstrip(os.sep) - +audio_folder = sys.argv[3].rstrip(os.sep) iterations = int(sys.argv[4]) +pronunciation_dictionary = '/opt/vmc/lib/cmudict-en-us.dict' -pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict' +try: + recording_count = int(sys.argv[5]) # how many audio files already exist +except IndexError: + recording_count = 0 -# FUNCTION DEFINITION 
============================================================================= +# LOGIC ======================================================================= -# Print iterations progress +# Print iterations progress --------------------------------------------------- def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): - """ - Call in a loop to create terminal progress bar - @params: - iteration - Required : current iteration (Int) - total - Required : total iterations (Int) - prefix - Optional : prefix string (Str) - suffix - Optional : suffix string (Str) - decimals - Optional : positive number of decimals in percent complete (Int) - barLength - Optional : character length of bar (Int) - """ + formatStr = "{0:." + str(decimals) + "f}" percents = formatStr.format(100 * (iteration / float(total))) filledLength = int(round(barLength * iteration / float(total))) @@ -59,51 +77,46 @@ def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, bar sys.stdout.write('\n') sys.stdout.flush() -# LOGIC =========================================================================================== - - -# FILEID AND TRANSCRIPTION FILES ------------------------------------------------------------------ +# create files per-audio recording -------------------------------------------- -sentences_text = "" - -j=0 -lines = [] +sentences_text = ""; lines = []; with open(sentence_file) as f: + for line in f: lines.append(line) - for line in f: - lines.append(line) - -# this was separated out so I can easily multiply the lines by the iterations...doesn't work to do -# f*iterations (in above loop). 
for line in lines*iterations: - sentences_text = sentences_text+' '+line + def append_to_file(formatted_filename, formatted_text): + hs = open(formatted_filename,"a") + hs.write(formatted_text) + hs.close() - j+=1 - afno = str('%04d'%j) + sentences_text = sentences_text+' '+line + recording_count+=1 + formatted_audio_file_number = str('%04d'%recording_count) - # create transcription file + # create transcription file ----------------------------------------------- exclude = set(string.punctuation) sentence = ''.join(ch for ch in line if ch not in exclude) nice_text = sentence.lower().rstrip() - formatted_text = "</s> " + nice_text + " </s> (" + model_name + "_" + afno + ")\n" - formatted_filename = target_directory + "/" +model_name + '.transcription' - hs = open(formatted_filename,"a") - hs.write(formatted_text) - hs.close() + formatted_text = "</s> "+nice_text+" </s> ("+model_name+"_"+formatted_audio_file_number+")\n" + formatted_filename = audio_folder + "/" +model_name + '.transcription' + + append_to_file(formatted_filename, formatted_text) - #create fileid file - formatted_text = model_name + "_" + afno + "\n" - formatted_filename = target_directory + "/" +model_name + '.fileids' - hs = open(formatted_filename,"a") - hs.write(formatted_text) - hs.close() + #create fileid file ------------------------------------------------------- + formatted_text = model_name + "_" + formatted_audio_file_number + "\n" + formatted_filename = audio_folder + "/" +model_name + '.fileids' + + append_to_file(formatted_filename, formatted_text) - sentences_text = sentences_text+' '+line # why twice? I have no memory of why I did this. + sentences_text = sentences_text+' '+line # why twice? I have no memory of + # why I did this. I don't think + # it does anything, either. TODO: + # remove and see what happens. 
-# CREATE PRONUNCIATION DICTIONARY ----------------------------------------------------------------- +# CREATE PRONUNCIATION DICTIONARY --------------------------------------------- # create unique, sorted word list from sentence list words = [] @@ -114,74 +127,75 @@ uwords = sorted(list(set(words))) # save word list to file print("Saving word list to file...") -uwordsfilename = str(target_directory+'/'+model_name+'.vocab') # correct extension +uwordsfilename = str(audio_folder+'/'+model_name+'.vocab') # correct extension uwordsfile = open(uwordsfilename, 'w') for word in uwords: uwordsfile.write("%s\n" % word) # create pronunciation dictionary from word list -cmudict = [] -print("Opening pronunciation dictionary...") -with open(pronunciation_dictionary) as f: - for line in f: - cmudict.append(line) - -pdict = [] -missing_words = [] -l = len(uwords) -i = 0 -print("Extracting entries corresponding to word list...") -printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20) - -curr_line = 0 +if create_pronunciation_dictionary: + cmudict = [] + print("Opening pronunciation dictionary...") + with open(pronunciation_dictionary) as f: + for line in f: + cmudict.append(line) + + pdict = [] + missing_words = [] + l = len(uwords) + i = 0 + print("Extracting entries corresponding to word list...") + printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20) -for word in uwords: - - wordmatch=False # a counter to help with efficiency - for line in cmudict[curr_line:]: - - regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$') + curr_line = 0 - if re.match(regex_string, line): - # print("match!") - ms = re.search(regex_string, line) - pdict.append(str(ms.group('text')+' '+ms.group('phones'))) - wordmatch=True + for word in uwords: - # if I already made a match and I'm not now, time to break. 
this allows for finding - # alternate pronunciations - elif wordmatch: - # curr_line +=1 - break + wordmatch=False # a counter to help with efficiency + for line in cmudict[curr_line:]: - # curr_line +=1 - - # check for words the pronunciation dictionary doesn't have & save - if not wordmatch: - missing_words.append(word) + regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$') - i +=1 - - printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20) + if re.match(regex_string, line): + # print("match!") + ms = re.search(regex_string, line) + pdict.append(str(ms.group('text')+' '+ms.group('phones'))) + wordmatch=True + + # if I already made a match and I'm not now, time to break. this allows + # for finding alternate pronunciations + elif wordmatch: + # curr_line +=1 + break + + # curr_line +=1 + + # check for words the pronunciation dictionary doesn't have & save + if not wordmatch: + missing_words.append(word) -# save pronunciation dictionary to file -print("Saving pronunciation dictionary to file...") -pdictfilename = str(target_directory+'/'+model_name+'.dic') -pdictfile = open(pdictfilename, 'w') -for word_entry in pdict: - pdictfile.write("%s\n" % word_entry) - -# RECORD LIST OF MISSING WORDS -------------------------------------------------------------------- - -if missing_words: - missing_words_filename = str(target_directory+'/'+model_name+'.missing') - print("\nWord(s) missing from pronunciation dictionary. 
See ") - print(missing_words_filename+" for list.") - mwordsfile = open(missing_words_filename, 'w') - for word in missing_words: - mwordsfile.write("%s\n" % word) + i +=1 + + printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20) + + # save pronunciation dictionary to file + print("Saving pronunciation dictionary to file...") + pdictfilename = str(audio_folder+'/'+model_name+'.dic') + pdictfile = open(pdictfilename, 'w') + for word_entry in pdict: + pdictfile.write("%s\n" % word_entry) + +# RECORD LIST OF MISSING WORDS ------------------------------------------------ + + if missing_words: + missing_words_filename = str(audio_folder+'/'+model_name+'.missing') + print("\nWord(s) missing from pronunciation dictionary. See ") + print(missing_words_filename+" for list.") + mwordsfile = open(missing_words_filename, 'w') + for word in missing_words: + mwordsfile.write("%s\n" % word) -# DONE -------------------------------------------------------------------------------------------- +# DONE ------------------------------------------------------------------------ print("Data files created.")