‹ projects

vmc

a voice model creator for CMU Sphinx
Log | Files | Refs | README | LICENSE

commit 437e97e470561bc8beaed55e125418e2919b1206
parent af2f5a5dd7dde1f8be7bc03f9c998bf97a6fad47
Author: umhau <umhau@users.noreply.github.com>
Date:   Wed,  2 Nov 2016 17:58:51 -0400

multiple recording iterations now used in training

Previously, the extra iterations of voice recordings weren't being used in training the voice model. Now they should be.
Diffstat:
M functions/format_text.py | 39 +++++++++++++++++++++++----------------
1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/functions/format_text.py b/functions/format_text.py @@ -10,11 +10,11 @@ # # USAGE # -# python3 format_text.py /path/to/sentence-file.txt model-name target-directory +# python3 format_text.py /path/to/sentence-file.txt model-name target-directory iterations # # EXAMPLE # -# python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory +# python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory 2 # # DEPENDENCIES # @@ -32,6 +32,8 @@ model_name = sys.argv[2] target_directory = sys.argv[3].rstrip(os.sep) +iterations = int(sys.argv[4]) + pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict' # FUNCTION DEFINITION ============================================================================= @@ -59,32 +61,34 @@ def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, bar # LOGIC =========================================================================================== + +# FILEID AND TRANSCRIPTION FILES ------------------------------------------------------------------ + sentences_text = "" j=0 with open(sentence_file) as f: - for line in f: + for line in f*iterations: sentences_text = sentences_text+' '+line j+=1 afno = str('%04d'%j) - # get rid of punctuation + # create transcription file exclude = set(string.punctuation) sentence = ''.join(ch for ch in line if ch not in exclude) nice_text = sentence.lower().rstrip() formatted_text = "</s> " + nice_text + " </s> (" + model_name + "_" + afno + ")\n" - # formatted_text = nice_text+"\n" formatted_filename = target_directory + "/" +model_name + '.transcription' hs = open(formatted_filename,"a") hs.write(formatted_text) hs.close() - - #fileid + + #create fileid file formatted_text = model_name + "_" + afno + "\n" formatted_filename = target_directory + "/" +model_name + '.fileids' hs = open(formatted_filename,"a") @@ -93,7 +97,7 @@ with open(sentence_file) as f: sentences_text = sentences_text+' '+line -# def 
sentence_parsing(sentences_text, model_name, sentence_file, pronunciation_dictionary): +# CREATE PRONUNCIATION DICTIONARY ----------------------------------------------------------------- # create unique, sorted word list from sentence list words = [] @@ -151,9 +155,18 @@ for word in uwords: missing_words.append(word) i +=1 + printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20) -# save missing words list to file +# save pronunciation dictionary to file +print("Saving pronunciation dictionary to file...") +pdictfilename = str(target_directory+'/'+model_name+'.dic') +pdictfile = open(pdictfilename, 'w') +for word_entry in pdict: + pdictfile.write("%s\n" % word_entry) + +# RECORD LIST OF MISSING WORDS -------------------------------------------------------------------- + if missing_words: missing_words_filename = str(target_directory+'/'+model_name+'.missing') print("\nWord(s) missing from pronunciation dictionary. See ") @@ -162,13 +175,7 @@ if missing_words: for word in missing_words: mwordsfile.write("%s\n" % word) -# save pronunciation dictionary to file -print("Saving pronunciation dictionary to file...") -pdictfilename = str(target_directory+'/'+model_name+'.dic') -pdictfile = open(pdictfilename, 'w') -for word_entry in pdict: - pdictfile.write("%s\n" % word_entry) +# DONE -------------------------------------------------------------------------------------------- -# final instructions print("Data files created.")