Delete format_text.py - vmc - a voice model creator for CMU Sphinx

commit 0a463e90f927f1084278e0ee6cb69ae7cb90994b
parent 155cdcb301f4523889278f33fec4a8eaba4b8c62
Author: umhau <umhau@users.noreply.github.com>
Date:   Tue,  1 Nov 2016 19:09:23 -0400

Delete format_text.py
Diffstat:
D format_text.py  | 174 -------------------------------------------------------------------------------

1 file changed, 0 insertions(+), 174 deletions(-)
diff --git a/format_text.py b/format_text.py
@@ -1,174 +0,0 @@
-#!/usr/bin/python3
-# 
-# DESCRIPTION
-# 
-#       Creates a number of text files dependent on a sentence file which are required for building
-#       a CMU Sphinx voice model.  Also uses the extended PocketSphinx pronunciation dictionary.
-# 
-#       Note that the target directory is the directory where the files should be saved into. This 
-#       should be similar to the directory the initial command was given from within.
-# 
-# USAGE
-# 
-#       python3 format_text.py /path/to/sentence-file.txt model-name target-directory
-# 
-# EXAMPLE
-# 
-#       python3 /opt/vmc/functions/format_text.py ~/sentence-file.txt model-name target-directory
-#
-# DEPENDENCIES
-# 
-#       python3
-# 
-# IMPORTS =========================================================================================
-
-import pathlib, re, sys, os, string
-
-# VARIABLE DEFINITIONS ============================================================================
-
-sentence_file = sys.argv[1] # os.path.basename() to get just the filename
-
-model_name = sys.argv[2]
-
-target_directory = sys.argv[3].rstrip(os.sep)
-
-pronunciation_dictionary = '/opt/vmc/tools/cmudict-en-us.dict'
-
-# FUNCTION DEFINITION =============================================================================
-
-# Print iterations progress
-def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
-    """
-    Call in a loop to create terminal progress bar
-    @params:
-        iteration   - Required  : current iteration (Int)
-        total       - Required  : total iterations (Int)
-        prefix      - Optional  : prefix string (Str)
-        suffix      - Optional  : suffix string (Str)
-        decimals    - Optional  : positive number of decimals in percent complete (Int)
-        barLength   - Optional  : character length of bar (Int)
-    """
-    formatStr       = "{0:." + str(decimals) + "f}"
-    percents        = formatStr.format(100 * (iteration / float(total)))
-    filledLength    = int(round(barLength * iteration / float(total)))
-    bar             = '█' * filledLength + '-' * (barLength - filledLength)
-    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)),
-    if iteration == total:
-        sys.stdout.write('\n')
-    sys.stdout.flush()
-
-# LOGIC ===========================================================================================
-
-sentences_text = ""
-
-j=0
-
-with open(sentence_file) as f:
-
-    for line in f:
-
-        sentences_text = sentences_text+' '+line
-
-        j+=1
-        afno = str('%04d'%j)
-
-        # get rid of punctuation
-        exclude = set(string.punctuation)
-        sentence = ''.join(ch for ch in line if ch not in exclude)
-
-        nice_text = sentence.lower().rstrip()
-        formatted_text = "</s> " +  nice_text +  " </s> (" + model_name + "_" + afno + ")\n"
-        # formatted_text = nice_text+"\n"
-        formatted_filename = target_directory + "/" +model_name + '.transcription'             
-        hs = open(formatted_filename,"a")
-        hs.write(formatted_text)
-        hs.close() 
-
-        #fileid
-        formatted_text = model_name + "_" + afno + "\n"
-        formatted_filename = target_directory + "/" +model_name + '.fileids'
-        hs = open(formatted_filename,"a")
-        hs.write(formatted_text)
-        hs.close() 
-
-        sentences_text = sentences_text+' '+line
-
-# def sentence_parsing(sentences_text, model_name, sentence_file, pronunciation_dictionary):
-
-# create unique, sorted word list from sentence list
-words = []
-print("Creating unique, sorted word list...")
-[words.append(word.strip(string.punctuation).upper()) for word in sentences_text.split()]
-# set() uniques the list, sorted() puts them a-z.
-uwords = sorted(list(set(words))) 
-
-# save word list to file
-print("Saving word list to file...")
-uwordsfilename = str(target_directory+'/'+model_name+'.vocab') # correct extension
-uwordsfile = open(uwordsfilename, 'w')
-for word in uwords:
-    uwordsfile.write("%s\n" % word)
-
-# create pronunciation dictionary from word list
-cmudict = []
-print("Opening pronunciation dictionary...")
-with open(pronunciation_dictionary) as f:
-    for line in f:
-        cmudict.append(line)
-
-pdict = []
-missing_words = []
-l = len(uwords)
-i = 0
-print("Extracting entries corresponding to word list...")
-printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
-
-curr_line = 0
-
-for word in uwords:
-    
-    wordmatch=False # a counter to help with efficiency
-    for line in cmudict[curr_line:]:        
-        
-        regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
-
-        if re.match(regex_string, line):
-            # print("match!")
-            ms = re.search(regex_string, line)
-            pdict.append(str(ms.group('text')+' '+ms.group('phones')))
-            wordmatch=True
-        
-        # if I already made a match and I'm not now, time to break. this allows for finding 
-        # alternate pronunciations
-        elif wordmatch: 
-            # curr_line +=1
-            break
-            
-        # curr_line +=1
-    
-    # check for words the pronunciation dictionary doesn't have & save
-    if not wordmatch:
-        missing_words.append(word)
-
-    i +=1
-    printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
-
-# save missing words list to file
-if missing_words:
-    missing_words_filename = str(target_directory+'/'+model_name+'.missing')
-    print("\nWord(s) missing from pronunciation dictionary. See ")
-    print(missing_words_filename+" for list.")
-    mwordsfile = open(missing_words_filename, 'w')
-    for word in missing_words:
-        mwordsfile.write("%s\n" % word)
-            
-# save pronunciation dictionary to file
-print("Saving pronunciation dictionary to file...")
-pdictfilename = str(target_directory+'/'+model_name+'.dic')
-pdictfile = open(pdictfilename, 'w')
-for word_entry in pdict:
-    pdictfile.write("%s\n" % word_entry)
-
-# final instructions
-print("Data files created.")
-

‹ projects	vmc a voice model creator for CMU Sphinx
	Log \| Files \| Refs \| README \| LICENSE