format_text.py (8708B)
#!/usr/bin/python3
#
# DESCRIPTION -----------------------------------------------------------------
#
#   Creates a number of text files dependent on a sentence file which are
#   required for building a CMU Sphinx voice model. Also uses the extended
#   PocketSphinx pronunciation dictionary.
#
#   Note that the target directory is the directory where the files should
#   be saved into. This should be similar to the directory the initial
#   command was given from within.
#
#   The 'number of preexisting audio recordings' argument is optional (it
#   defaults to 0). It is used for adding to an existing collection of
#   audio recordings - it controls how the new .wav files are named
#   (starting from zero, or something higher). A non-zero value also makes
#   the script merge the words of the existing .vocab file into the new
#   word list.
#
#   An earlier design also planned a further optional argument, 'the fancy
#   sentence list': the absolute path of the sentence list that vmc edited
#   the last time it was creating audio files in the given folder, so that
#   vmc could append the new items into that list instead of starting from
#   scratch. NOTE: this script does not read such an argument.
#
# USAGE | EXAMPLE--------------------------------------------------------------
#
#   python3 format_text.py /path/to/sentence-file.txt \
#                          model-name \
#                          audio_folder \
#                          iterations \
#                          [num_of_preexisting_audio_recordings]
#
# FILES CREATED ---------------------------------------------------------------
#   The per-recording files (.transcription, .fileids) are opened in append
#   mode, so the old names of any files do not have to be provided - just
#   make sure the model names are the same. The word list and the
#   pronunciation dictionary are rewritten in full on every run.
#
#   audio_folder + "/" + model_name + '.transcription'
#   audio_folder + "/" + model_name + '.fileids'
#
#   audio_folder + '/' + model_name + '.vocab'
#   audio_folder + '/' + model_name + '.dic'      (only when enabled)
#   audio_folder + '/' + model_name + '.missing'  (only when words are missing)
#
# IMPORTS =====================================================================

import os
import pathlib
import re
import string
import sys

# VARIABLE DEFINITIONS ========================================================

create_pronunciation_dictionary = False  # This may not be desirable
pronunciation_dictionary = '/opt/vmc/lib/cmudict-en-us.dict'

_USAGE = ("usage: format_text.py sentence-file model-name audio_folder "
          "iterations [num_of_preexisting_audio_recordings]")

# Deletes every ASCII punctuation character in a single str.translate() pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# One dictionary line: "<word>[(<digit>)]<space or tab><phones>". The lazy
# \S+? keeps an alternate-pronunciation marker such as "(2)" out of 'word'
# while 'text' still carries it.
_DICTIONARY_ENTRY = re.compile(
    r'^(?P<text>(?P<word>\S+?)(\(\d\))?)( |\t)(?P<phones>.+)$')

# LOGIC =======================================================================


# Print iterations progress ---------------------------------------------------
def printProgress(iteration, total, prefix='', suffix='', decimals=1,
                  barLength=100):
    """Redraw a one-line text progress bar on stdout.

    iteration -- number of items finished so far
    total     -- number of items overall; 0 is drawn as a completed bar
    prefix    -- text printed before the bar
    suffix    -- text printed after the percentage
    decimals  -- digits after the decimal point of the percentage
    barLength -- width of the bar in characters
    """
    if total:
        percent_done = 100 * (iteration / float(total))
        filled_length = int(round(barLength * iteration / float(total)))
    else:
        # Nothing to process: avoid dividing by zero and show a full bar.
        percent_done = 100.0
        filled_length = barLength

    percents = ("{0:." + str(decimals) + "f}").format(percent_done)
    bar = '█' * filled_length + '-' * (barLength - filled_length)
    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix))
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()


def _clean_sentence(line):
    """Return `line` lower-cased, without punctuation or trailing whitespace."""
    return line.translate(_PUNCTUATION_TABLE).lower().rstrip()


def _normalize_word(raw):
    """Return `raw` upper-cased with surrounding punctuation removed."""
    return raw.strip(string.punctuation).upper().rstrip()


def _write_lines(path, items):
    """Overwrite `path` with one item per line."""
    with open(path, 'w') as handle:
        handle.writelines("%s\n" % item for item in items)


# create files per-audio recording --------------------------------------------
def _append_recording_entries(sentences, iterations, model_name, audio_folder,
                              recording_count):
    """Append one transcription line and one file id per recording.

    Every sentence is used `iterations` times. Numbering continues after
    `recording_count`. Returns the number of the last recording written.
    """
    transcription_path = audio_folder + "/" + model_name + '.transcription'
    fileids_path = audio_folder + "/" + model_name + '.fileids'

    # Both files are opened once instead of once per sentence.
    with open(transcription_path, "a") as transcription, \
            open(fileids_path, "a") as fileids:
        for sentence in sentences * iterations:
            recording_count += 1
            file_id = '%s_%04d' % (model_name, recording_count)
            # Utterances open with <s> and close with </s>; the previous
            # version wrote the closing marker at both ends.
            transcription.write(
                "<s> " + _clean_sentence(sentence) + " </s> (" + file_id + ")\n")
            fileids.write(file_id + "\n")

    return recording_count


def _build_vocabulary(sentences, vocab_path, append_to_existing):
    """Return the sorted, unique, upper-case word list.

    The words come from `sentences`; when `append_to_existing` is true the
    words already stored in `vocab_path` are merged in as well.
    """
    words = {_normalize_word(token)
             for sentence in sentences
             for token in sentence.split()}

    # add the words from the old word list (if this is appending to an old
    # model)
    if append_to_existing:
        try:
            with open(vocab_path) as vocab_file:
                words.update(_normalize_word(entry) for entry in vocab_file)
        except FileNotFoundError:
            print("No existing word list at " + vocab_path
                  + "; starting a new one.")

    words.discard('')  # tokens that consisted of punctuation only
    return sorted(words)


def _index_dictionary(dictionary_path):
    """Map each dictionary word to its 'text phones' entries, in file order."""
    entries = {}
    with open(dictionary_path) as dictionary_file:
        for line in dictionary_file:
            found = _DICTIONARY_ENTRY.match(line)
            if found:
                entry = found.group('text') + ' ' + found.group('phones')
                entries.setdefault(found.group('word'), []).append(entry)
    return entries


def _extract_pronunciations(vocabulary, dictionary_path):
    """Look every vocabulary word up in the pronunciation dictionary.

    Returns (pronunciations, missing_words): the dictionary entries of the
    words that were found (alternate pronunciations included) and the
    words that have no entry.
    """
    print("Opening pronunciation dictionary...")
    # Reading the dictionary once into a dict replaces a full scan per word.
    entries = _index_dictionary(dictionary_path)

    print("Extracting entries corresponding to word list...")
    pronunciations = []
    missing_words = []
    total = len(vocabulary)
    printProgress(0, total, prefix='Progress:', suffix='Complete',
                  decimals=2, barLength=20)

    for done, word in enumerate(vocabulary, start=1):
        # The dictionary is lower-case, the word list is upper-case.
        found = entries.get(word.lower())
        if found:
            pronunciations.extend(found)
        else:
            # check for words the pronunciation dictionary doesn't have
            missing_words.append(word)
        printProgress(done, total, prefix='Progress:', suffix='Complete',
                      decimals=2, barLength=20)

    return pronunciations, missing_words


def main(argv=None, build_dictionary=None, dictionary_path=None):
    """Create the Sphinx training text files described in the file header.

    argv             -- command-line arguments without the program name;
                        defaults to sys.argv[1:]
    build_dictionary -- whether to write the .dic / .missing files;
                        defaults to create_pronunciation_dictionary
    dictionary_path  -- pronunciation dictionary to read; defaults to
                        pronunciation_dictionary

    Exits with a usage message when fewer than four arguments are given.
    Raises ValueError when a numeric argument is not an integer.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        sys.exit(_USAGE)
    if build_dictionary is None:
        build_dictionary = create_pronunciation_dictionary
    if dictionary_path is None:
        dictionary_path = pronunciation_dictionary

    sentence_file = args[0]
    model_name = args[1]
    audio_folder = args[2].rstrip(os.sep)
    iterations = int(args[3])
    # how many audio files already exist (optional, defaults to none)
    recording_count = int(args[4]) if len(args) > 4 else 0
    append_to_existing = recording_count != 0

    with open(sentence_file) as f:
        sentences = f.readlines()

    _append_recording_entries(sentences, iterations, model_name, audio_folder,
                              recording_count)

    # CREATE PRONUNCIATION DICTIONARY -----------------------------------------

    # this is the same for the new file and the old file
    uwordsfilename = audio_folder + '/' + model_name + '.vocab'

    print("Creating unique, sorted word list...")
    uwords = _build_vocabulary(sentences, uwordsfilename, append_to_existing)

    print("Saving word list to file...")
    _write_lines(uwordsfilename, uwords)

    if build_dictionary:
        pdict, missing_words = _extract_pronunciations(uwords, dictionary_path)

        print("Saving pronunciation dictionary to file...")
        _write_lines(audio_folder + '/' + model_name + '.dic', pdict)

        # RECORD LIST OF MISSING WORDS ----------------------------------------

        if missing_words:
            missing_words_filename = audio_folder + '/' + model_name + '.missing'
            print("\nWord(s) missing from pronunciation dictionary. See ")
            print(missing_words_filename + " for list.")
            _write_lines(missing_words_filename, missing_words)

    # DONE --------------------------------------------------------------------

    print("Data files created.")


if __name__ == "__main__":
    main()