‹ projects

vmc

a voice model creator for CMU Sphinx
Log | Files | Refs | README | LICENSE

format_text.py (8708B)


      1 #!/usr/bin/python3
      2 # 
      3 # DESCRIPTION -----------------------------------------------------------------
      4 # 
      5 #       Creates a number of text files dependent on a sentence file which are 
      6 #       required for building a CMU Sphinx voice model.  Also uses the extended
      7 #       PocketSphinx pronunciation dictionary.
      8 # 
      9 #       Note that the target directory is the directory where the files should 
     10 #       be saved into. This should be similar to the directory the initial 
     11 #       command was given from within.
     12 #
     13 #       The last two options in the usage example are optional. The 'number of 
     14 #       preexisting audio recordings' variable is used for adding to an 
     15 #       existing collection of audio recordings - it controls how the new 
     16 #       .wav files are named (starting from zero, or something higher).  The 
     17 #       last one, 'the fancy sentence list', is only looked for if the other is
     18 #       present.  It is the absolute path of the sentence list that vmc edited
     19 #       the last time it was creating audio files in the given folder.  Instead
     20 #       of starting from scratch, vmc can simply append the new items into that 
     21 #       list.  
     22 #
     23 #       The integration of the new functions should be seamless, though I 
     24 #       anticipate that it will be a royal pain to get it working.
     25 # 
     26 # USAGE | EXAMPLE--------------------------------------------------------------
     27 # 
     28 #       python3 format_text.py /path/to/sentence-file.txt \
     29 #                              model-name \
     30 #                              audio_folder \
     31 #                              iterations \ 
     32 #                              num_of_preexisting_audio_recordings 
     33 # 
     34 # FILES CREATED ---------------------------------------------------------------
     35 #       All are marked for new information to be appended. Therefore, I don't
     36 #       have to provide the old names of any files - just make sure the model
     37 #       names are the same.  Since none of these come with a model name except
     38 #       the .dict file (.dic??), I should be fine with the current 
     39 #       configuration.
     40 #   
     41 #       audio_folder + "/" +model_name + '.transcription'
     42 #       audio_folder + "/" +model_name + '.fileids'
     43 #       
     44 #       audio_folder+'/'+model_name+'.vocab'
     45 #       audio_folder+'/'+model_name+'.dic'       
     46 # 
     47 # IMPORTS =====================================================================
     48 
     49 import pathlib, re, sys, os, string
     50 
     51 # VARIABLE DEFINITIONS ========================================================
     52 
     53 create_pronunciation_dictionary = False # This may not be desirable
     54 
     55 sentence_file = sys.argv[1] # os.path.basename() to get just the filename
     56 model_name = sys.argv[2]
     57 audio_folder = sys.argv[3].rstrip(os.sep)
     58 iterations = int(sys.argv[4])
     59 pronunciation_dictionary = '/opt/vmc/lib/cmudict-en-us.dict'
     60 recording_count = int(sys.argv[5]) # how many audio files already exist
     61 
     62 if recording_count == 0:
     63     append_to_existing=False
     64 else:
     65     append_to_existing=True
     66    
     67 # LOGIC =======================================================================
     68 
     69 # Print iterations progress ---------------------------------------------------
     70 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
     71 
     72     formatStr       = "{0:." + str(decimals) + "f}"
     73     percents        = formatStr.format(100 * (iteration / float(total)))
     74     filledLength    = int(round(barLength * iteration / float(total)))
     75     bar             = '█' * filledLength + '-' * (barLength - filledLength)
     76     sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)),
     77     if iteration == total:
     78         sys.stdout.write('\n')
     79     sys.stdout.flush()
     80 
     81 # create files per-audio recording --------------------------------------------
     82 
     83 sentences_text = ""; lines = []; 
     84 
     85 with open(sentence_file) as f:
     86     for line in f: lines.append(line)
     87 
     88 for line in lines*iterations:
     89 
     90     def append_to_file(formatted_filename, formatted_text):
     91         hs = open(formatted_filename,"a")
     92         hs.write(formatted_text)
     93         hs.close() 
     94 
     95     sentences_text = sentences_text+' '+line
     96     recording_count+=1
     97     formatted_audio_file_number = str('%04d'%recording_count)
     98 
     99     # create transcription file -----------------------------------------------
    100     
    101     # clean up the text 
    102     exclude = set(string.punctuation)
    103     sentence = ''.join(ch for ch in line if ch not in exclude)
    104     nice_text = sentence.lower().rstrip()
    105 
    106     # format text string and file name with file ids 
    107     formatted_text = "</s> "+nice_text+" </s> ("+model_name+"_"+formatted_audio_file_number+")\n"
    108     formatted_filename = audio_folder + "/" +model_name + '.transcription'
    109 
    110     # save into transcription file
    111     append_to_file(formatted_filename, formatted_text)
    112     
    113     #create fileid file -------------------------------------------------------
    114 
    115     # format file id entry and filename
    116     formatted_text = model_name + "_" + formatted_audio_file_number + "\n"
    117     formatted_filename = audio_folder + "/" +model_name + '.fileids'
    118 
    119     # save into fileids file
    120     append_to_file(formatted_filename, formatted_text)
    121 
    122     # ????
    123     sentences_text = sentences_text+' '+line # why twice? I have no memory of 
    124                                              # why I did this.  I don't think 
    125                                              # it does anything, either. TODO:
    126                                              # remove and see what happens.
    127 
    128 # CREATE PRONUNCIATION DICTIONARY ---------------------------------------------
    129 
    130 # this is the same for the new file and the old file
    131 uwordsfilename = str(audio_folder+'/'+model_name+'.vocab') # correct extension
    132 
    133 # create unique, sorted word list from sentence list
    134 words = []; print("Creating unique, sorted word list...")
    135 
    136 # get words from new sentence file
    137 [words.append(word.strip(string.punctuation).upper().rstrip()) for word in sentences_text.split()]
    138 
    139 # add the words from the old word list (if this is appending to an old model)
    140 if append_to_existing:
    141     with open(uwordsfilename) as f:
    142         for word in f: words.append(word.strip(string.punctuation).upper().rstrip())
    143 
    144 # set() uniques the list, sorted() puts them a-z.
    145 uwords = list(filter(None, sorted(list(set(words)))))
    146 
    147 # save word list to file
    148 print("Saving word list to file..."); uwordsfile = open(uwordsfilename, 'w')
    149 for word in uwords:
    150     uwordsfile.write("%s\n" % word)
    151 
    152 # create pronunciation dictionary from word list
    153 if create_pronunciation_dictionary:
    154     cmudict = []; print("Opening pronunciation dictionary...")
    155     with open(pronunciation_dictionary) as f:
    156         for line in f:
    157             cmudict.append(line)
    158 
    159     print("Extracting entries corresponding to word list...")
    160     pdict = []; missing_words = []; l = len(uwords); i = 0; curr_line = 0
    161     printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20) 
    162 
    163     for word in uwords:
    164         
    165         wordmatch=False # a counter to help with efficiency
    166         for line in cmudict[curr_line:]:        
    167             
    168             regex_string = str('^(?P<text>'+str(word.lower()) + '(\(\d\))?)( |\t)(?P<phones>.+)$')
    169 
    170             if re.match(regex_string, line):
    171                 # print("match!")
    172                 ms = re.search(regex_string, line)
    173                 pdict.append(str(ms.group('text')+' '+ms.group('phones')))
    174                 wordmatch=True
    175             
    176             # if I already made a match and I'm not now, time to break. this allows
    177             # for finding alternate pronunciations
    178             elif wordmatch: 
    179                 # curr_line +=1
    180                 break
    181                 
    182             # curr_line +=1
    183         
    184         # check for words the pronunciation dictionary doesn't have & save
    185         if not wordmatch:
    186             missing_words.append(word)
    187 
    188         i +=1
    189         
    190         printProgress (i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 2, barLength = 20)
    191 
    192     # save pronunciation dictionary to file
    193     print("Saving pronunciation dictionary to file...")
    194     pdictfilename = str(audio_folder+'/'+model_name+'.dic')
    195     pdictfile = open(pdictfilename, 'w')
    196     for word_entry in pdict:
    197         pdictfile.write("%s\n" % word_entry)
    198 
    199 # RECORD LIST OF MISSING WORDS ------------------------------------------------
    200 
    201     if missing_words:
    202         missing_words_filename = str(audio_folder+'/'+model_name+'.missing')
    203         print("\nWord(s) missing from pronunciation dictionary. See ")
    204         print(missing_words_filename+" for list.")
    205         mwordsfile = open(missing_words_filename, 'w')
    206         for word in missing_words:
    207             mwordsfile.write("%s\n" % word)
    208             
    209 # DONE ------------------------------------------------------------------------
    210 
    211 print("Data files created.")
    212