%pylab --no-import-all inline
execfile('load_data.py')
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['plt'] `%pylab --no-import-all` prevents importing * from pylab and numpy
from IPython.display import HTML
def html_table( tbl ):
s = "<table>"
maxwidth = max( map( len, tbl) )
for row in tbl:
if len(row)==1:
s += "<tr><td colspan=%d>" % maxwidth
s += row[0]
s += "</td></tr>"
else:
s += "<tr><td>" + "</td><td>".join( row ) + "</td></tr>"
s += "</table>"
return s
# Basic info
tbl = [ [ 'Set', 'Spoken sentences', 'Spoken phonemes', 'Speakers' ] ]
for desc, data in zip( ('Training', 'Validation', 'Test'), (train,valid,test) ):
tbl.append( [ desc, str(data.number_of_recorded_sentences()),
str(data.number_of_recorded_phonemes()),
str(data.number_of_distinct_speakers()) ] )
assert( len(np.intersect1d( train.spkr, test.spkr))==0 )
assert( len(np.intersect1d( valid.spkr, test.spkr))==0 )
assert( len(np.intersect1d( train.spkr, valid.spkr))==0 )
tbl.append( [ 'Train, valid and test set have no speakers in common' ] )
HTML( html_table( tbl ) )
Set | Spoken sentences | Spoken phonemes | Speakers |
Training | 4120 | 158084 | 412 |
Validation | 500 | 18996 | 50 |
Test | 1680 | 64145 | 168 |
Train, valid and test set have no speakers in common |
# List of all phonemes
tbl = [ ['List of all 61 phonemes' ]]
tbl.append( phonemes[0:31] )
tbl.append( phonemes[31:] )
print html_table(tbl)
HTML( html_table(tbl) )
<table><tr><td colspan=31>List of all 61 phonemes</td></tr><tr><td>iy</td><td>ch</td><td>el</td><td>tcl</td><td>h#</td><td>pcl</td><td>bcl</td><td>zh</td><td>th</td><td>dh</td><td>kcl</td><td>hv</td><td>hh</td><td>dx</td><td>ax-h</td><td>em</td><td>d</td><td>b</td><td>ux</td><td>f</td><td>uw</td><td>l</td><td>n</td><td>p</td><td>r</td><td>t</td><td>v</td><td>z</td><td>aa</td><td>ix</td><td>en</td></tr><tr><td>ae</td><td>eh</td><td>ah</td><td>ao</td><td>ih</td><td>ey</td><td>aw</td><td>ay</td><td>ax</td><td>er</td><td>pau</td><td>eng</td><td>gcl</td><td>ng</td><td>nx</td><td>uh</td><td>dcl</td><td>w</td><td>ow</td><td>jh</td><td>axr</td><td>g</td><td>k</td><td>m</td><td>q</td><td>s</td><td>sh</td><td>oy</td><td>epi</td><td>y</td></tr></table>
List of all 61 phonemes | ||||||||||||||||||||||||||||||
iy | ch | el | tcl | h# | pcl | bcl | zh | th | dh | kcl | hv | hh | dx | ax-h | em | d | b | ux | f | uw | l | n | p | r | t | v | z | aa | ix | en |
ae | eh | ah | ao | ih | ey | aw | ay | ax | er | pau | eng | gcl | ng | nx | uh | dcl | w | ow | jh | axr | g | k | m | q | s | sh | oy | epi | y |
phoneme_num_utterances = []
rows = []
for i in range(len(phonemes)):
num_utterances = sum(train.phn[:,2]==i)
row = (num_utterances,phonemes[i])
rows.append( row )
rows.sort(reverse=True)
tbl = [ ['Number of occurences of each phoneme in training set' ] ]
tbl.append( ['Phoneme', 'Occurences' ] )
tbl = tbl + map( lambda a: [a[1], str(a[0])], rows )
HTML( html_table(tbl) )
Number of occurences of each phoneme in training set | |
Phoneme | Occurences |
h# | 8240 |
ix | 7731 |
s | 6680 |
n | 6315 |
iy | 6219 |
tcl | 5951 |
r | 5855 |
kcl | 5230 |
l | 5160 |
ih | 4543 |
dcl | 4400 |
k | 4338 |
t | 3925 |
ae | 3571 |
m | 3518 |
eh | 3448 |
z | 3367 |
q | 3215 |
ax | 3201 |
d | 3157 |
axr | 3013 |
w | 2789 |
aa | 2757 |
ao | 2650 |
dh | 2501 |
dx | 2415 |
pcl | 2380 |
p | 2341 |
ay | 2123 |
ah | 2049 |
ey | 2041 |
sh | 2021 |
f | 1986 |
gcl | 1973 |
b | 1930 |
ow | 1906 |
er | 1808 |
g | 1786 |
v | 1771 |
bcl | 1690 |
ux | 1663 |
y | 1510 |
epi | 1320 |
ng | 1186 |
jh | 1077 |
hv | 1022 |
pau | 868 |
el | 864 |
hh | 854 |
nx | 852 |
ch | 714 |
th | 675 |
aw | 643 |
en | 639 |
oy | 602 |
uw | 506 |
uh | 483 |
ax-h | 343 |
zh | 128 |
em | 108 |
eng | 33 |
import operator
def dict_count_add( dic, key ):
if key in dic:
dic[key] = dic[key] + 1
else:
dic[key] = 1
output = "Number of recordings of each sentence:<br>"
counts = {}
sorted_counts = {}
# Number of unique senences
for desc,dataset in (('training set', train), ('validation set', valid), ('test set', test)):
count = {}
for i in range(len(dataset.x_raw)):
s = tuple(dataset.sentence_idx_to_word_nums(i))
dict_count_add( count, s )
tbl = [ [desc] ]
tbl.append( ['# Sentences that have', '# recordings'] )
for a in range(max(count.values())+1,0,-1):
if sum(array(count.values())==a)!=0:
tbl.append( [ str( sum(array(count.values())==a) ), str(a) ] )
tbl.append( [ str(len(count.values())), '<- total number of unique sentences' ] )
output += html_table(tbl)
sorted_count = sorted(count.iteritems(), key=operator.itemgetter(1),reverse=True)
tbl = [ ["Two most comment sentences in %s" % desc ] ]
tbl.append( [" ".join( word_num_to_word_str(list(sorted_count[0][0]) ) ) ])
tbl.append( [" ".join( word_num_to_word_str(list(sorted_count[1][0]) ) ) ])
output += html_table(tbl) + "<br>"
counts[dataset] = count
sorted_counts[dataset] = sorted_count
assert( len(np.intersect1d(counts[train].keys(),counts[test].keys()))==2 )
assert( len(np.intersect1d(counts[valid].keys(),counts[test].keys()))==2 )
output += "(Training+validiation) and test set have only the two \"special\" sentences in common.<br>"
output += "Training and validiation sets have a further "
output += str(len(np.intersect1d(counts[train].keys(),counts[valid].keys()))-2) + " sentences in common"
HTML(output)
training set | |
# Sentences that have | # recordings |
2 | 412 |
136 | 7 |
144 | 6 |
36 | 5 |
8 | 4 |
6 | 3 |
1 | 2 |
1248 | 1 |
1581 | <- total number of unique sentences |
Two most comment sentences in training set |
she had your dark suit in greasy wash water all year |
don't ask me to carry an oily rag like that |
validation set | |
# Sentences that have | # recordings |
2 | 50 |
6 | 4 |
4 | 3 |
35 | 2 |
294 | 1 |
341 | <- total number of unique sentences |
Two most comment sentences in validation set |
she had your dark suit in greasy wash water all year |
don't ask me to carry an oily rag like that |
test set | |
# Sentences that have | # recordings |
2 | 168 |
114 | 7 |
5 | 6 |
1 | 5 |
1 | 2 |
509 | 1 |
632 | <- total number of unique sentences |
Two most comment sentences in test set |
she had your dark suit in greasy wash water all year |
don't ask me to carry an oily rag like that |
print "Number of phonetic variations of 10 most common sentences"
sorted_count = sorted_counts[train]
# Count phonetic variations of each sentence
for sent_wrds,count in sorted_count[0:10]:
ct_phn = {}
for idx in range(len(train.x_raw)):
if tuple(train.sentence_idx_to_word_nums(idx))==sent_wrds:
phns = tuple(train.sentence_idx_to_phoneme_nums(idx))
dict_count_add( ct_phn, phns )
print len(ct_phn.values()),"unique phonetic variations out of",count,"recordings"
print "Some phonetic variations of most common sentences"
print "Sentence:", " ".join( word_num_to_word_str(list(sorted_count[0][0])) )
to_show = 2;
for i in range(len(train.x_raw)):
if tuple(train.sentence_idx_to_word_nums(i))==sorted_count[0][0]:
print " ".join( train.sentence_idx_to_phoneme_strs(i) )
to_show -= 1
if to_show==0: break
Number of phonetic variations of 10 most common sentences 410 unique phonetic variations out of 412 recordings 400 unique phonetic variations out of 412 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings 7 unique phonetic variations out of 7 recordings Some phonetic variations of most common sentences Sentence: she had your dark suit in greasy wash water all year h# sh iy hv ae dcl jh axr dcl d aa r kcl k s ux dx ix ng gcl g r iy z iy w aa sh epi w ao dx axr q ao l y ih axr h# h# s iy eh dcl d axr dcl d aa r kcl s ux q en gcl g r iy s ix w aa sh epi w ao dx er ao l y ih r h#
# Lengths of phonemes
lengths = []
phoneme_lengths = {}
for phn in phonemes: phoneme_lengths[phn] = []
for idx in range(train.number_of_recorded_phonemes()):
phoneme = train.phoneme_idx_to_phoneme_str(idx)
start, end = train.phoneme_idx_to_offsets(idx)
length = (end-start)/16000.0*1000.0 #Count in milliseconds
phoneme_lengths[phoneme].append( (length,idx) )
def first_element( l ): # Get the first element of each tuple in a list of tuples
return map( lambda x: x[0], l )
medians = map( lambda (key,val): (numpy.median( first_element( val )),key), phoneme_lengths.iteritems() )
medians.sort(reverse=True)
import pylab as plt
plt.figure( figsize=(30,5) )
axis([0, 62, 0, 500 ])
plt.boxplot( map( lambda x: first_element( phoneme_lengths[x[1]]), medians), vert=True )
plt.xticks(range(1,62), map( lambda x: x[1], medians) )
plt.xlabel('Phoneme')
plt.ylabel('Length (ms)')
print "longest and shortest duration of h# phoneme:", min(phoneme_lengths['h#']), max(phoneme_lengths['h#'])
print "longest and shortest duration of pau phoneme:", min(phoneme_lengths['pau']), max(phoneme_lengths['pau'])
plt.show()
longest and shortest duration of h# phoneme: (2.0, 98932) (2996.875, 156710) longest and shortest duration of pau phoneme: (22.5625, 29185) (652.5625, 3828)