Notebook

In [228]:

%pylab --no-import-all inline
execfile('load_data.py')

Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['plt']
`%pylab --no-import-all` prevents importing * from pylab and numpy

In [229]:

from IPython.display import HTML
def html_table( tbl ):
    s = "<table>"
    maxwidth = max( map( len, tbl) )
    for row in tbl:
        if len(row)==1:
            s += "<tr><td colspan=%d>" % maxwidth
            s += row[0]
            s += "</td></tr>"
        else:
            s += "<tr><td>" + "</td><td>".join( row ) + "</td></tr>"
    s += "</table>"
    return s

In [230]:

# Basic info
tbl = [ [ 'Set',      'Spoken sentences', 'Spoken phonemes', 'Speakers' ] ]

for desc, data in zip( ('Training', 'Validation', 'Test'), (train,valid,test) ):
    tbl.append( [ desc, str(data.number_of_recorded_sentences()),
                              str(data.number_of_recorded_phonemes()),
                              str(data.number_of_distinct_speakers()) ] )
assert( len(np.intersect1d( train.spkr, test.spkr))==0 )
assert( len(np.intersect1d( valid.spkr, test.spkr))==0 )
assert( len(np.intersect1d( train.spkr, valid.spkr))==0 )
tbl.append( [ 'Train, valid and test set have no speakers in common' ] )

HTML( html_table( tbl ) )

Out[230]:

Set	Spoken sentences	Spoken phonemes	Speakers
Training	4120	158084	412
Validation	500	18996	50
Test	1680	64145	168
Train, valid and test set have no speakers in common

In [238]:

# List of all phonemes
tbl = [ ['List of all 61 phonemes' ]]
tbl.append( phonemes[0:31] )
tbl.append( phonemes[31:] )

print  html_table(tbl) 

HTML( html_table(tbl) )

<table><tr><td colspan=31>List of all 61 phonemes</td></tr><tr><td>iy</td><td>ch</td><td>el</td><td>tcl</td><td>h#</td><td>pcl</td><td>bcl</td><td>zh</td><td>th</td><td>dh</td><td>kcl</td><td>hv</td><td>hh</td><td>dx</td><td>ax-h</td><td>em</td><td>d</td><td>b</td><td>ux</td><td>f</td><td>uw</td><td>l</td><td>n</td><td>p</td><td>r</td><td>t</td><td>v</td><td>z</td><td>aa</td><td>ix</td><td>en</td></tr><tr><td>ae</td><td>eh</td><td>ah</td><td>ao</td><td>ih</td><td>ey</td><td>aw</td><td>ay</td><td>ax</td><td>er</td><td>pau</td><td>eng</td><td>gcl</td><td>ng</td><td>nx</td><td>uh</td><td>dcl</td><td>w</td><td>ow</td><td>jh</td><td>axr</td><td>g</td><td>k</td><td>m</td><td>q</td><td>s</td><td>sh</td><td>oy</td><td>epi</td><td>y</td></tr></table>

Out[238]:

List of all 61 phonemes

tcl

pcl

bcl

kcl

ax-h

pau

eng

gcl

dcl

axr

epi

In [140]:

phoneme_num_utterances = []
rows = []
for i in range(len(phonemes)):
    num_utterances = sum(train.phn[:,2]==i)
    row = (num_utterances,phonemes[i])
    rows.append( row )

rows.sort(reverse=True)

tbl = [ ['Number of occurences of each phoneme in training set' ] ]
tbl.append( ['Phoneme', 'Occurences' ] )
tbl = tbl + map( lambda a: [a[1], str(a[0])], rows )
HTML( html_table(tbl) )

Out[140]:

Number of occurences of each phoneme in training set
Phoneme	Occurences
h#	8240
ix	7731
s	6680
n	6315
iy	6219
tcl	5951
r	5855
kcl	5230
l	5160
ih	4543
dcl	4400
k	4338
t	3925
ae	3571
m	3518
eh	3448
z	3367
q	3215
ax	3201
d	3157
axr	3013
w	2789
aa	2757
ao	2650
dh	2501
dx	2415
pcl	2380
p	2341
ay	2123
ah	2049
ey	2041
sh	2021
f	1986
gcl	1973
b	1930
ow	1906
er	1808
g	1786
v	1771
bcl	1690
ux	1663
y	1510
epi	1320
ng	1186
jh	1077
hv	1022
pau	868
el	864
hh	854
nx	852
ch	714
th	675
aw	643
en	639
oy	602
uw	506
uh	483
ax-h	343
zh	128
em	108
eng	33

In [162]:

import operator
def dict_count_add( dic, key ):
    if key in dic:
        dic[key] = dic[key] + 1
    else:
        dic[key] = 1


output = "Number of recordings of each sentence:<br>"
counts = {}
sorted_counts = {}
# Number of unique senences
for desc,dataset in (('training set', train), ('validation set', valid), ('test set', test)):
    count = {}
    for i in range(len(dataset.x_raw)):
        s = tuple(dataset.sentence_idx_to_word_nums(i))   
        dict_count_add( count, s )
    tbl = [ [desc] ]
    tbl.append( ['# Sentences that have', '# recordings'] )
    for a in range(max(count.values())+1,0,-1):
        if sum(array(count.values())==a)!=0:
            tbl.append( [ str( sum(array(count.values())==a) ), str(a) ] )
    tbl.append( [ str(len(count.values())), '<- total number of unique sentences' ] )
    output += html_table(tbl)
    sorted_count = sorted(count.iteritems(), key=operator.itemgetter(1),reverse=True)
    tbl = [ ["Two most comment sentences in %s" % desc ] ]
    tbl.append( [" ".join( word_num_to_word_str(list(sorted_count[0][0]) ) ) ])
    tbl.append( [" ".join( word_num_to_word_str(list(sorted_count[1][0]) ) ) ])
    output += html_table(tbl) + "<br>"
    
    counts[dataset] = count
    sorted_counts[dataset] = sorted_count


assert( len(np.intersect1d(counts[train].keys(),counts[test].keys()))==2 )
assert( len(np.intersect1d(counts[valid].keys(),counts[test].keys()))==2 )
output += "(Training+validiation) and test set have only the two \"special\" sentences in common.<br>"
output += "Training and validiation sets have a further "
output += str(len(np.intersect1d(counts[train].keys(),counts[valid].keys()))-2) + " sentences in common"

HTML(output)

Out[162]:

Number of recordings of each sentence:

training set
# Sentences that have	# recordings
2	412
136	7
144	6
36	5
8	4
6	3
1	2
1248	1
1581	<- total number of unique sentences

Two most comment sentences in training set

she had your dark suit in greasy wash water all year

don't ask me to carry an oily rag like that

validation set
# Sentences that have	# recordings
2	50
6	4
4	3
35	2
294	1
341	<- total number of unique sentences

Two most comment sentences in validation set

she had your dark suit in greasy wash water all year

don't ask me to carry an oily rag like that

test set
# Sentences that have	# recordings
2	168
114	7
5	6
1	5
1	2
509	1
632	<- total number of unique sentences

Two most comment sentences in test set

she had your dark suit in greasy wash water all year

don't ask me to carry an oily rag like that

(Training+validiation) and test set have only the two "special" sentences in common.
Training and validiation sets have a further 187 sentences in common

In [164]:

print "Number of phonetic variations of 10 most common sentences"
sorted_count = sorted_counts[train]
# Count phonetic variations of each sentence
for sent_wrds,count in sorted_count[0:10]:
    ct_phn = {}
    for idx in range(len(train.x_raw)):
        if tuple(train.sentence_idx_to_word_nums(idx))==sent_wrds:
            phns = tuple(train.sentence_idx_to_phoneme_nums(idx))
            dict_count_add( ct_phn, phns )
    print len(ct_phn.values()),"unique phonetic variations out of",count,"recordings"

print "Some phonetic variations of most common sentences"
print "Sentence:", " ".join( word_num_to_word_str(list(sorted_count[0][0])) )
to_show = 2;
for i in range(len(train.x_raw)):
    if tuple(train.sentence_idx_to_word_nums(i))==sorted_count[0][0]:
        print " ".join( train.sentence_idx_to_phoneme_strs(i) )
        to_show -= 1
        if to_show==0: break

Number of phonetic variations of 10 most common sentences
410 unique phonetic variations out of 412 recordings
400 unique phonetic variations out of 412 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
7 unique phonetic variations out of 7 recordings
Some phonetic variations of most common sentences
Sentence: she had your dark suit in greasy wash water all year
h# sh iy hv ae dcl jh axr dcl d aa r kcl k s ux dx ix ng gcl g r iy z iy w aa sh epi w ao dx axr q ao l y ih axr h#
h# s iy eh dcl d axr dcl d aa r kcl s ux q en gcl g r iy s ix w aa sh epi w ao dx er ao l y ih r h#

In [218]:

# Lengths of phonemes
lengths = []
phoneme_lengths = {}
for phn in phonemes: phoneme_lengths[phn] = []
for idx in range(train.number_of_recorded_phonemes()):
    phoneme = train.phoneme_idx_to_phoneme_str(idx)
    start, end = train.phoneme_idx_to_offsets(idx)
    length = (end-start)/16000.0*1000.0 #Count in milliseconds
    phoneme_lengths[phoneme].append( (length,idx) )

def first_element( l ): # Get the first element of each tuple in a list of tuples
    return map( lambda x: x[0], l )

medians = map( lambda (key,val): (numpy.median( first_element( val )),key), phoneme_lengths.iteritems() ) 
medians.sort(reverse=True)

import pylab as plt
plt.figure( figsize=(30,5) )
axis([0, 62, 0, 500 ])
plt.boxplot( map( lambda x: first_element( phoneme_lengths[x[1]]), medians), vert=True )
plt.xticks(range(1,62), map( lambda x: x[1], medians) )
plt.xlabel('Phoneme')
plt.ylabel('Length (ms)')

print "longest and shortest duration of h# phoneme:", min(phoneme_lengths['h#']), max(phoneme_lengths['h#'])
print "longest and shortest duration of pau phoneme:", min(phoneme_lengths['pau']), max(phoneme_lengths['pau'])

plt.show()

longest and shortest duration of h# phoneme: (2.0, 98932) (2996.875, 156710)
longest and shortest duration of pau phoneme: (22.5625, 29185) (652.5625, 3828)

In [ ]: