GmmLatgenWrapper is a minimalistic extension to the Kaldi toolkit that supports real-time decoding.
We use it through its Python wrapper, PyGmmLatgenWrapper. Both are developed in the pykaldi git repository:
https://github.com/UFAL-DSG/pykaldi
The evaluation scripts are also available online:
%pylab inline
import pickle
import pylab as pl
from collections import namedtuple
from subprocess import call
def cwer(ins, dl, sub, refer_words):
    """Return the word error rate as a percentage.

    The insertion, deletion and substitution counts are summed and the
    total is expressed as a percentage of ``refer_words``.
    """
    error_count = ins + dl + sub
    return (100.0 * error_count) / refer_words
def pdf_color2grey(file_path_in, file_path_out):
    """Run Ghostscript (``gs``) to write a grey-scale copy of a PDF.

    ``file_path_in`` is passed to ``gs`` as the input (``-f``) and
    ``file_path_out`` as the output (``-o``). The exit status of the
    ``gs`` process is not inspected.
    """
    gs_command = [
        'gs',
        '-sDEVICE=pdfwrite',
        '-sProcessColorModel=DeviceGray',
        '-sColorConversionStrategy=Gray',
        '-dOverrideICC',
    ]
    gs_command += ['-o', file_path_out]
    gs_command += ['-f', file_path_in]
    call(gs_command)
# System-level record: measured for 50 wav utterances at once.
SysRecord = namedtuple(
    'SysRecord',
    'wav_scp beam lattice_beam max_active wavlen '
    'decodelen forwardlen backwardlen refer_words ins dl sub ser',
)
# User-level record: measured for every single wav utterance (50 utterances).
UserRecord = namedtuple(
    'UserRecord',
    'wav_name wav_len beam lattice_beam max_active forwardlen backwardlen',
)
from wavaskey import load_wavaskey
from os.path import basename
# Load the per-wav values stored in dec_duration.txt, parsed as floats.
# NOTE(review): presumably these are Google ASR decoding durations keyed by
# wav path (they are used as the Google "forwardlen" later on) -- confirm.
google_dec = load_wavaskey('./dec_duration.txt', float)
# Re-key by bare file name so lookups by wav_name work regardless of directory.
# .items() is used instead of the Python-2-only .iteritems() so this line runs
# unchanged on both Python 2.7 and Python 3.
google_dec = {basename(wav_path): value for wav_path, value in google_dec.items()}
# Alternative result logs (swap in by replacing the path below):
# with open('./all_5931_few_test_30_new_kaldi.pickle.log', 'r') as r:
# with open('./1635.ma5000_wer30.pickle.log', 'r') as r:
#
# NOTE: unpickling can execute arbitrary code; only load logs produced by
# your own evaluation runs.
# The log is opened in binary mode: pickle streams are byte data, and text
# mode breaks on Python 3 and for any non-ASCII pickle protocol.
with open('./1635.maALL_wer30.pickle.log', 'rb') as r:
    # The file holds two consecutively pickled objects, read in order.
    sys_time_r = pickle.load(r)
    user_time_r = pickle.load(r)
# Drop outliers: wavs of max_wav_len seconds or longer, where VAD failed.
max_wav_len = 20.0
user_time_r = [rec for rec in user_time_r if rec.wav_len < max_wav_len]
# Mirror every remaining record as a "Google" record: same wav name and
# length, no decoder parameters, the google_dec value as forwardlen and a
# backwardlen of 0.0.
user_time_google = [
    UserRecord(rec.wav_name, rec.wav_len, None, None, None,
               google_dec[rec.wav_name], 0.0)
    for rec in user_time_r
]
# Distinct values of each decoder parameter, in ascending order.
# Note: beams come from the system records, the other two from user records.
beam_range = sorted(set(rec.beam for rec in sys_time_r))
lat_beam_range = sorted(set(rec.lattice_beam for rec in user_time_r))
max_active_range = sorted(set(rec.max_active for rec in user_time_r))
# list of wav length values for particular fixed setup (all setups use the same wavs).
wav_len_values = sorted(
    urec.wav_len for urec in user_time_r
    if urec.max_active == max_active_range[0]
    and urec.lattice_beam == lat_beam_range[0]
    and urec.beam == beam_range[0])
# Check
wl_dup, bt, beam, lat_beam, max_active = [], [], beam_range[0], lat_beam_range[0], max_active_range[0]
# Number of user records vs. number of parameter combinations: if every
# combination decoded the same wavs, the ratio printed below is an integer.
wav_name_len = len(user_time_r)
exp_num_conf = len(beam_range) * len(lat_beam_range) * len(max_active_range)
# print() with a single argument behaves the same on Python 2 and 3 and matches
# the call style used by the rest of the file.
print('Checking if integer: We used %0.1f wave files in wav.scp' % (wav_name_len / float(exp_num_conf)))
Populating the interactive namespace from numpy and matplotlib Checking if integer: We used 1172.7 wave files in wav.scp
WARNING: pylab import has clobbered these variables: ['rec'] `%pylab --no-import-all` prevents importing * from pylab and numpy
# Summary statistics of the utterance lengths.
print('Number of wavs %d. Min length %f, Max length %f, Avg length %f' % (
    len(wav_len_values),
    min(wav_len_values),
    max(wav_len_values),
    sum(wav_len_values) / len(wav_len_values)))

# Panel 1: sorted wav lengths against their index.
subplot(1, 4, 1)
plot(range(len(wav_len_values)), wav_len_values, '+')
ylabel('wav len')
xlabel('wav index')
title('Wave lengths')

# Panels 2-4: one bar chart per decoder parameter showing the values tested.
# The x axis is only a running index, so it is hidden.
param_panels = [
    (2, 'Forward beam', 'beam values', beam_range),
    (3, 'Lattice beam', 'lattice-beam', lat_beam_range),
    (4, 'Max active', 'Max active', max_active_range),
]
for panel_idx, panel_title, panel_ylabel, param_values in param_panels:
    subplot(1, 4, panel_idx)
    title(panel_title)
    ylabel(panel_ylabel)
    bar(range(len(param_values)), param_values)
    frame = gca()
    frame.axes.get_xaxis().set_visible(False)

tight_layout()
Number of wavs 1219. Min length 0.160000, Max length 17.760000, Avg length 2.030727
# Fixed decoder setup for this figure.
beam, lat_beam, max_active = beam_range[4], lat_beam_range[4], max_active_range[0]

# Forward real-time factor (decoding time / wav length) of every utterance
# decoded with that setup, in ascending order.
rtf = [
    rec.forwardlen / float(rec.wav_len)
    for rec in user_time_r
    if rec.beam == beam
    and rec.lattice_beam == lat_beam
    and rec.max_active == max_active
]
rtf.sort()
sample_count = len(rtf)

# Rank of each sample as a fraction in (0, 1], then as a percentage.
fractions = [float(rank) / sample_count for rank in range(1, sample_count + 1)]
percents = [100 * frac for frac in fractions]

# Lowest 95 % of the samples; not used by the plot drawn below.
rtf_95 = rtf[:int(0.95 * sample_count)]
percents_95 = fractions[:int(0.95 * len(fractions))]

ylim([0, 2.1])
p1, = plot(percents, rtf, '.-.')
print('(b: %0.1f, lb: %0.1f, ma: %d) RTF decoding vs percentile' % (beam, lat_beam, max_active))
title('a')
ylabel('RTF')
xlabel('percentile')

# Reference lines: desired RTF, critical RTF and the 95th percentile.
desired_rtf = [0.6] * len(percents)
critical_rtf = [1.0] * len(percents)
p2, = plot(percents, desired_rtf, 'g--')
p3, = plot(percents, critical_rtf, 'r:')
p4 = axvline(95, color='g')
legend(
    [p1, p2, p3, p4],
    ['RTF', 'Desired 0.6 RTF', 'Critical 1.0 RTF', '95th percentile'],
    loc=2,
    prop={'size': 'small'})
xlim([0, 100])

# Only the PostScript file is written; PDF export and grey-scale conversion
# are disabled.
pdfname = 'frtf_vs_prc.pdf'
#savefig(pdfname, bbox_inches='tight')
#pdf_color2grey(pdfname, 'grey_' + pdfname)
savefig(pdfname + '.ps', bbox_inches='tight')
(b: 12.0, lb: 5.0, ma: 2000) RTF decoding vs percentile
/usr/lib/python2.7/site-packages/matplotlib/tight_bbox.py:55: UserWarning: bbox_inches option for ps backend is not implemented yet. "implemented yet." % (format))
# Fixed decoder setup for this figure.
beam, lat_beam, max_active = beam_range[4], lat_beam_range[4], max_active_range[0]

# backwardlen scaled by 1000 (plotted on the 'Latency [ms]' axis) for every
# utterance decoded with that setup, in ascending order.
bt = [
    1000 * rec.backwardlen
    for rec in user_time_r
    if rec.beam == beam
    and rec.lattice_beam == lat_beam
    and rec.max_active == max_active
]
bt.sort()
sample_count = len(bt)

# Rank of each sample as a fraction in (0, 1], then as a percentage.
fractions = [float(rank) / sample_count for rank in range(1, sample_count + 1)]
percents = [100 * frac for frac in fractions]

# Lowest 95 % of the samples; not used by the plot drawn below.
bt_95 = bt[:int(0.95 * sample_count)]
percents_95 = fractions[:int(0.95 * len(fractions))]

ylim([0, 800])
p1, = plot(percents, bt, '.-.')
print('(b: %0.1f, lb: %0.1f, ma: %d) Latency vs percentile' % (beam, lat_beam, max_active))
title('b')
ylabel('Latency [ms]')
xlabel('percentile')

# Reference lines: desired latency and the 95th percentile.
desired_latency = [200] * len(percents)
p2, = plot(percents, desired_latency, 'r--')
p3 = axvline(95, color='g')
legend(
    [p1, p2, p3],
    ['Latency', 'Desired latency 200 ms', '95th percentile'],
    loc=2,
    prop={'size': 'small'})

# Only the PostScript file is written; PDF export and grey-scale conversion
# are disabled.
pdfname = 'lat_vs_prc.pdf'
# savefig(pdfname, bbox_inches='tight')
# pdf_color2grey(pdfname, 'grey_' + pdfname)
savefig(pdfname + '.ps', bbox_inches='tight')
(b: 12.0, lb: 5.0, ma: 2000) Latency vs percentile
# Figure: forward beam vs forward RTF (left axis) and WER (right axis).
# Lattice beam and max-active are held fixed; the forward beam is swept.
lat_beam, max_active = lat_beam_range[4], max_active_range[0]
# Per-beam series: 95th-percentile RTF, average RTF, maximum RTF and WER.
p95f, avgf, maxf, werl = [], [], [], []
for beam in beam_range:
# Sorted per-utterance forward RTF (forwardlen / wav_len) for this setup.
forwardrtf = sorted([ rec.forwardlen / float(rec.wav_len) for rec in user_time_r if rec.beam == beam and rec.lattice_beam == lat_beam
and rec.max_active == max_active])
# WER from the system-level records matching this setup.
wers = [ cwer(rec.ins, rec.dl, rec.sub, rec.refer_words) for rec in sys_time_r if rec.beam == beam and rec.lattice_beam == lat_beam
and rec.max_active == max_active]
# Exactly one system record is expected per parameter combination.
assert len(wers) == 1, 'wer is computed for each setting on all utterances at once-> 1 value'
# Element at 95 % of the way through the sorted values.
p95f.append(forwardrtf[int(0.95 * len(forwardrtf))])
werl.append(wers[0])
avgf.append(sum(forwardrtf) / float(len(forwardrtf)))
# maxf is collected but only referenced by the commented-out plot line below.
maxf.append(max(forwardrtf))
fig, ax1 = subplots()
p1, = ax1.plot(beam_range, p95f, '-.g*')
p2, = ax1.plot(beam_range, avgf, '-.bs')
# ax1.plot(beam_range, maxf, 'r.-.')
# ax1.plot(beam_range, [1.0] * len(beam_range), 'r--')
# Horizontal reference line at the desired RTF of 0.6.
p3, = ax1.plot(beam_range, [0.6] * len(beam_range), 'b--')
ax1.set_xlabel('beam')
ax1.set_ylabel('RTF')
# NOTE(review): ylim() acts on pylab's current axes -- presumably ax1 here
# and ax2 after twinx() below; confirm before reordering these statements.
ylim([0.0, 1.05])
for tl in ax1.get_yticklabels():
tl.set_color('b')
# Second y axis sharing the same x axis, used for WER.
ax2 = ax1.twinx()
p4, = ax2.plot(beam_range, werl, 'k')
ax2.set_ylabel('WER')
ylim([19.0, 25.0])
for tl in ax2.get_yticklabels():
tl.set_color('k')
legend([p1, p2, p3, p4], ['95th RTF percentile', 'Average RTF', 'Desired 0.6 RTF', 'WER'], loc=1, prop={'size':'small'})
print('(lb: %0.1f, ma: %d )Forward beam vs forward RTF and WER' % (lat_beam, max_active))
title('a')
# Only the PostScript file is written; PDF export and grey-scale conversion
# are disabled.
pdfname='beam_vs_rtfwer.pdf'
# savefig(pdfname, bbox_inches='tight')
# pdf_color2grey(pdfname, 'grey_' + pdfname)
savefig(pdfname+'.ps', bbox_inches='tight')
(lb: 5.0, ma: 2000 )Forward beam vs forward RTF and WER
# print() with a single argument behaves the same on Python 2 and 3 and matches
# the call style used by the rest of the file.
print(beam_range)

# Figure: lattice beam vs latency (left axis) and WER (right axis).
# Forward beam and max-active are held fixed; the lattice beam is swept.
beam, max_active = beam_range[4], max_active_range[0]
# Per-lattice-beam series: 95th-percentile, average and maximum latency, WER.
p95b, avgb, maxb, werl = [], [], [], []
for lat_beam in lat_beam_range:
    # Sorted per-utterance backwardlen, scaled by 1000 for the
    # 'Latency [ms]' axis.
    back_time = sorted([1000 * rec.backwardlen for rec in user_time_r if rec.beam == beam
                        and rec.lattice_beam == lat_beam and rec.max_active == max_active])
    # WER from the system-level records matching this setup.
    wers = [cwer(rec.ins, rec.dl, rec.sub, rec.refer_words) for rec in sys_time_r if rec.beam == beam
            and rec.lattice_beam == lat_beam and rec.max_active == max_active]
    # Exactly one system record is expected per parameter combination.
    assert len(wers) == 1, 'wer is computed for each setting on all utterances at once-> 1 value'
    werl.append(wers[0])
    # Element at 95 % of the way through the sorted values.
    p95b.append(back_time[int(0.95 * len(back_time))])
    # avgb and maxb are collected but only referenced by the commented-out
    # plot lines below.
    avgb.append(sum(back_time) / float(len(back_time)))
    maxb.append(max(back_time))

fig, ax1 = subplots()
p1, = ax1.plot(lat_beam_range, p95b, '-.gs')
#ax1.plot(lat_beam_range, avgb, 'b.-.')
#ax1.plot(lat_beam_range, maxb, 'r.-.')
# Horizontal reference line at the desired latency of 200 ms.
p2, = ax1.plot(lat_beam_range, [200] * len(lat_beam_range), 'r--')
ylim([0.0, 1000.0])
ax1.set_xlabel('lattice-beam')
ax1.set_ylabel('Latency [ms]')
for tl in ax1.get_yticklabels():
    tl.set_color('b')

# Second y axis sharing the same x axis, used for WER.
ax2 = ax1.twinx()
p3, = ax2.plot(lat_beam_range, werl, 'k')
ax2.set_ylabel('WER')
# A single ylim call: the earlier ylim([20.0, 25.0]) was overridden by this
# one before anything was drawn, so it had no effect.
ylim([19.0, 25.0])
for tl in ax2.get_yticklabels():
    tl.set_color('k')
legend([p1, p2, p3], ['95th latency percentile', 'Desired latency 200 ms', 'WER'], loc=2, prop={'size':'small'})
print('(b: %0.1f, ma: %d) Lattice beam vs latency and WER' % (beam, max_active))
title('b')

# Only the PostScript file is written; PDF export and grey-scale conversion
# are disabled.
pdfname='latbeam_vs_latwer.pdf'
# savefig(pdfname, bbox_inches='tight')
# pdf_color2grey(pdfname, 'grey_' + pdfname)
savefig(pdfname+'.ps', bbox_inches='tight')
[8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 16.0] (b: 12.0, ma: 2000) Lattice beam vs latency and WER
# Figure: latency vs wave duration, Kaldi online decoder vs Google service.
# wl_dup / bt / ft are parallel lists: wave length, Kaldi backwardlen and
# Google forwardlen (both scaled by 1000 for the 'Latency [ms]' axis).
wl_dup, bt, ft, beam, lat_beam, max_active = [], [], [], beam_range[4], lat_beam_range[4], max_active_range[0]
for wl in wav_len_values:
    # Kaldi records of this length for the fixed decoder setup.
    back_time = [1000 * rec.backwardlen for rec in user_time_r if
                 rec.beam == beam and rec.wav_len == wl and
                 rec.lattice_beam == lat_beam and rec.max_active == max_active]
    # Google records of this length (they carry no decoder parameters).
    forward_time_google = [1000 * rec.forwardlen for rec in user_time_google if rec.wav_len == wl]
    # zip() stops at the shorter list, so at most one Google value is used
    # per matching Kaldi record.
    for btv, ftv in zip(back_time, forward_time_google):
        wl_dup.append(wl)
        ft.append(ftv)
        bt.append(btv)

print('(b: %0.1f, lb: %0.1f, ma: %d) OnlineLatgenRecogniser Latency vs wave length.' % (beam, lat_beam, max_active))
print('Google ASR cloud service for Czech; no parameters. Latency ~ Batch decoding + network lattency.')
title('Batch vs Online decoding (Kaldi vs Google service)')
xlabel('Wave duration [s]')
ylabel('Latency [ms]')
p1, = plot(wl_dup, bt, '.')
p2, = plot(wl_dup, ft, '.r')
# Exactly one label per handle: the former third label 'WER' had no matching
# artist (no WER curve is drawn in this figure) and was dropped.
legend([p1, p2], ['OnlineLatgenRecogniser latency ~ extracting lattice', 'Google ASR latency ~ batch decoding'], loc=2, prop={'size':'small'})

# Only the PostScript file is written; PDF export and grey-scale conversion
# are disabled.
pdfname = 'lat_cloud_kaldi.pdf'
# savefig(pdfname, bbox_inches='tight')
# pdf_color2grey(pdfname, 'grey_' + pdfname)
savefig(pdfname+'.ps', bbox_inches='tight')
(b: 12.0, lb: 5.0, ma: 2000) OnlineLatgenRecogniser Latency vs wave length. Google ASR cloud service for Czech; no parameters. Latency ~ Batch decoding + network lattency.
# lat_beam, max_active = lat_beam_range[4], max_active_range[0]
# minb, avgb, maxb = [], [], []
# for beam in beam_range:
# backwardlen = [ rec.backwardlen for rec in user_time_r if rec.beam == beam and rec.lattice_beam == lat_beam
# and rec.max_active == max_active]
# minb.append(min(backwardlen))
# avgb.append(sum(backwardlen) / float(len(backwardlen)))
# maxb.append(max(backwardlen))
# plot(beam_range, minb, 'g')
# plot(beam_range, avgb, 'b')
# # pl.plot(beam_range, maxb, 'r')
# title('(lb: %0.1f) Forwad beam vs backward decoding time (Latency)' % lat_beam)
# ylabel('Latency')
# xlabel('beam')
# beam, max_active = beam_range[2], max_active_range[0]
# minf, avgf, maxf = [], [], []
# for lat_beam in lat_beam_range:
# forwardrtf = [ rec.forwardlen/ float(rec.wav_len) for rec in user_time_r if rec.beam == beam and rec.lattice_beam == lat_beam
# and rec.max_active == max_active]
# minf.append(min(forwardrtf))
# avgf.append(sum(forwardrtf) / float(len(forwardrtf)))
# maxf.append(max(forwardrtf))
# plot(lat_beam_range, minf, 'g')
# plot(lat_beam_range, avgf, 'b')
# plot(lat_beam_range, maxf, 'r')
# title('(b: %0.1f) Lattice beam vs forward RTF (Overloaded server)' % beam)
# ylabel('Forward rtf')
# xlabel('Lattice beam')
#werv =[ (cwer(rec.ins, rec.dl, rec.sub, rec.refer_words), rec.beam, rec.lattice_beam, rec.max_active) for rec in sys_time_r]
#print sorted([(l,b,m,w) for (w,b,l,m) in werv])