rcParams['figure.figsize'] = (16, 4) #wide graphs by default
import essentia.standard
loader = essentia.standard.MonoLoader(filename = 'sources/180451__iluppai__alto-saxophone-solo.wav')
sax = loader()
sax_sr = 44100
plot(sax);
import essentia.standard
loader = essentia.standard.MonoLoader(filename = 'sources/109193__juskiddink__leq-acappella.wav')
voice = loader()
voice_sr = 44100
import essentia.standard
loader = essentia.standard.MonoLoader(filename = 'sources/32804__johnwally__solo-man.wav')
guitar = loader()
guitar_sr = 44100
Good overview of methods in:
Camacho, A., & Harris, J. G. (2007, September). A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America. University of FloridA.
Rabiner, L., Cheng, M., Rosenberg, a., & McGonegal, C. (1976). A comparative performance study of several pitch detection algorithms. IEEE Transactions on Acoustics, Speech, and Signal Processing, 24(5), 399–418. doi:10.1109/TASSP.1976.1162846
def windowed_zcr(sig_in, winsize, hop, sr = 1.0):
l = len(sig_in)
win_start = arange(0, l - winsize, hop)
zcr = zeros((len(win_start)))
for i, start in enumerate(win_start):
sl = sig_in[start: start + winsize].astype(float)
zcr[i] = (sr/float(winsize)) * sum(sl[:-1]*sl[1:] < 0)
times = win_start + winsize/2
return times/float(sr), zcr
plot(linspace(0, 1, 44100), sax[:44100])
ylim((-1,1))
twinx()
plot(*windowed_zcr(sax[:44100], 1024, 128, sax_sr), color='k', lw=2);
ylabel('frequency (Hz)')
grid()
plot(linspace(0, 1, 44100), voice[:44100])
ylim((-1,1))
twinx()
plot(*windowed_zcr(voice[:44100], 1024, 128, voice_sr), color='k', lw=2);
ylabel('frequency (Hz)')
grid()
plot(linspace(0, 1, 44100), guitar[:44100])
ylim((-1,1))
twinx()
plot(*windowed_zcr(guitar[:44100], 1024, 128, guitar_sr), color='k', lw=2);
ylabel('frequency (Hz)')
grid()
plot(linspace(0, 1, 44100), guitar[:44100])
ylim((-0.2,0.2))
hlines(0, 0.55, 0.6)
twinx()
plot(*windowed_zcr(guitar[:44100], 1024, 128, guitar_sr), color='k', marker='o',lw=2);
ylabel('frequency (Hz)')
grid()
xlim((0.55, 0.6))
(0.55, 0.6)
plot(linspace(0, 1, 44100), sax[:44100])
ylim((-0.02,0.02))
hlines(0, 0.8, 0.85)
twinx()
plot(*windowed_zcr(sax[:44100], 1024, 128, sax_sr), color='k', marker='o',lw=2);
ylabel('frequency (Hz)')
grid()
xlim((0.0, 0.05))
(0.0, 0.05)
Very low level signals are not distinguishable from high level ones (This is generally a problem of most pitch detection methods!)
def windowed_rms(input_sig, win_size, hop=None, sr=1.0):
if not hop:
hop = winsize/2
rms = []
window_start = arange(0, len(input_sig) - win_size, hop)
for start in window_start:
w = input_sig[start: start+win_size].astype(float)
rms_inst = sqrt(mean(w**2))
rms.append(rms_inst)
times = (window_start + win_size/2)/float(sr)
return times, rms
rms = windowed_rms(sax[:44100], 1024, 128, sax_sr)
zcr = windowed_zcr(sax[:44100], 1024, 128, sax_sr)
plot(*rms)
[<matplotlib.lines.Line2D at 0x7804910>]
rms_th = where(array(rms[1]) > 0.02, 1.0, 0.0)
plot(zcr[0], zcr[1]*rms_th)
[<matplotlib.lines.Line2D at 0x6897dd0>]
Now must approximate to "valid" pitches.
def midi2Hz(midinote, tuning=440.0):
return tuning * (2**((midinote - 69)/12.0))
midi2Hz(69), midi2Hz(60), midi2Hz(24)
(440.0, 261.6255653005986, 32.70319566257483)
num_freqs = 8*12 # eight octaves from C0
quant_freqs = [midi2Hz(i + 24) for i in range(num_freqs)]
array(quant_freqs)
array([ 32.70319566, 34.64782887, 36.70809599, 38.89087297, 41.20344461, 43.65352893, 46.24930284, 48.9994295 , 51.9130872 , 55. , 58.27047019, 61.73541266, 65.40639133, 69.29565774, 73.41619198, 77.78174593, 82.40688923, 87.30705786, 92.49860568, 97.998859 , 103.82617439, 110. , 116.54094038, 123.47082531, 130.81278265, 138.59131549, 146.83238396, 155.56349186, 164.81377846, 174.61411572, 184.99721136, 195.99771799, 207.65234879, 220. , 233.08188076, 246.94165063, 261.6255653 , 277.18263098, 293.66476792, 311.12698372, 329.62755691, 349.22823143, 369.99442271, 391.99543598, 415.30469758, 440. , 466.16376152, 493.88330126, 523.2511306 , 554.36526195, 587.32953583, 622.25396744, 659.25511383, 698.45646287, 739.98884542, 783.99087196, 830.60939516, 880. , 932.32752304, 987.76660251, 1046.5022612 , 1108.73052391, 1174.65907167, 1244.50793489, 1318.51022765, 1396.91292573, 1479.97769085, 1567.98174393, 1661.21879032, 1760. , 1864.65504607, 1975.53320502, 2093.0045224 , 2217.46104781, 2349.31814334, 2489.01586978, 2637.0204553 , 2793.82585146, 2959.95538169, 3135.96348785, 3322.43758064, 3520. , 3729.31009214, 3951.06641005, 4186.00904481, 4434.92209563, 4698.63628668, 4978.03173955, 5274.04091061, 5587.65170293, 5919.91076339, 6271.92697571, 6644.87516128, 7040. , 7458.62018429, 7902.1328201 ])
def quantize_freq(freq_list, quant_freqs, quant_offset=24):
quantized = zeros_like(freq_list)
for i in range(len(freq_list)):
arg = argwhere(quant_freqs > freq_list[i])
if arg.size == 0 or arg[0] == 0:
quantized[i] = 0
elif quant_freqs[arg[0]] - freq_list[i] > freq_list[i] - quant_freqs[arg[0] - 1]:
quantized[i] = arg[0] - 1
else:
quantized[i] = arg[0]
return quantized + quant_offset
rms_th = where(array(rms[1]) > 0.02, 1.0, 0.0)
quantized = quantize_freq(zcr[1], quant_freqs)
plot(zcr[0], quantized*rms_th)
[<matplotlib.lines.Line2D at 0x7815a90>]
plot(zcr[0], quantized*rms_th)
#ylim((50, 75))
grid()
To improve presicion, zero-crossings must also try to compensate for multiple crossings within the same period.
Maximum lags determines the lowest frequency we can detect
44100.0/512
86.1328125
def windowed_acorr(input_sig, win_size, hop=None, sr=1.0, maxlags=None):
if not hop:
hop = win_size/2
if not maxlags:
maxlags = win_size/4
window_start = arange(0, len(input_sig) - win_size, hop)
acorrfs = []
for start in window_start:
w = input_sig[start: start+win_size]
lags, acorr_inst, lines, line = acorr(w, maxlags=maxlags)
acorrfs.append(acorr_inst)
times = (window_start + win_size/2)/float(sr)
clf()
return times, lags, acorrfs
times, lags, acorrfs = windowed_acorr(sax[:44100], 2048, 512, sax_sr)
imshow(array(acorrfs).T, aspect='auto')
yticks(linspace(0, 1024, 5), linspace(-512, 512, 5).astype(int));
plot(sax[:44100])
[<matplotlib.lines.Line2D at 0x6c56650>]
array(acorrfs).shape
(83, 1025)
Need to not count central peak (e.g. by determining highest frequency)
fmax = 4000.0
L = 44100.0/fmax
L
11.025
apeaks = argmax(array(acorrfs)[:,:512 - round(L)], axis=1)
imshow(array(acorrfs).T, aspect='auto')
plot(apeaks, color='w', lw=3)
xlim((0, 83))
ylim((0, 520))
yticks(linspace(512, 0, 5), linspace(0, -512, 5).astype(int));
Now convert sample delays into frequency:
plot(lags, acorrfs[30])
grid()
plot(44100.0/(512 - apeaks))
[<matplotlib.lines.Line2D at 0x68b62d0>]
times, sax_rms = windowed_rms(sax[:44100], 2048, 512, sax_sr)
plot(times, sax_rms)
[<matplotlib.lines.Line2D at 0xa5ee390>]
sax_rms_th = where(array(sax_rms) > 0.02, 1.0, 0.0)
sax_freqs = sax_rms_th * 44100.0/(512 - apeaks)
plot(sax_freqs)
[<matplotlib.lines.Line2D at 0x11c10890>]
quantized = quantize_freq(sax_freqs, quant_freqs)
plot(sax_freqs)
plot(440 * 2**((quantized - 69)/12.0))
[<matplotlib.lines.Line2D at 0x11c18690>]
plot(sax_freqs)
plot(440 * 2**((quantized - 69)/12.0))
ylim((150, 200))
legend(['measured', 'quantized'], loc='best')
ylabel('Freq (Hz)')
grid()
quantized[30], quantized[60]
(53.0, 55.0)
53 = E$\flat$, 55 = F
plot(quantized, 'g')
grid()
ylim((50, 60))
(50, 60)
win_size = 2048
hop = 512
times, lags, acorrfs = windowed_acorr(guitar[:44100], win_size, hop, guitar_sr)
fmax = 3000.0
L = 44100.0/fmax
apeaks = argmax(array(acorrfs)[:,:hop - round(L)], axis=1)
times, rms = windowed_rms(guitar[:44100], win_size, hop, guitar_sr)
rms_th = where(array(rms) > 0.02, 1.0, 0.0)
detected_freqs = rms_th * 44100.0/(hop - apeaks)
quantized = quantize_freq(detected_freqs, quant_freqs)
imshow(array(acorrfs).T, aspect='auto')
plot(apeaks, color='w', lw=3)
xlim((0, 83))
ylim((0, 520))
yticks(linspace(512, 0, 5), linspace(0, -512, 5).astype(int));
plot(detected_freqs)
plot(440 * 2**((quantized - 69)/12.0))
ylim((250, 450))
legend(['measured', 'quantized'], loc='best')
grid()
plot(quantized)
ylim((55, 70))
grid()
win_size = 2048
hop = 512
times, lags, acorrfs = windowed_acorr(voice[:44100], win_size, hop, voice_sr)
fmax = 3000.0
L = 44100.0/fmax
apeaks = argmax(array(acorrfs)[:,:hop - round(L)], axis=1)
times, rms = windowed_rms(voice[:44100], win_size, hop, voice_sr)
rms_th = where(array(rms) > 0.02, 1.0, 0.0)
detected_freqs = rms_th * 44100.0/(hop - apeaks)
quantized = quantize_freq(detected_freqs, quant_freqs)
imshow(array(acorrfs).T, aspect='auto')
plot(apeaks, color='w', lw=3)
xlim((0, 83))
ylim((0, 520))
yticks(linspace(512, 0, 5), linspace(0, -512, 5).astype(int));
plot(detected_freqs)
plot(440 * 2**((quantized - 69)/12.0))
ylim((150, 450))
legend(['measured', 'quantized'], loc='best')
grid()
plot(quantized)
ylim((50, 65))
grid()
Pxx, times, freqs, line = specgram(sax[:44100], NFFT=2048, noverlap=512+1024, Fs=sax_sr)
peaks = argmax(Pxx, axis=0)
peaks.shape
(83,)
plot(peaks)
[<matplotlib.lines.Line2D at 0x10e0dd10>]
Pxx, times, freqs, line = specgram(sax[:44100], NFFT=4096, noverlap=2048, Fs=sax_sr)
peaks = argmax(Pxx, axis=0)
plot(sax_sr* peaks/4096.0)
grid()
peaks = argsort(Pxx, axis=0)
peaks.shape
(2049, 20)
plot(peaks[-4:].T);
hist(peaks[-4:].flat, bins=100);
(array([ 7, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 7, 8, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]), array([ 1. , 2.15, 3.3 , 4.45, 5.6 , 6.75, 7.9 , 9.05, 10.2 , 11.35, 12.5 , 13.65, 14.8 , 15.95, 17.1 , 18.25, 19.4 , 20.55, 21.7 , 22.85, 24. , 25.15, 26.3 , 27.45, 28.6 , 29.75, 30.9 , 32.05, 33.2 , 34.35, 35.5 , 36.65, 37.8 , 38.95, 40.1 , 41.25, 42.4 , 43.55, 44.7 , 45.85, 47. , 48.15, 49.3 , 50.45, 51.6 , 52.75, 53.9 , 55.05, 56.2 , 57.35, 58.5 , 59.65, 60.8 , 61.95, 63.1 , 64.25, 65.4 , 66.55, 67.7 , 68.85, 70. , 71.15, 72.3 , 73.45, 74.6 , 75.75, 76.9 , 78.05, 79.2 , 80.35, 81.5 , 82.65, 83.8 , 84.95, 86.1 , 87.25, 88.4 , 89.55, 90.7 , 91.85, 93. , 94.15, 95.3 , 96.45, 97.6 , 98.75, 99.9 , 101.05, 102.2 , 103.35, 104.5 , 105.65, 106.8 , 107.95, 109.1 , 110.25, 111.4 , 112.55, 113.7 , 114.85, 116. ]), <a list of 100 Patch objects>)
plot(sax_sr* peaks[-4:].T/4096.0);
plot(sax_sr* peaks[-6:].T/4096.0);
plot(sax_sr* peaks[-6:].T/4096.0);
ylim((150, 300))
grid()
midi2Hz(53), midi2Hz(55)
(174.61411571650194, 195.99771799087463)
plot(sax_sr* peaks[-6:].T/4096.0);
ylim((150, 200))
grid()
hlines((midi2Hz(53), midi2Hz(55)), 0, 20)
<matplotlib.collections.LineCollection at 0xc206690>
The frequency spectrum technique would need further refinements, in particular better resolution for the low frequencies, with interpolation or zero-padding, and some heuristics to select peaks.
Like Autocorrelation, but uses subtraction instead of multiplication which can be faster.
$$ AMDF(m) = \sum\limits_{n=0}^{N-1}{ |x(n) - x(n + m)|^k}$$def windowed_amdf(input_sig, win_size, hop=None, sr=1.0, maxlags=None, k = 1):
if not hop:
hop = win_size/2
if not maxlags:
maxlags = win_size/4
window_start = arange(0, len(input_sig) - win_size - maxlags, hop)
amdfs = []
for start in window_start:
amdfsn = []
w = input_sig[start: start+win_size].astype(float)
for lag in range(maxlags):
wm = input_sig[start + lag: start+win_size + lag].astype(float)
amdfsn.append(sum(abs(w - wm)**k))
amdfs.append(amdfsn)
times = (window_start + win_size/2)/float(sr)
return times, amdfs
times, amdfs = windowed_amdf(sax[:44100], win_size=2048, hop=1024, sr=44100, maxlags=1024, k=1)
imshow(array(amdfs).T, aspect='auto')
colorbar()
<matplotlib.colorbar.Colorbar instance at 0xa388c68>
Here we look for minima:
plot(amdfs[20][:])
[<matplotlib.lines.Line2D at 0xa3ae410>]
minima_at = argmin(array(amdfs)[:,70:], axis=1) + 70
plot(minima_at)
[<matplotlib.lines.Line2D at 0xb017210>]
In Hz:
freqs = sax_sr/minima_at
plot(freqs)
[<matplotlib.lines.Line2D at 0xd303850>]
plot(freqs)
ylim((50, 250))
grid()
plot(quantize_freq(freqs, quant_freqs))
ylim((30, 60))
grid()
times, amdfs = windowed_amdf(voice[:44100], win_size=2048, hop=1024, sr=44100, maxlags=1024, k=1)
imshow(array(amdfs).T, aspect='auto')
colorbar()
<matplotlib.colorbar.Colorbar instance at 0xc98d4d0>
minima_at = argmin(array(amdfs)[:,20:], axis=1) + 20
freqs = voice_sr/minima_at
plot(quantize_freq(freqs, quant_freqs))
ylim((50, 65))
grid()
times, amdfs = windowed_amdf(guitar[:44100], win_size=2048, hop=1024, sr=44100, maxlags=1024, k=1)
imshow(array(amdfs).T, aspect='auto')
colorbar()
<matplotlib.colorbar.Colorbar instance at 0xb4f8a28>
minima_at = argmin(array(amdfs)[:,20:], axis=1) + 20
freqs = guitar_sr/minima_at
plot(quantize_freq(freqs, quant_freqs))
ylim((55, 70))
grid()
Pxx, times, freqs, im = specgram(sax[:44100], NFFT=2048, noverlap=1024, Fs=sax_sr, scale_by_freq=False)
imshow(log10(Pxx), aspect='auto')
<matplotlib.image.AxesImage at 0xc96fdd0>
log_spec = log10(Pxx)[:]
cepstrum = real(fft.rfft(log_spec, axis=0))**2
imshow(cepstrum[100:,:], aspect='auto')
<matplotlib.image.AxesImage at 0x117d8cd0>
plot(cepstrum[50:,15])
plot(cepstrum[50:,32])
[<matplotlib.lines.Line2D at 0xbf6d0d0>]
plot(cepstrum[50:,5])
[<matplotlib.lines.Line2D at 0x917a710>]
plot(log10(Pxx[:,15]))
plot(log10(Pxx[:,35]))
xlim((0, 100))
(0, 100)
maxima = argmax(cepstrum[50:], axis=0) + 50
plot(maxima)
[<matplotlib.lines.Line2D at 0x899a510>]
plot(maxima)
ylim((100, 150))
grid()
maxima[15], maxima[30]
(127, 112)
X-axis for Ceptra is in time units!
22050.0/maxima[15], 22050.0/maxima[30]
(173.62204724409449, 196.875)
freqs = 22050.0/maxima
plot(freqs)
grid()
plot(quantize_freq(freqs, quant_freqs))
grid()
Pxx, times, freqs, im = specgram(voice[:44100], NFFT=2048, noverlap=1024, Fs=voice_sr, scale_by_freq=False)
log_spec = log10(Pxx)
cepstrum = real(fft.rfft(log_spec, axis=0))**2
maxima = argmax(cepstrum[50:], axis=0) + 50
freqs = 22050.0/maxima
clf()
plot(freqs)
[<matplotlib.lines.Line2D at 0x10a00e10>]
plot(quantize_freq(freqs, quant_freqs))
[<matplotlib.lines.Line2D at 0x8eaa990>]
Pxx, times, freqs, im = specgram(guitar[:44100], NFFT=2048, noverlap=1024, Fs=guitar_sr, scale_by_freq=False)
log_spec = log10(Pxx)
cepstrum = real(fft.rfft(log_spec, axis=0))**2
maxima = argmax(cepstrum[50:], axis=0) + 50
freqs = 22050.0/maxima
clf()
plot(freqs)
grid()
plot(quantize_freq(freqs, quant_freqs))
grid()
More modern approaches:
De Cheveigné, A., & Kawahara, H. (2002). YIN, a fundamental frequency estimator for speech and music. The Journal of the Acoustical Society of America, 111(4), 1917. doi:10.1121/1.1458024
Camacho, A., & Harris, J. G. (2008). A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America, 124(3), 1638–52. doi:10.1121/1.2951592
By Andrés Cabrera mantaraya36@gmail.com
For course MAT 240E at UCSB
This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/