In [1]:

rcParams['figure.figsize'] = (16, 4) #wide graphs by default

Audio Features-Pitch extraction¶

Pitch¶

In [2]:

import essentia.standard

loader = essentia.standard.MonoLoader(filename = 'sources/180451__iluppai__alto-saxophone-solo.wav')
sax = loader()
sax_sr = 44100

In [3]:

plot(sax);

In [4]:

import essentia.standard

loader = essentia.standard.MonoLoader(filename = 'sources/109193__juskiddink__leq-acappella.wav')
voice = loader()
voice_sr = 44100

In [5]:

import essentia.standard

loader = essentia.standard.MonoLoader(filename = 'sources/32804__johnwally__solo-man.wav')
guitar = loader()
guitar_sr = 44100

Pitch tracking (F0 estimation)¶

Good overview of methods in:

Camacho, A., & Harris, J. G. (2007, September). A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America. University of FloridA.

Rabiner, L., Cheng, M., Rosenberg, a., & McGonegal, C. (1976). A comparative performance study of several pitch detection algorithms. IEEE Transactions on Acoustics, Speech, and Signal Processing, 24(5), 399–418. doi:10.1109/TASSP.1976.1162846

Time domain zero crossing¶

In [6]:

def windowed_zcr(sig_in, winsize, hop, sr = 1.0):
    l = len(sig_in)
    win_start = arange(0, l - winsize, hop)
    zcr = zeros((len(win_start)))
    for i, start in enumerate(win_start):
        sl = sig_in[start: start + winsize].astype(float)
        zcr[i] = (sr/float(winsize)) * sum(sl[:-1]*sl[1:] < 0)

    times = win_start + winsize/2
    return times/float(sr), zcr

In [7]:

plot(linspace(0, 1, 44100), sax[:44100])
ylim((-1,1))
twinx()
plot(*windowed_zcr(sax[:44100], 1024, 128, sax_sr), color='k', lw=2);
ylabel('frequency (Hz)')
grid()

In [8]:

plot(linspace(0, 1, 44100), voice[:44100])
ylim((-1,1))
twinx()
plot(*windowed_zcr(voice[:44100], 1024, 128, voice_sr), color='k', lw=2);
ylabel('frequency (Hz)')
grid()

In [9]:

plot(linspace(0, 1, 44100), guitar[:44100])
ylim((-1,1))
twinx()
plot(*windowed_zcr(guitar[:44100], 1024, 128, guitar_sr), color='k', lw=2);
ylabel('frequency (Hz)')
grid()

In [10]:

plot(linspace(0, 1, 44100), guitar[:44100])
ylim((-0.2,0.2))

hlines(0, 0.55, 0.6)
twinx()
plot(*windowed_zcr(guitar[:44100], 1024, 128, guitar_sr), color='k', marker='o',lw=2);
ylabel('frequency (Hz)')
grid()
xlim((0.55, 0.6))

Out[10]:

(0.55, 0.6)

In [11]:

plot(linspace(0, 1, 44100), sax[:44100])
ylim((-0.02,0.02))

hlines(0, 0.8, 0.85)
twinx()
plot(*windowed_zcr(sax[:44100], 1024, 128, sax_sr), color='k', marker='o',lw=2);
ylabel('frequency (Hz)')
grid()
xlim((0.0, 0.05))

Out[11]:

(0.0, 0.05)

Very low level signals are not distinguishable from high level ones (This is generally a problem of most pitch detection methods!)

In [12]:

def windowed_rms(input_sig, win_size, hop=None, sr=1.0):
    if not hop:
        hop = winsize/2
    rms = []
    window_start = arange(0, len(input_sig) - win_size, hop)
    
    for start in window_start:
        w = input_sig[start: start+win_size].astype(float)
        rms_inst = sqrt(mean(w**2))
        rms.append(rms_inst)
    times = (window_start + win_size/2)/float(sr)
    return times, rms

In [13]:

rms = windowed_rms(sax[:44100], 1024, 128, sax_sr)
zcr = windowed_zcr(sax[:44100], 1024, 128, sax_sr)
plot(*rms)

Out[13]:

[<matplotlib.lines.Line2D at 0x7804910>]

In [14]:

rms_th = where(array(rms[1]) > 0.02, 1.0, 0.0)
plot(zcr[0], zcr[1]*rms_th)

Out[14]:

[<matplotlib.lines.Line2D at 0x6897dd0>]

Now must approximate to "valid" pitches.

In [15]:

def midi2Hz(midinote, tuning=440.0):
    return tuning * (2**((midinote - 69)/12.0))

midi2Hz(69), midi2Hz(60), midi2Hz(24)

Out[15]:

(440.0, 261.6255653005986, 32.70319566257483)

In [16]:

num_freqs = 8*12 # eight octaves from C0

quant_freqs = [midi2Hz(i + 24) for i in range(num_freqs)]
array(quant_freqs)

Out[16]:

array([   32.70319566,    34.64782887,    36.70809599,    38.89087297,
          41.20344461,    43.65352893,    46.24930284,    48.9994295 ,
          51.9130872 ,    55.        ,    58.27047019,    61.73541266,
          65.40639133,    69.29565774,    73.41619198,    77.78174593,
          82.40688923,    87.30705786,    92.49860568,    97.998859  ,
         103.82617439,   110.        ,   116.54094038,   123.47082531,
         130.81278265,   138.59131549,   146.83238396,   155.56349186,
         164.81377846,   174.61411572,   184.99721136,   195.99771799,
         207.65234879,   220.        ,   233.08188076,   246.94165063,
         261.6255653 ,   277.18263098,   293.66476792,   311.12698372,
         329.62755691,   349.22823143,   369.99442271,   391.99543598,
         415.30469758,   440.        ,   466.16376152,   493.88330126,
         523.2511306 ,   554.36526195,   587.32953583,   622.25396744,
         659.25511383,   698.45646287,   739.98884542,   783.99087196,
         830.60939516,   880.        ,   932.32752304,   987.76660251,
        1046.5022612 ,  1108.73052391,  1174.65907167,  1244.50793489,
        1318.51022765,  1396.91292573,  1479.97769085,  1567.98174393,
        1661.21879032,  1760.        ,  1864.65504607,  1975.53320502,
        2093.0045224 ,  2217.46104781,  2349.31814334,  2489.01586978,
        2637.0204553 ,  2793.82585146,  2959.95538169,  3135.96348785,
        3322.43758064,  3520.        ,  3729.31009214,  3951.06641005,
        4186.00904481,  4434.92209563,  4698.63628668,  4978.03173955,
        5274.04091061,  5587.65170293,  5919.91076339,  6271.92697571,
        6644.87516128,  7040.        ,  7458.62018429,  7902.1328201 ])

In [17]:

def quantize_freq(freq_list, quant_freqs, quant_offset=24):
    quantized = zeros_like(freq_list)
    for i in range(len(freq_list)):
        arg = argwhere(quant_freqs > freq_list[i])
        if arg.size == 0 or arg[0] == 0:
            quantized[i] = 0
        elif quant_freqs[arg[0]] - freq_list[i] > freq_list[i] - quant_freqs[arg[0] - 1]:
            quantized[i] = arg[0] - 1
        else:
            quantized[i] = arg[0]
    return quantized + quant_offset

In [18]:

rms_th = where(array(rms[1]) > 0.02, 1.0, 0.0)
quantized = quantize_freq(zcr[1], quant_freqs)
plot(zcr[0], quantized*rms_th)

Out[18]:

[<matplotlib.lines.Line2D at 0x7815a90>]

In [19]:

plot(zcr[0], quantized*rms_th)
#ylim((50, 75))
grid()

To improve presicion, zero-crossings must also try to compensate for multiple crossings within the same period.

Auto-correlation¶

Maximum lags determines the lowest frequency we can detect

In [20]:

44100.0/512

Out[20]:

86.1328125

In [21]:

def windowed_acorr(input_sig, win_size, hop=None, sr=1.0, maxlags=None):
    if not hop:
        hop = win_size/2
    if not maxlags:
        maxlags = win_size/4
    window_start = arange(0, len(input_sig) - win_size, hop)
    acorrfs = []
    for start in window_start:
        w = input_sig[start: start+win_size]
        lags, acorr_inst, lines, line = acorr(w, maxlags=maxlags)
        acorrfs.append(acorr_inst)
        
    times = (window_start + win_size/2)/float(sr)
    clf()
    return times, lags, acorrfs

In [23]:

times, lags, acorrfs = windowed_acorr(sax[:44100], 2048, 512, sax_sr)

In [24]:

imshow(array(acorrfs).T, aspect='auto')
yticks(linspace(0, 1024, 5), linspace(-512, 512, 5).astype(int));

In [25]:

plot(sax[:44100])

Out[25]:

[<matplotlib.lines.Line2D at 0x6c56650>]

In [26]:

array(acorrfs).shape

Out[26]:

(83, 1025)

Need to not count central peak (e.g. by determining highest frequency)

In [27]:

fmax = 4000.0
L = 44100.0/fmax
L

Out[27]:

11.025

In [28]:

apeaks = argmax(array(acorrfs)[:,:512 - round(L)], axis=1)

In [29]:

imshow(array(acorrfs).T, aspect='auto')
plot(apeaks, color='w', lw=3)
xlim((0, 83))
ylim((0, 520))

yticks(linspace(512, 0, 5), linspace(0, -512, 5).astype(int));

Now convert sample delays into frequency:

In [30]:

plot(lags, acorrfs[30])
grid()

In [31]:

plot(44100.0/(512 - apeaks))

Out[31]:

[<matplotlib.lines.Line2D at 0x68b62d0>]

In [33]:

times, sax_rms = windowed_rms(sax[:44100], 2048, 512, sax_sr)
plot(times, sax_rms)

Out[33]:

[<matplotlib.lines.Line2D at 0xa5ee390>]

In [34]:

sax_rms_th = where(array(sax_rms) > 0.02, 1.0, 0.0)

In [35]:

sax_freqs = sax_rms_th * 44100.0/(512 - apeaks)
plot(sax_freqs)

Out[35]:

[<matplotlib.lines.Line2D at 0x11c10890>]

In [36]:

quantized = quantize_freq(sax_freqs, quant_freqs)
plot(sax_freqs)
plot(440 * 2**((quantized - 69)/12.0))

Out[36]:

[<matplotlib.lines.Line2D at 0x11c18690>]

In [37]:

plot(sax_freqs)
plot(440 * 2**((quantized - 69)/12.0))
ylim((150, 200))
legend(['measured', 'quantized'], loc='best')
ylabel('Freq (Hz)')
grid()

In [38]:

quantized[30], quantized[60]

Out[38]:

(53.0, 55.0)

53 = E$\flat$, 55 = F

In [39]:

plot(quantized, 'g')
grid()
ylim((50, 60))

Out[39]:

(50, 60)

In [40]:

win_size = 2048
hop = 512
times, lags, acorrfs = windowed_acorr(guitar[:44100], win_size, hop, guitar_sr)

fmax = 3000.0
L = 44100.0/fmax
apeaks = argmax(array(acorrfs)[:,:hop - round(L)], axis=1)

times, rms = windowed_rms(guitar[:44100], win_size, hop, guitar_sr)

rms_th = where(array(rms) > 0.02, 1.0, 0.0)
detected_freqs = rms_th * 44100.0/(hop - apeaks)
quantized = quantize_freq(detected_freqs, quant_freqs)

In [41]:

imshow(array(acorrfs).T, aspect='auto')
plot(apeaks, color='w', lw=3)
xlim((0, 83))
ylim((0, 520))

yticks(linspace(512, 0, 5), linspace(0, -512, 5).astype(int));

In [42]:

plot(detected_freqs)
plot(440 * 2**((quantized - 69)/12.0))
ylim((250, 450))
legend(['measured', 'quantized'], loc='best')
grid()

In [43]:

plot(quantized)
ylim((55, 70))
grid()

In [44]:

win_size = 2048
hop = 512
times, lags, acorrfs = windowed_acorr(voice[:44100], win_size, hop, voice_sr)

fmax = 3000.0
L = 44100.0/fmax
apeaks = argmax(array(acorrfs)[:,:hop - round(L)], axis=1)

times, rms = windowed_rms(voice[:44100], win_size, hop, voice_sr)

rms_th = where(array(rms) > 0.02, 1.0, 0.0)
detected_freqs = rms_th * 44100.0/(hop - apeaks)
quantized = quantize_freq(detected_freqs, quant_freqs)

In [45]:

imshow(array(acorrfs).T, aspect='auto')
plot(apeaks, color='w', lw=3)
xlim((0, 83))
ylim((0, 520))

yticks(linspace(512, 0, 5), linspace(0, -512, 5).astype(int));

In [46]:

plot(detected_freqs)
plot(440 * 2**((quantized - 69)/12.0))
ylim((150, 450))
legend(['measured', 'quantized'], loc='best')
grid()

In [47]:

plot(quantized)
ylim((50, 65))
grid()

Frequency domain¶

In [48]:

Pxx, times, freqs, line = specgram(sax[:44100], NFFT=2048, noverlap=512+1024, Fs=sax_sr)

In [49]:

peaks = argmax(Pxx, axis=0)
peaks.shape

Out[49]:

(83,)

In [50]:

plot(peaks)

Out[50]:

[<matplotlib.lines.Line2D at 0x10e0dd10>]

In [51]:

Pxx, times, freqs, line = specgram(sax[:44100], NFFT=4096, noverlap=2048, Fs=sax_sr)

In [52]:

peaks = argmax(Pxx, axis=0)
plot(sax_sr* peaks/4096.0)
grid()

In [53]:

peaks = argsort(Pxx, axis=0)
peaks.shape

Out[53]:

(2049, 20)

In [54]:

plot(peaks[-4:].T);

In [57]:

hist(peaks[-4:].flat, bins=100);

Out[57]:

(array([ 7,  4,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  7,  6,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  7,  8,  0,  0,  0,  0, 11,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  4,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2]),
 array([   1.  ,    2.15,    3.3 ,    4.45,    5.6 ,    6.75,    7.9 ,
          9.05,   10.2 ,   11.35,   12.5 ,   13.65,   14.8 ,   15.95,
         17.1 ,   18.25,   19.4 ,   20.55,   21.7 ,   22.85,   24.  ,
         25.15,   26.3 ,   27.45,   28.6 ,   29.75,   30.9 ,   32.05,
         33.2 ,   34.35,   35.5 ,   36.65,   37.8 ,   38.95,   40.1 ,
         41.25,   42.4 ,   43.55,   44.7 ,   45.85,   47.  ,   48.15,
         49.3 ,   50.45,   51.6 ,   52.75,   53.9 ,   55.05,   56.2 ,
         57.35,   58.5 ,   59.65,   60.8 ,   61.95,   63.1 ,   64.25,
         65.4 ,   66.55,   67.7 ,   68.85,   70.  ,   71.15,   72.3 ,
         73.45,   74.6 ,   75.75,   76.9 ,   78.05,   79.2 ,   80.35,
         81.5 ,   82.65,   83.8 ,   84.95,   86.1 ,   87.25,   88.4 ,
         89.55,   90.7 ,   91.85,   93.  ,   94.15,   95.3 ,   96.45,
         97.6 ,   98.75,   99.9 ,  101.05,  102.2 ,  103.35,  104.5 ,
        105.65,  106.8 ,  107.95,  109.1 ,  110.25,  111.4 ,  112.55,
        113.7 ,  114.85,  116.  ]),
 <a list of 100 Patch objects>)

In [58]:

plot(sax_sr* peaks[-4:].T/4096.0);

In [59]:

plot(sax_sr* peaks[-6:].T/4096.0);

In [60]:

plot(sax_sr* peaks[-6:].T/4096.0);
ylim((150, 300))
grid()

In [62]:

midi2Hz(53), midi2Hz(55)

Out[62]:

(174.61411571650194, 195.99771799087463)

In [63]:

plot(sax_sr* peaks[-6:].T/4096.0);
ylim((150, 200))
grid()
hlines((midi2Hz(53), midi2Hz(55)), 0, 20)

Out[63]:

<matplotlib.collections.LineCollection at 0xc206690>

The frequency spectrum technique would need further refinements, in particular better resolution for the low frequencies, with interpolation or zero-padding, and some heuristics to select peaks.

Average Magnitude Difference Function (AMDF)¶

Like Autocorrelation, but uses subtraction instead of multiplication which can be faster.

$$ AMDF(m) = \sum\limits_{n=0}^{N-1}{ |x(n) - x(n + m)|^k}$$

In [64]:

def windowed_amdf(input_sig, win_size, hop=None, sr=1.0, maxlags=None, k = 1):
    if not hop:
        hop = win_size/2
    if not maxlags:
        maxlags = win_size/4
    window_start = arange(0, len(input_sig) - win_size - maxlags, hop)
    amdfs = []
    for start in window_start:
        amdfsn = []
        w = input_sig[start: start+win_size].astype(float)
        for lag in range(maxlags):
            wm = input_sig[start + lag: start+win_size + lag].astype(float)
            amdfsn.append(sum(abs(w - wm)**k))
        amdfs.append(amdfsn)
        
    times = (window_start + win_size/2)/float(sr)
    return times, amdfs

In [65]:

times, amdfs = windowed_amdf(sax[:44100], win_size=2048, hop=1024, sr=44100, maxlags=1024, k=1)
imshow(array(amdfs).T, aspect='auto')
colorbar()

Out[65]:

<matplotlib.colorbar.Colorbar instance at 0xa388c68>

Here we look for minima:

In [66]:

plot(amdfs[20][:])

Out[66]:

[<matplotlib.lines.Line2D at 0xa3ae410>]

In [71]:

minima_at = argmin(array(amdfs)[:,70:], axis=1) + 70
plot(minima_at)

Out[71]:

[<matplotlib.lines.Line2D at 0xb017210>]

In Hz:

In [72]:

freqs = sax_sr/minima_at

In [73]:

plot(freqs)

Out[73]:

[<matplotlib.lines.Line2D at 0xd303850>]

In [74]:

plot(freqs)
ylim((50, 250))
grid()

In [75]:

plot(quantize_freq(freqs, quant_freqs))
ylim((30, 60))
grid()

In [76]:

times, amdfs = windowed_amdf(voice[:44100], win_size=2048, hop=1024, sr=44100, maxlags=1024, k=1)
imshow(array(amdfs).T, aspect='auto')
colorbar()

Out[76]:

<matplotlib.colorbar.Colorbar instance at 0xc98d4d0>

In [78]:

minima_at = argmin(array(amdfs)[:,20:], axis=1) + 20
freqs = voice_sr/minima_at
plot(quantize_freq(freqs, quant_freqs))
ylim((50, 65))
grid()

In [79]:

times, amdfs = windowed_amdf(guitar[:44100], win_size=2048, hop=1024, sr=44100, maxlags=1024, k=1)
imshow(array(amdfs).T, aspect='auto')
colorbar()

Out[79]:

<matplotlib.colorbar.Colorbar instance at 0xb4f8a28>

In [80]:

minima_at = argmin(array(amdfs)[:,20:], axis=1) + 20
freqs = guitar_sr/minima_at
plot(quantize_freq(freqs, quant_freqs))
ylim((55, 70))
grid()

Cepstral pitch detection¶

In [81]:

Pxx, times, freqs, im = specgram(sax[:44100], NFFT=2048, noverlap=1024, Fs=sax_sr, scale_by_freq=False)

In [82]:

imshow(log10(Pxx), aspect='auto')

Out[82]:

<matplotlib.image.AxesImage at 0xc96fdd0>

In [83]:

log_spec = log10(Pxx)[:]
cepstrum = real(fft.rfft(log_spec, axis=0))**2
imshow(cepstrum[100:,:], aspect='auto')

Out[83]:

<matplotlib.image.AxesImage at 0x117d8cd0>

In [85]:

plot(cepstrum[50:,15])
plot(cepstrum[50:,32])

Out[85]:

[<matplotlib.lines.Line2D at 0xbf6d0d0>]

In [98]:

plot(cepstrum[50:,5])

Out[98]:

[<matplotlib.lines.Line2D at 0x917a710>]

In [86]:

plot(log10(Pxx[:,15]))
plot(log10(Pxx[:,35]))

xlim((0, 100))

Out[86]:

(0, 100)

In [87]:

maxima = argmax(cepstrum[50:], axis=0) + 50
plot(maxima)

Out[87]:

[<matplotlib.lines.Line2D at 0x899a510>]

In [88]:

plot(maxima)
ylim((100, 150))
grid()

In [90]:

maxima[15], maxima[30]

Out[90]:

(127, 112)

X-axis for Ceptra is in time units!

In [91]:

22050.0/maxima[15], 22050.0/maxima[30]

Out[91]:

(173.62204724409449, 196.875)

In [92]:

freqs = 22050.0/maxima
plot(freqs)
grid()

In [93]:

plot(quantize_freq(freqs, quant_freqs))
grid()

In [94]:

Pxx, times, freqs, im = specgram(voice[:44100], NFFT=2048, noverlap=1024, Fs=voice_sr, scale_by_freq=False)
log_spec = log10(Pxx)
cepstrum = real(fft.rfft(log_spec, axis=0))**2
maxima = argmax(cepstrum[50:], axis=0) + 50
freqs = 22050.0/maxima

clf()
plot(freqs)

Out[94]:

[<matplotlib.lines.Line2D at 0x10a00e10>]

In [95]:

plot(quantize_freq(freqs, quant_freqs))

Out[95]:

[<matplotlib.lines.Line2D at 0x8eaa990>]

In [96]:

Pxx, times, freqs, im = specgram(guitar[:44100], NFFT=2048, noverlap=1024, Fs=guitar_sr, scale_by_freq=False)
log_spec = log10(Pxx)
cepstrum = real(fft.rfft(log_spec, axis=0))**2
maxima = argmax(cepstrum[50:], axis=0) + 50
freqs = 22050.0/maxima

clf()
plot(freqs)
grid()

In [97]:

plot(quantize_freq(freqs, quant_freqs))
grid()

YIN and SWIPE algorithms¶

More modern approaches:

De Cheveigné, A., & Kawahara, H. (2002). YIN, a fundamental frequency estimator for speech and music. The Journal of the Acoustical Society of America, 111(4), 1917. doi:10.1121/1.1458024

Camacho, A., & Harris, J. G. (2008). A sawtooth waveform inspired pitch estimator for speech and music. The Journal of the Acoustical Society of America, 124(3), 1638–52. doi:10.1121/1.2951592

By Andrés Cabrera mantaraya36@gmail.com

For course MAT 240E at UCSB

This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/