# Load the two source audio files; wavfile.read returns (sample rate, sample array).
from scipy.io import wavfile
sr, src = wavfile.read('sources/THX.wav')
sr2, src2 = wavfile.read('sources/passport.wav')
# Downmix
# Average the channels (divide each by the channel count, then sum along
# axis 1) to get a mono signal, cast back to the original integer dtype.
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
# Scale by the largest absolute sample value so the signal lies in [-1, 1].
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
#Decimate
#from scipy.signal import decimate
#src_mono_norm_dec = decimate(src_mono_norm, 4)
#sr = sr/4.0
rcParams['figure.figsize'] = (16, 4) #wide graphs by default
mpeg7:TemporalCentroid
$e(t)$ is the energy envelope (windowed RMS)
def windowed_rms(input_sig, win_sizes=None, hop=None):
    """Compute windowed RMS energy envelopes of a signal.

    Parameters
    ----------
    input_sig : array-like
        1-D input signal.
    win_sizes : sequence of int, optional
        Window lengths in samples (default: [512, 1024, 2048, 4096]).
        Non-integer sizes (e.g. ``sr * 0.1``) are truncated to int.
    hop : int, optional
        Hop size in samples.  If omitted, each window size uses half of
        that window size.  (The original version computed the hop only
        once, from the first window size, and silently reused it for
        every subsequent size.)

    Returns
    -------
    (rms_windows, win_sizes)
        One list of RMS values per window size, plus the sizes used.
    """
    if win_sizes is None:
        win_sizes = [512, 1024, 2048, 4096]
    rms_windows = []
    for win_size in win_sizes:
        win_size = int(win_size)  # tolerate float sizes like sr * 0.1
        # Per-window default hop; never clobber the caller's argument.
        win_hop = int(hop) if hop else win_size // 2
        window_start = arange(0, len(input_sig) - win_size, win_hop)
        rms = []
        for start in window_start:
            # int() keeps slicing valid even if arange yielded floats
            w = input_sig[int(start): int(start) + win_size].astype(float)
            rms.append(sqrt(mean(w ** 2)))
        rms_windows.append(rms)
    return rms_windows, win_sizes
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
sample_dur_secs = len(src_mono_norm)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
13.4435532636 Temporal Centroid
len(w_rms[0])
499
plot(linspace(0, sample_dur_secs, len(src_mono_norm)), src_mono_norm, alpha=0.5)
vlines(tc, -1, 1, lw=4)
grid()
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src2, win_sizes=[win_len])
sample_dur_secs = len(src2)/float(sr2)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(src2)), src2, alpha=0.5)
vlines(tc, -40000, 40000, lw=4)
grid()
3.44768724169 Temporal Centroid
print "%.2f%% Temporal Centroid"%(100* tc/sample_dur_secs)
45.92% Temporal Centroid
Temporal centroid can give a broad perspective of level (is the second half louder than the first?), or it can be applied to individual instrument notes to give an indication of how the instrument decays.
absmax = max(src2.max(), abs(src2.min()))
th = absmax * 0.4
eff_dur = argwhere(abs(src2) > th)
len(eff_dur), len(src2)
(16611, 331098)
print "%.2f%% effective duration"%(len(eff_dur)/ float( len(src2)) * 100)
5.02% effective duration
absmax = max(w_rms[0])
th = absmax * 0.4
eff_dur = argwhere(w_rms[0] > th)
len(eff_dur), len(w_rms[0])
print "%.2f%% effective duration"%(len(eff_dur)/ float( len(w_rms[0])) * 100)
65.77% effective duration
Designed for individual notes to give another indication of whether the note decays quickly or holds, but it can also be used for longer content to determine the amount of "content".
absmax = max(src2.max(), abs(src2.min()))
db_th = -60
th = absmax * pow(10, db_th/20.0)
eff_dur = argwhere(src2 > th)
print "%.2f%% effective duration"%(len(eff_dur)/ float( len(src2)) * 100)
49.96% effective duration
Using similar techniques, the attack, sustain, vibrato and release time can be determined. Useful for individual notes, but if "notes" can be identified on longer content, then these measures can be applied, and we'll end up with a set of values with associated times.
in_sig = src2.astype(float)
zc = argwhere((in_sig[:-1]*in_sig[1:]) < 0)
zc = r_[zc, argwhere(in_sig == 0)]
plot(zc, zeros_like(zc), 'o')
[<matplotlib.lines.Line2D at 0x2c09110>]
plot(in_sig)
plot(zc, zeros_like(zc), 'o')
grid()
xlim(50000, 50400);
ylim((-10000, 10000));
The zero crossing rate is the frequency at which zero-crossings occur
in_sig_sr = sr2
in_sig_dur = len(in_sig)/float(in_sig_sr)
zcr = len(zc)/in_sig_dur
print zcr, 'Zero Crossing Rate'
3471.14781726 Zero Crossing Rate
Speech recognition: Aye, Y.Y.; , "Speech Recognition Using Zero-Crossing Features," Electronic Computer Technology, 2009 International Conference on , vol., no., pp.689-692, 20-22 Feb. 2009
http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4796052&tag=1
Percussive sound discrimination:
Gouyon, F., Pachet, F., & Delerue, O. (2000). On the use of zero-crossing rate for an application of classification of percussive sounds. DAFx G-6 conference on Digital Audio Effects, 3–8. Retrieved from http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.18.7006&rep=rep1&type=pdf
For very clean sounds can be used to detect pitch.
It also tends to correlate to high frequency content.
def windowed_zcr(sig_in, winsize, hop, sr = 1.0):
    """Short-time zero-crossing rate.

    Slides a window of ``winsize`` samples with step ``hop`` over
    ``sig_in``; for each window the number of sign changes between
    consecutive samples is counted and scaled by sr/winsize.
    Returns (window-center times in seconds, per-window rates).
    """
    starts = arange(0, len(sig_in) - winsize, hop)
    scale = sr / float(winsize)
    rates = zeros(len(starts))
    for idx, pos in enumerate(starts):
        frame = sig_in[pos: pos + winsize].astype(float)
        # a negative product of neighbouring samples marks a crossing
        rates[idx] = scale * sum(frame[:-1] * frame[1:] < 0)
    centers = starts + winsize / 2
    return centers / float(sr), rates
times, zcr = windowed_zcr(src2, 4410, 2205, sr2)
plot(times, zcr)
[<matplotlib.lines.Line2D at 0x2e2e4d0>]
in_sig = src_mono_norm
in_sig_sr = sr
in_sig_dur = len(in_sig)/float(in_sig_sr)
zcr = len(zc)/in_sig_dur
print zcr, 'Zero Crossing Rate'
times, zcr2 = windowed_zcr(in_sig, 4410, 2205, in_sig_sr)
plot(times, zcr2)
hlines(zcr, 0, times.max(), color='r')
hlines(zcr2.mean(), 0, times.max(), color='g')
grid()
1041.38586247 Zero Crossing Rate
Note: Auto-correlation could be grouped in this set of features, but we'll leave it for later, as it can be used for other purposes too.
Correlation can be used to quantify the similarity between two sets of data. It can be done without regards for time, as if dealing just with a set of samples from a population.
The Cross-correlation function quantifies similarities between two functions at different time lags:
$$R_{xy}(j) = \sum_n x_n y^*_{n-j}$$Because we are dealing with real signals in audio, the complex conjugate is the same as the signal.
N = 1000
sr = 44100
P = N/float(sr)
f = 1.0/P
f
44.1
ccf = []
N = 5000
for i in range(N):
cc = sum(src_mono_norm[:-N]*src_mono_norm[i:-N+i])
ccf.append(cc)
plot(ccf)
grid()
ccf = []
N = 1000
for i in range(N):
cc = sum(src2[:-N]*src2[i:-N+i])
ccf.append(cc)
plot(ccf)
[<matplotlib.lines.Line2D at 0x306ce90>]
Extracts the time varying cross-correlation function by windowing.
The maximum absolute value of the cross-correlation function is known as the Cross-correlation coefficient
acorr(src_mono_norm, maxlags = 10)
Auto-correlation can be used both for tempo and beat detection and pitch estimation. More later...
First, of course, we need the spectrum:
specgram(src_mono_norm);
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024);
Pxx.shape
(1025, 1076)
The time-varying power spectrum is stored in Pxx. Each column (second dimension) holds a single spectral frame.
specgram assumes a real only input signal. For that reason the spectrum contains $\frac{N}{2}+1\ $ points
freqs.shape
(1025,)
times.shape
(1076,)
im
<matplotlib.image.AxesImage at 0x2e24650>
plot(Pxx[:, 99])
title('The 100th spectral frame')
<matplotlib.text.Text at 0x3333c10>
times[99]
2.3219954648526078
plot(freqs,Pxx[:, 99])
title('The 100th spectral frame')
<matplotlib.text.Text at 0x5d78c90>
semilogx(freqs,Pxx[:, 99])
title('The 100th spectral frame')
<matplotlib.text.Text at 0x3211c10>
plot(freqs,Pxx[:, 199])
title('The 200th spectral frame')
<matplotlib.text.Text at 0x63bda90>
loglog(freqs,Pxx[:, 199])
title('The 200th spectral frame')
<matplotlib.text.Text at 0x65ff390>
mpeg7:SpectralCentroid
The spectral centroid correlates to pitch and brightness
X = sqrt(Pxx)
sc = sum(X[:,99]*freqs)/sum(X[:,99])
print sc
892.394746531
plot(freqs,X[:, 99])
title('The 100th spectral frame')
vlines([sc], X[:,99].min(), X[:,99].max(), lw=3)
text(sc*1.1, X[:,99].max() * 0.9, 'sc = ' + str(sc))
<matplotlib.text.Text at 0x8e06690>
sc = sum(X[:,199]*freqs)/sum(X[:,199])
plot(freqs,X[:, 199])
title('The 200th spectral frame')
vlines([sc], X[:,199].min(), X[:,199].max(), lw=3)
text(sc*1.1, X[:,199].max() * 0.9, 'sc = ' + str(sc))
<matplotlib.text.Text at 0x67f0fd0>
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
plot(times, centroid)
[<matplotlib.lines.Line2D at 0x8c01090>]
win_len = 0.1 * sr
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
from scipy.interpolate import interp1d
ifunc = interp1d(linspace(0, len(src_mono_norm)/float(sr), len(w_rms[0])), w_rms[0])
interp_rms = ifunc(times)
plot(times, array(centroid) * interp_rms)
[<matplotlib.lines.Line2D at 0x95f7390>]
max(centroid), interp_rms.max()
(4088.5601591983827, 0.42946342333040133)
plot(interp_rms)
[<matplotlib.lines.Line2D at 0x7a95890>]
plot(src_mono_norm)
[<matplotlib.lines.Line2D at 0x7ac0b10>]
Measures how diffuse or focused the spectrum is around its centroid. It is the statistical measure of variance.
mpeg7:AudioSpectrumSpread
$$\operatorname{Var}(X) = \frac{1}{n}\sum_{i=1}^n (x_i - \mu)^2$$

$$\mu = \frac{1}{n}\sum_{i=1}^n x_i $$

ss = var(X[:,99])
print ss
9.40361259916e-07
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
plot(spread)
[<matplotlib.lines.Line2D at 0x7d9b5d0>]
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
legend((p1 + p2), ('spread', 'centroid'), loc='best')
grid()
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
plot(spread)
[<matplotlib.lines.Line2D at 0x837d650>]
from scipy.signal import butter, lfilter
noise = random.random(10000) - 0.5
b, a = butter(8, [0.2, 0.4], btype='band')
b2, a2 = butter(8, [0.4, 0.6], btype='band')
b3, a3 = butter(8, [0.2, 0.6], btype='band')
filt_noise = r_[lfilter(b,a, noise), lfilter(b2,a2, noise), lfilter(b3,a3, noise)]
Pxx, freqs,times, im = specgram(filt_noise);
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
legend((p1 + p2), ('spread', 'centroid'), loc='best')
grid()
A measure of the asymmetry around the centroid. (Also called the third order moment of a distribution)
from scipy.stats import moment
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
plot(skewness)
[<matplotlib.lines.Line2D at 0x8124e50>]
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024);
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
clf()
plot(skewness)
[<matplotlib.lines.Line2D at 0xbd1a3d0>]
Pxx, freqs, times, im = specgram(src2, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024);
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
clf()
plot(skewness)
[<matplotlib.lines.Line2D at 0xbb810d0>]
Pxx, freqs,times, im = specgram(filt_noise);
kurtosis = []
for spec in X.T:
sk = moment(spec, moment=4)
kurtosis.append(sk)
plot(kurtosis)
[<matplotlib.lines.Line2D at 0xb6bc690>]
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
spread = []
kurtosis = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
sk = moment(spec, moment=4)
kurtosis.append(sk)
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
twinx()
p3 = plot(kurtosis, 'r')
legend((p1 + p2 + p3), ('spread', 'centroid', 'kurtosis'), loc='best')
grid()
Pxx3, freqs3,times3, im3 = specgram(filt_noise);
X = sqrt(Pxx3)
centroid = []
for spec in X.T:
sc = sum(spec*freqs3)/sum(spec)
centroid.append(sc)
spread = []
kurtosis = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
sk = moment(spec, moment=4)
kurtosis.append(sk)
clf()
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
twinx()
p3 = plot(kurtosis, 'r')
legend((p1 + p2 + p3), ('spread', 'centroid', 'kurtosis'), loc='upper left')
grid()
Pxx3, freqs3,times3, im3 = specgram(filt_noise, NFFT=1024, noverlap=64);
X = sqrt(Pxx3)
centroid = []
for spec in X.T:
sc = sum(spec*freqs3)/sum(spec)
centroid.append(sc)
spread = []
kurtosis = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
sk = moment(spec, moment=4)
kurtosis.append(sk)
clf()
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
twinx()
p3 = plot(kurtosis, 'r')
legend((p1 + p2 + p3), ('spread', 'centroid', 'kurtosis'), loc='upper left')
grid()
Pxx3, freqs3,times3, im3 = specgram(filt_noise, NFFT=64, noverlap=0);
X = sqrt(Pxx3)
centroid = []
for spec in X.T:
sc = sum(spec*freqs3)/sum(spec)
centroid.append(sc)
spread = []
skewness = []
kurtosis = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
se = moment(spec, moment=3)
skewness.append(se)
sk = moment(spec, moment=4)
kurtosis.append(sk)
clf()
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
twinx()
p3 = plot(skewness, 'r')
twinx()
p4 = plot(kurtosis, 'k')
legend((p1 + p2 + p3 + p4), ('spread', 'centroid', 'skewness', 'kurtosis'), loc='upper left')
grid()
Because the filter "shape" doesn't change, kurtosis remains "constant"
Entropy gives a measure of the number of bits required to represent some information. When applied to the probability mass function (PMF), entropy can also be used to measure the "peakiness" of a distribution. (i.e. the peakiness of the spectrum)
First normalize to make the spectrum a PMF:
$$ x_i = \frac{X_i}{\sum\limits_n{X_n }} $$Then extract entropy:
$$ H = -\sum\limits_{i=0}^{N}{ x_i \ln{x_i}}$$Proposed as additional feature for speech recognition: Toh, A., Togneri, R., & Nordholm, S. (2005). Spectral entropy as speech features for speech recognition. Proceedings of PEECS, (1). Retrieved from http://www.ee.uwa.edu.au/~roberto/research/theses/tr05-01.pdf
Misra, H., Ikbal, S., Bourlard, H., & Hermansky, H. (n.d.). Spectral entropy based feature for robust ASR. 2004 IEEE International Conference on Acoustics, Speech, and Signal Processing, 1, I–193–6. doi:10.1109/ICASSP.2004.1325955
Pxx3, freqs3,times3, im3 = specgram(filt_noise);
X = sqrt(Pxx3)
sum(X, axis=0).shape
(233,)
X = sqrt(Pxx3)
entropy = []
for spec in X.T:
bands = spec / sum(spec, axis=0)
entropy.append(- sum(bands*log2(bands)))
plot(entropy)
[<matplotlib.lines.Line2D at 0xc07d350>]
X = sqrt(Pxx3)
entropy = []
for spec in X.T:
bands = spec / sum(spec, axis=0)
entropy.append(- sum(bands*log2(bands)))
plot(entropy)
[<matplotlib.lines.Line2D at 0xc1b7bd0>]
Pxx, freqs,times, im = specgram(src_mono_norm);
Pxx2, freqs2,times2, im2 = specgram(src2);
clf()
X = sqrt(Pxx)
entropy = []
for spec in X.T:
bands = spec / sum(spec, axis=0)
entropy.append(- sum(bands*log2(bands)))
plot(entropy)
figure()
X = sqrt(Pxx2)
entropy = []
for spec in X.T:
bands = spec / sum(spec, axis=0)
entropy.append(- sum(bands*log2(bands)))
plot(entropy)
[<matplotlib.lines.Line2D at 0xd972710>]
Can be used to discriminate between "peaky" and flat spectra.
Quantifies the overall "slope" of the spectrum by finding where the cumulative sum crosses a threshold (usually around 85% of the spectrum)
cutoff=0.85
rolloff = []
X = sqrt(Pxx)
for spec in X.T:
where_greater = find(cumsum(spec) >= cutoff*sum(spec))
rf = where_greater[0]/float(len(spec))
rolloff.append(rf)
plot(rolloff)
[<matplotlib.lines.Line2D at 0xdc3a790>]
cutoff=0.85
rolloff = []
X = sqrt(Pxx2)
for spec in X.T:
where_greater = find(cumsum(spec) >= cutoff*sum(spec))
rf = where_greater[0]/float(len(spec))
rolloff.append(rf)
plot(rolloff)
[<matplotlib.lines.Line2D at 0xd885790>]
cutoff=0.85
rolloff = []
X = sqrt(Pxx3)
for spec in X.T:
where_greater = find(cumsum(spec) >= cutoff*sum(spec))
rf = where_greater[0]/float(len(spec))
rolloff.append(rf)
plot(rolloff)
[<matplotlib.lines.Line2D at 0x12fae9d0>]
Pxx3, freqs3,times3, im = specgram(filt_noise, NFFT=2048, noverlap=1024);
clf()
cutoff=0.85
rolloff = []
X = sqrt(Pxx3)
for spec in X.T:
where_greater = find(cumsum(spec) >= cutoff*sum(spec))
rf = where_greater[0]/float(len(spec))
rolloff.append(rf)
plot(rolloff)
[<matplotlib.lines.Line2D at 0xc32f090>]
Process:
Used often for onset detection, as this is where there is often large spectral flux.
Pxx3, freqs3,times3, im = specgram(filt_noise);
X_mag = sqrt(Pxx3)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
plot(linspace(0, times3.max(), len(flux)), flux, lw=4, color='w')
[<matplotlib.lines.Line2D at 0xc33ce50>]
Pxx3, freqs3,times3, im = specgram(src_mono_norm, NFFT=2048, noverlap=1024);
X_mag = sqrt(Pxx3)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
plot(linspace(0, times3.max(), len(flux)), flux, lw=4, color='w')
[<matplotlib.lines.Line2D at 0x16c41650>]
Pxx3, freqs3,times3, im = specgram(src2);
X_mag = sqrt(Pxx3)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
plot(linspace(0, times3.max(), len(flux)), flux, lw=4, color='w')
[<matplotlib.lines.Line2D at 0x13798510>]
There is another similar definition:
Dixon, S. (2006) Onset Detection Revisited, in Proceedings of the 9th International Conference on Digital Audio Effects (DAFx-06), Montreal, Canada, September 18-20, 2006
http://www.dafx.ca/proceedings/papers/p_133.pdf
In this definition, there is no normalization, and instead of finding the sum of the square of the changes, the sum of the half-wave rectified difference is used.
http://en.wikipedia.org/wiki/Rectifier#Half-wave_rectification
def halfwaverect(in_sig):
    """Half-wave rectification: positive samples pass through, negatives become 0."""
    # x -> (|x| + x) / 2 is the identity for x >= 0 and zero for x < 0.
    rectified = (abs(in_sig) + in_sig) / 2.0
    return rectified
Pxx3, freqs3,times3, im = specgram(filt_noise);
X = sqrt(Pxx3)
spec_diff = X[:,:-1] - X[:, 1:]
flux = sum(halfwaverect(spec_diff), axis=0)
twinx()
plot(linspace(0, times3.max(), len(flux)), flux, lw=4, color='w')
[<matplotlib.lines.Line2D at 0x13038a50>]
Pxx, freqs,times, im = specgram(src_mono_norm, NFFT=2048, noverlap=1024);
X = sqrt(Pxx)
spec_diff = X[:,:-1] - X[:, 1:]
flux = sum(halfwaverect(spec_diff), axis=0)
twinx()
plot(linspace(0, times.max(), len(flux)), flux, color='w')
[<matplotlib.lines.Line2D at 0x13f0eb90>]
Pxx2, freqs2,times2, im = specgram(src2);
X = sqrt(Pxx2)
spec_diff = X[:,:-1] - X[:, 1:]
flux = sum(halfwaverect(spec_diff), axis=0)
twinx()
plot(linspace(0, times2.max(), len(flux)), flux, color='w')
[<matplotlib.lines.Line2D at 0x1a0f4b10>]
Used widely in speech phoneme recognition, as they provide a somewhat good phoneme match independent of speaker and pitch.
Good summary of the MFCC process: http://apotheca.hpl.hp.com/ftp/pub/compaq/CRL/publications/logan/musicir_paper.pdf
http://en.wikipedia.org/wiki/Mel_frequency_cepstral_coefficient
Have their source in speech recognition, but have been applied to MIR.
The devil's in the details:
Ganchev, T., Fakotakis, N., & Kokkinakis, G. (2005). Comparative evaluation of various MFCC implementations on the speaker verification task. Proceedings of the SPECOM, 1(3), 191–194. Retrieved from [1](ftp://ftp.heanet.ie/disk1/sourceforge/v/project/vo/voicelock/etc/Comparative evaluation of various MFCC implementations on the speaker verification task.pdf)
As usual:
Pxx, freqs,times, im = specgram(src_mono_norm, NFFT=1024, noverlap=512);
But keep only the logarithm of the magnitude spectrum:
mag_spec = log10(Pxx) # The 10 factor is optional, but useful to make it dB scale
def lin2mel(flin):
    """Convert a frequency in Hz to the mel scale (O'Shaughnessy formula)."""
    return 1127 * log(flin / 700.0 + 1)
def mel2lin(fmel):
    """Inverse of lin2mel: convert a mel value back to a frequency in Hz."""
    return (exp(fmel/1127) - 1) * 700.0
frequencies = linspace(20, 5000, 1000)
plot(linspace(20, 5000, 1000), lin2mel(frequencies))
grid()
frequencies = linspace(20, 5000, 1000)
plot(linspace(20, 5000, 1000), mel2lin(frequencies))
grid()
13 linear bands first:
linear_bands = linspace(200, 1000, 13, endpoint=True)
print linear_bands
[ 200. 266.66666667 333.33333333 400. 466.66666667 533.33333333 600. 666.66666667 733.33333333 800. 866.66666667 933.33333333 1000. ]
print diff(linear_bands)[0]
66.6666666667
Then 27 exponentially spaced bands:
fc40 = 6400.0
numLogFilt = 27
logStep = exp(log(fc40/1000.0)/numLogFilt)
print logStep
1.07117028749
log_bands = [1000 * logStep**i for i in range(numLogFilt + 2)]
upper_limit = log_bands[-1]
log_bands = log_bands[1:-1]
print upper_limit
print log_bands
6855.48983996 [1071.1702874944676, 1147.4057848109805, 1229.0669843887936, 1316.5400350177026, 1410.238567807889, 1510.6056521145631, 1618.1158906663243, 1733.2776638044134, 1856.6355334451127, 1988.7728181328459, 2130.3143513605437, 2281.9294362004639, 2444.3350102169393, 2618.299035626872, 2804.6441307389241, 3004.2514598432849, 3218.0648999460059, 3447.0955040510189, 3692.4262820952167, 3955.2173221440621, 4236.711276064153, 4538.2392356126929, 4861.2270267299218, 5207.2019517981671, 5577.8000117493957, 5974.7736421722457, 6400.0000000000127]
Now create equal area triangles between the bands:
bands = r_[linear_bands, log_bands]
len(bands)
40
plot(bands)
[<matplotlib.lines.Line2D at 0x18ef48d0>]
Now the actual triangular filters:
Again from Ganchev et al.
filter_edges_ = r_[200.0 - diff(linear_bands)[0], bands ,upper_limit]
filter_edges = [(filter_edges_[i], filter_edges_[i+2]) for i in range(len(filter_edges_) - 2)]
filter_edges
[(133.33333333333331, 266.66666666666669), (200.0, 333.33333333333337), (266.66666666666669, 400.0), (333.33333333333337, 466.66666666666669), (400.0, 533.33333333333337), (466.66666666666669, 600.0), (533.33333333333337, 666.66666666666674), (600.0, 733.33333333333337), (666.66666666666674, 800.0), (733.33333333333337, 866.66666666666674), (800.0, 933.33333333333337), (866.66666666666674, 1000.0), (933.33333333333337, 1071.1702874944676), (1000.0, 1147.4057848109805), (1071.1702874944676, 1229.0669843887936), (1147.4057848109805, 1316.5400350177026), (1229.0669843887936, 1410.238567807889), (1316.5400350177026, 1510.6056521145631), (1410.238567807889, 1618.1158906663243), (1510.6056521145631, 1733.2776638044134), (1618.1158906663243, 1856.6355334451127), (1733.2776638044134, 1988.7728181328459), (1856.6355334451127, 2130.3143513605437), (1988.7728181328459, 2281.9294362004639), (2130.3143513605437, 2444.3350102169393), (2281.9294362004639, 2618.299035626872), (2444.3350102169393, 2804.6441307389241), (2618.299035626872, 3004.2514598432849), (2804.6441307389241, 3218.0648999460059), (3004.2514598432849, 3447.0955040510189), (3218.0648999460059, 3692.4262820952167), (3447.0955040510189, 3955.2173221440621), (3692.4262820952167, 4236.711276064153), (3955.2173221440621, 4538.2392356126929), (4236.711276064153, 4861.2270267299218), (4538.2392356126929, 5207.2019517981671), (4861.2270267299218, 5577.8000117493957), (5207.2019517981671, 5974.7736421722457), (5577.8000117493957, 6400.0000000000127), (5974.7736421722457, 6855.4898399646072)]
for band, edges in zip(bands, filter_edges):
print band, edges
200.0 (133.33333333333331, 266.66666666666669) 266.666666667 (200.0, 333.33333333333337) 333.333333333 (266.66666666666669, 400.0) 400.0 (333.33333333333337, 466.66666666666669) 466.666666667 (400.0, 533.33333333333337) 533.333333333 (466.66666666666669, 600.0) 600.0 (533.33333333333337, 666.66666666666674) 666.666666667 (600.0, 733.33333333333337) 733.333333333 (666.66666666666674, 800.0) 800.0 (733.33333333333337, 866.66666666666674) 866.666666667 (800.0, 933.33333333333337) 933.333333333 (866.66666666666674, 1000.0) 1000.0 (933.33333333333337, 1071.1702874944676) 1071.17028749 (1000.0, 1147.4057848109805) 1147.40578481 (1071.1702874944676, 1229.0669843887936) 1229.06698439 (1147.4057848109805, 1316.5400350177026) 1316.54003502 (1229.0669843887936, 1410.238567807889) 1410.23856781 (1316.5400350177026, 1510.6056521145631) 1510.60565211 (1410.238567807889, 1618.1158906663243) 1618.11589067 (1510.6056521145631, 1733.2776638044134) 1733.2776638 (1618.1158906663243, 1856.6355334451127) 1856.63553345 (1733.2776638044134, 1988.7728181328459) 1988.77281813 (1856.6355334451127, 2130.3143513605437) 2130.31435136 (1988.7728181328459, 2281.9294362004639) 2281.9294362 (2130.3143513605437, 2444.3350102169393) 2444.33501022 (2281.9294362004639, 2618.299035626872) 2618.29903563 (2444.3350102169393, 2804.6441307389241) 2804.64413074 (2618.299035626872, 3004.2514598432849) 3004.25145984 (2804.6441307389241, 3218.0648999460059) 3218.06489995 (3004.2514598432849, 3447.0955040510189) 3447.09550405 (3218.0648999460059, 3692.4262820952167) 3692.4262821 (3447.0955040510189, 3955.2173221440621) 3955.21732214 (3692.4262820952167, 4236.711276064153) 4236.71127606 (3955.2173221440621, 4538.2392356126929) 4538.23923561 (4236.711276064153, 4861.2270267299218) 4861.22702673 (4538.2392356126929, 5207.2019517981671) 5207.2019518 (4861.2270267299218, 5577.8000117493957) 5577.80001175 (5207.2019517981671, 5974.7736421722457) 5974.77364217 (5577.8000117493957, 6400.0000000000127) 6400.0 
(5974.7736421722457, 6855.4898399646072)
# Build the 40 triangular mel filters: for each band centre and its
# (lower, upper) edge pair, sample a triangle over the FFT bin frequencies.
# (Indentation restored: the notebook export had flattened this loop.)
mf_bands = zeros(len(bands))
freqs_hz = freqs * sr / 2.0  # if frequencies are normalized between 0-1
band_scalings = []
for band, edges in zip(bands, filter_edges):
    band_scaling = []
    # Normalize so every triangle has equal area (per Ganchev et al.).
    norm_factor = 2.0 / (edges[1] - edges[0])
    for f in freqs_hz:
        if f >= edges[0] and f <= edges[1]:
            if f < band:
                # rising slope from the lower edge up to the band centre
                band_scaling.append(norm_factor * (f - edges[0]) / (band - edges[0]))
            else:
                # falling slope from the centre down to the upper edge
                band_scaling.append(norm_factor * (edges[1] - f) / (edges[1] - band))
        else:
            band_scaling.append(0.0)
    band_scalings.append(band_scaling)
for s in band_scalings:
plot(s)
for s in band_scalings:
plot(s)
xlim((0, 100))
(0, 100)
for s in band_scalings:
plot(s)
xlim((0, 25))
(0, 25)
mag_spec.shape
(513, 2154)
count = 0
for s in mag_spec:
plot(s)
count += 1
print count
513
# Apply the filterbank: one mel-band energy per (band, frame) pair.
# (Indentation restored: the notebook export had flattened this loop.)
mf_bands = zeros((len(bands), mag_spec.shape[1]))
for i, spec in enumerate(mag_spec.T):
    for j, s in enumerate(band_scalings):
        # weighted sum of the spectral frame under triangle j
        mf_bands[j, i] = sum(spec * s)
imshow(mf_bands, aspect='auto')
<matplotlib.image.AxesImage at 0x20ee96d0>
http://en.wikipedia.org/wiki/Cepstrum
$$Cepstrum =\Big|\mathcal{F}^{-1}\Big[\mbox{log}\big(|\mathcal{F}[ f(t) ]|^2\big)\Big]\Big|^2$$But for MFCCs use the Discrete Cosine Transform instead of the DFT.
For the MFCC the DCT is applied to the individual amplitudes of the Mel scale filter bands as if they composed a signal.
plot(mf_bands[:, 1500])
[<matplotlib.lines.Line2D at 0x251aee90>]
DCT-II (second form)
# Slow version (but explicit...)
def DCT(in_sig):
    """DCT-II of a 1-D signal — explicit, unoptimized reference implementation."""
    assert ndim(in_sig) == 1
    N = len(in_sig)

    def _coefficient(k):
        # cos(pi * k * (n + 0.5) / N) sampled at n = 0 .. N-1
        angles = linspace(0.5 * k * pi/N, ((N - 1) + 0.5) * pi * k/N, N, endpoint=True)
        return sum(in_sig * cos(angles))

    return [_coefficient(k) for k in range(N)]
plot(DCT(mf_bands[:, 1500]))
[<matplotlib.lines.Line2D at 0x18efbd10>]
mfcc = []
for s in mf_bands.T:
mfcc.append(DCT(s)[:14])
imshow(array(mfcc).T, aspect='auto', interpolation='nearest')
colorbar()
<matplotlib.colorbar.Colorbar instance at 0x251bc4d0>
from scipy.fftpack import dct
mfcc = []
for s in mf_bands.T:
mfcc.append(dct(s)[:14])
imshow(array(mfcc).T, aspect='auto', interpolation='nearest')
colorbar()
<matplotlib.colorbar.Colorbar instance at 0x20f0c908>
imshow(array(mfcc)[:,1:13].T, aspect='auto', interpolation='nearest')
<matplotlib.image.AxesImage at 0x20cfc0d0>
from essentia.standard import *
w = Windowing(type = 'hann')
spectrum = Spectrum() # FFT() would return the complex FFT, here we just want the magnitude spectrum
mfcc = MFCC()
mfccs = []
audio = essentia.array(src_mono_norm)
for frame in FrameGenerator(audio, frameSize = 1024, hopSize = 512):
mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
mfccs.append(mfcc_coeffs)
mfccs = essentia.array(mfccs).T
imshow(mfccs[1:,:], aspect = 'auto', interpolation='nearest')
<matplotlib.image.AxesImage at 0x24970f90>
mfccs = []
audio = essentia.array(src2)
for frame in FrameGenerator(audio, frameSize = 1024, hopSize = 512):
mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
mfccs.append(mfcc_coeffs)
mfccs = essentia.array(mfccs).T
imshow(mfccs[1:,:], aspect = 'auto', interpolation='nearest')
<matplotlib.image.AxesImage at 0x21b276d0>
mfccs = []
audio = essentia.array(filt_noise)
for frame in FrameGenerator(audio, frameSize = 1024, hopSize = 512):
mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
mfccs.append(mfcc_coeffs)
mfccs = essentia.array(mfccs).T
imshow(mfccs[1:,:], aspect = 'auto', interpolation='nearest')
<matplotlib.image.AxesImage at 0x21b51bd0>
By: Andrés Cabrera mantaraya36@gmail.com
For Course MAT 240E at UCSB
This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/