Owen Campbell
MAT 240E
Homework 4
rcParams['figure.figsize'] = (16, 4) #wide graphs by default
from scipy.io import wavfile
from scipy.interpolate import interp1d
from scipy.stats import moment
sources = []
sources.append((wavfile.read("media/01 Everlasting Light.wav")))
sources.append((wavfile.read("media/01 P-Funk (Wants to Get Funked Up).wav")))
sources.append((wavfile.read("media/02 fallingElevatorBlues.wav")))
sources.append((wavfile.read("media/02 The Wilhelm Scream.wav")))
sources.append((wavfile.read("media/1-01 Central Texas.wav")))
def windowed_rms(input_sig, win_sizes = [512, 1024, 2048, 4096], hop=None):
rms_windows = []
for win_size in win_sizes:
if not hop:
hop = win_size/2
window_start = arange(0, len(input_sig) - win_size, hop)
rms = []
for start in window_start:
w = input_sig[start: start+win_size].astype(float)
rms_inst = sqrt(mean(w**2))
rms.append(rms_inst)
rms_windows.append(rms)
return rms_windows, win_sizes
Zero crossing Rate
src = sources[0][1]
sr - sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
in_sig = src_mono
zc = argwhere((in_sig[:-1]*in_sig[1:]) < 0)
zc = r_[zc, argwhere(in_sig == 0)]
plot(in_sig);
plot(zc, zeros_like(zc), 'o');
duration = len(src_mono) / sr
xps = len(zc) / duration
print 'Black Keys - Everlasting Light'
print 'Zero crossings per second: {}'.format(xps)
Black Keys - Everlasting Light Zero crossings per second: 22111
src = sources[1][1]
sr - sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
in_sig = src_mono
zc = argwhere((in_sig[:-1]*in_sig[1:]) < 0)
zc = r_[zc, argwhere(in_sig == 0)]
plot(in_sig);
plot(zc, zeros_like(zc), 'o');
duration = len(src_mono) / sr
xps = len(zc) / duration
print 'Parliament - P-Funk (Wants to Get Funked Up)'
print 'Zero crossings per second: {}'.format(xps)
Parliament - P-Funk (Wants to Get Funked Up) Zero crossings per second: 20375
src = sources[2][1]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
#abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
#src_mono_norm = src_mono.astype(float) / abs_max
in_sig = src_mono
zc = argwhere((in_sig[:-1]*in_sig[1:]) < 0)
zc = r_[zc, argwhere(in_sig == 0)]
plot(in_sig);
plot(zc, zeros_like(zc), 'o');
duration = len(src_mono) / sources[0][0]
xps = len(zc) / duration
print 'Me - Falling Elevator Blues'
print 'Zero crossings per second: {}'.format(xps)
Me - Falling Elevator Blues Zero crossings per second: 21718
src = sources[3][1]
sr - sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
in_sig = src_mono
zc = argwhere((in_sig[:-1]*in_sig[1:]) < 0)
zc = r_[zc, argwhere(in_sig == 0)]
plot(in_sig);
plot(zc, zeros_like(zc), 'o');
duration = len(src_mono) / sr
xps = len(zc) / duration
print 'James Blake - The Wilhelm Scream'
print 'Zero crossings per second: {}'.format(xps)
James Blake - The Wilhelm Scream Zero crossings per second: 21209
src = sources[4][1]
sr - sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
in_sig = src_mono
zc = argwhere((in_sig[:-1]*in_sig[1:]) < 0)
zc = r_[zc, argwhere(in_sig == 0)]
plot(in_sig);
plot(zc, zeros_like(zc), 'o');
duration = len(src_mono) / sr
xps = len(zc) / duration
print 'Stars of the Lid - Central Texas'
print 'Zero crossings per second: {}'.format(xps)
Stars of the Lid - Central Texas Zero crossings per second: 21136
All these songs had very similar zero crossing rates, so I suppose that's indicative of them having similar high-frequency content. These values seem pretty high though, very close to Nyquist.
Temporal Centroid:
src = sources[0][1]
sr = sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
sample_dur_secs = len(src_mono_norm)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(src_mono)), src_mono, alpha=0.5);
vlines(tc, -10000, 10000, lw=4);
grid();
print 'Black Keys - Everlasting Light'
title('Temporal Centroid: {:.2f}% of song'.format(100*(tc/sample_dur_secs)))
103.586555312 Temporal Centroid Black Keys - Everlasting Light
<matplotlib.text.Text at 0x109439950>
src = sources[1][1]
sr = sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
sample_dur_secs = len(src_mono_norm)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(src_mono)), src_mono, alpha=0.5);
vlines(tc, -10000, 10000, lw=4);
grid();
print 'Parliament - P-Funk (Wants to Get Funked Up)'
title('Temporal Centroid: {:.2f}% of song'.format(100*(tc/sample_dur_secs)))
225.927339697 Temporal Centroid Parliament - P-Funk (Wants to Get Funked Up)
<matplotlib.text.Text at 0x1125c90d0>
src = sources[2][1]
sr = sources[2][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
sample_dur_secs = len(src_mono_norm)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(src_mono)), src_mono, alpha=0.5);
vlines(tc, -10000, 10000, lw=4);
grid();
print 'Me - Falling Elevator Blues'
title('Temporal Centroid: {:.2f}% of song'.format(100*(tc/sample_dur_secs)))
82.1355623194 Temporal Centroid Me - Falling Elevator Blues
<matplotlib.text.Text at 0x112600790>
src = sources[3][1]
sr = sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
sample_dur_secs = len(src_mono_norm)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(src_mono)), src_mono, alpha=0.5);
vlines(tc, -10000, 10000, lw=4);
grid();
print 'James Blake - The Wilhelm Scream'
title('Temporal Centroid: {:.2f}% of song'.format(100*(tc/sample_dur_secs)))
165.586535692 Temporal Centroid James Blake - The Wilhelm Scream
<matplotlib.text.Text at 0x113462f50>
src = sources[4][1]
sr = sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
sample_dur_secs = len(src_mono_norm)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(src_mono)), src_mono, alpha=0.5);
vlines(tc, -10000, 10000, lw=4);
grid();
print 'Stars of the Lid - Central Texas'
title('Temporal Centroid: {:.2f}% of song'.format(100*(tc/sample_dur_secs)))
125.790659698 Temporal Centroid Stars of the Lid - Central Texas
<matplotlib.text.Text at 0x112eedf10>
All of the songs had a temporal centroid near the middle of the song, though the James Blake and Stars of the Lid songs were slightly later (59% and 56%). This makes sense, as both of these songs start out relatively quiet and build up toward the end.
Spectral Centroid
src = sources[0][1]
sr = sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
win_len = 0.1 * sr
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
ifunc = interp1d(linspace(0, len(src_mono_norm)/float(sr), len(w_rms[0])), w_rms[0])
interp_rms = ifunc(times)
centroid_rms = array(centroid) * interp_rms
plot(times, centroid_rms, alpha=0.5)
title('Spectral Centroid Scaled by RMS')
mean_sc = sum(centroid) / len(centroid)
print 'Black Keys - Everlasting Light'
print 'Mean Spectral Centroid: {:.2f} Hz'.format(mean_sc)
Black Keys - Everlasting Light Mean Spectral Centroid: 2540.74 Hz
src = sources[1][1]
sr = sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
win_len = 0.1 * sr
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
ifunc = interp1d(linspace(0, len(src_mono_norm)/float(sr), len(w_rms[0])), w_rms[0])
interp_rms = ifunc(times)
centroid_rms = array(centroid[7:]) * interp_rms[7:]
plot(times[7:], centroid_rms, alpha=0.5) #had some 'nan' values in there
title('Spectral Centroid Scaled by RMS')
mean_sc = sum(centroid[7:]) / len(centroid[7:])
print 'Parliament - P-Funk (Wants to Get Funked Up)'
print 'Mean Spectral Centroid: {:.2f} Hz'.format(mean_sc)
Parliament - P-Funk (Wants to Get Funked Up) Mean Spectral Centroid: 3204.29 Hz
src = sources[2][1]
sr = sources[2][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
win_len = 0.1 * sr
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
ifunc = interp1d(linspace(0, len(src_mono_norm)/float(sr), len(w_rms[0])), w_rms[0])
interp_rms = ifunc(times)
centroid_rms = array(centroid) * interp_rms
plot(times, centroid_rms, alpha=0.5)
title('Spectral Centroid Scaled by RMS')
mean_sc = sum(centroid) / len(centroid)
print 'Me - Falling Elevator Blues'
print 'Mean Spectral Centroid: {} Hz'.format(mean_sc)
Me - Falling Elevator Blues Mean Spectral Centroid: 1005.60722498 Hz
src = sources[3][1]
sr = sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
win_len = 0.1 * sr
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
ifunc = interp1d(linspace(0, len(src_mono_norm)/float(sr), len(w_rms[0])), w_rms[0])
interp_rms = ifunc(times)
centroid_rms = array(centroid) * interp_rms
plot(times, centroid_rms, alpha=0.5)
title('Spectral Centroid Scaled by RMS')
mean_sc = sum(centroid) / len(centroid)
print 'James Blake - The Wilhelm Scream'
print 'Mean Spectral Centroid: {} Hz'.format(mean_sc)
James Blake - The Wilhelm Scream Mean Spectral Centroid: 1530.64502564 Hz
src = sources[4][1]
sr = sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
win_len = 0.1 * sr
w_rms, win_lens = windowed_rms(src_mono_norm, win_sizes=[win_len])
ifunc = interp1d(linspace(0, len(src_mono_norm)/float(sr), len(w_rms[0])), w_rms[0])
interp_rms = ifunc(times)
centroid_rms = array(centroid[14:]) * interp_rms[14:]# more 'nan' values
plot(times[14:], centroid_rms, alpha=0.5)
title('Spectral Centroid Scaled by RMS')
mean_sc = sum(centroid[14:]) / len(centroid_rms[14:])
print 'Stars of the Lid - Central Texas'
print 'Mean Spectral Centroid: {} Hz'.format(mean_sc)
Stars of the Lid - Central Texas Mean Spectral Centroid: 1858.89022913 Hz
The spectral centroid is really interesting to observe; along with the RMS scaling it shows a lot of information about song structure. It's pretty easy to discern the different sections of the songs. This is a situation where changes to the window size would have a big impact on the level on detail visible in these graphs.
It's interesting that the Parliament song has a mean spectral centroid of ~3200 Hz whereas all the other songs are closer to ~1500 Hz, I'm guessing it has to do with bass frequencies being less prevalent in recordings from that era due to recording / production technologies and practices. I was also struck by how 'tidy' the spectrum was in the Stars of the Lid track, it looks like there's some pretty severe lowpass filtering going on.
Spectral Spread
src = sources[0][1]
sr = sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
print 'Black Keys - Everlasting Light'
plot(spread);
title('Spectral Spread')
Black Keys - Everlasting Light
<matplotlib.text.Text at 0x111f21b50>
src = sources[1][1]
sr = sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
print 'Parliament - P-Funk (Wants to Get Funked Up)'
plot(spread);
title('Spectral Spread')
Parliament - P-Funk (Wants to Get Funked Up)
<matplotlib.text.Text at 0x10a17c650>
src = sources[2][1]
sr = sources[2][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
print 'Me - Falling Elevator Blues'
plot(spread);
title('Spectral Spread')
Me - Falling Elevator Blues
<matplotlib.text.Text at 0x10a7bbfd0>
src = sources[3][1]
sr = sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
print 'James Blake - The Wilhelm Scream'
plot(spread);
title('Spectral Spread')
James Blake - The Wilhelm Scream
<matplotlib.text.Text at 0x109daa690>
src = sources[4][1]
sr = sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
print 'Stars of the Lid - Central Texas'
plot(spread);
title('Spectral Spread')
Stars of the Lid - Central Texas
<matplotlib.text.Text at 0x103ee3290>
The spectral spread plots seem to convey somewhat similar information as the spectral centroid, but the song structure details aren't as prnounced as the RMS-scaled centroid plots. It's still possible to discern where there might be moments of heightened excitement in the song, as increased spectral spread might indicate the presence of more instruments or more tonal/harmonic movement. The window size used for this calculation has a big impact on the appearance of the plot. My initial intuition was that averaging over a relatively large window would make more sense from a musical perspective because over very short periods of time, variation due to transients or note onset would produce large spikes which don't necessarily correlate to significant changes in pitch. However, after experimenting with different window sizes I found that the largest spikes emerged with a mid-size FFT window like 2048 samples. Thinking about it more, this actually kind of makes sense, and what was wrong with my intuition was probably just my sense of scale with regards to what features can be captured in a given window size. With a small window size of say, 512, it is too short of a time to observe the type of transient variation I described above, and with a large window size like 8192, these variations just get averaged out. So when using a window size of 2048, a feature on the order of a note onset might actually resolve in one window, resulting in a higher variance calculation.
Spectral Skewness
src = sources[0][1]
sr = sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
print 'Black Keys - Everlasting Light'
plot(skewness)
title('Spectral Skewness')
Black Keys - Everlasting Light
<matplotlib.text.Text at 0x116596790>
src = sources[1][1]
sr = sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
print 'Parliament - P-Funk (Wants to Get Funked Up)'
plot(skewness)
title('Spectral Skewness')
Parliament - P-Funk (Wants to Get Funked Up)
<matplotlib.text.Text at 0x107567e50>
src = sources[2][1]
sr = sources[2][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
print 'Me - Falling Elevator Blues'
plot(skewness)
title('Spectral Skewness')
Me - Falling Elevator Blues
<matplotlib.text.Text at 0x11659f210>
src = sources[3][1]
sr = sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=2048, Fs=sr, window=window_hanning, noverlap=1024)
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
print 'James Blake - The Wilhelm Scream'
plot(skewness)
title('Spectral Skewness')
James Blake - The Wilhelm Scream
<matplotlib.text.Text at 0x106abf510>
src = sources[4][1]
sr = sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=4096, Fs=sr, window=window_hanning, noverlap=2048)
X = sqrt(Pxx)
skewness = []
for spec in X.T:
ss = moment(spec, moment=3)
skewness.append(ss)
print 'Stars of the Lid - Central Texas'
plot(skewness)
title('Spectral Skewness')
Stars of the Lid - Central Texas
<matplotlib.text.Text at 0x10941dc50>
Spectral Kurtosis
src = sources[0][1]
sr = sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=4096, Fs=sr, window=window_hanning, noverlap=2048)
X = sqrt(Pxx)
kurtosis = []
for spec in X.T:
sk = moment(spec, moment=4)
kurtosis.append(sk)
print 'Black Keys - Everlasting Light'
plot(kurtosis)
title('Spectral Kurtosis')
Black Keys - Everlasting Light
<matplotlib.text.Text at 0x108ffd750>
src = sources[1][1]
sr = sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=4096, Fs=sr, window=window_hanning, noverlap=2048)
X = sqrt(Pxx)
kurtosis = []
for spec in X.T:
sk = moment(spec, moment=4)
kurtosis.append(sk)
print 'Parliament - P-Funk (Wants to Get Funked Up)'
plot(kurtosis)
title('Spectral Kurtosis')
Parliament - P-Funk (Wants to Get Funked Up)
<matplotlib.text.Text at 0x109031890>
src = sources[2][1]
sr = sources[2][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=4096, Fs=sr, window=window_hanning, noverlap=2048)
X = sqrt(Pxx)
kurtosis = []
for spec in X.T:
sk = moment(spec, moment=4)
kurtosis.append(sk)
print 'Me - Falling Elevator Blues'
plot(kurtosis)
title('Spectral Kurtosis')
Me - Falling Elevator Blues
<matplotlib.text.Text at 0x107882690>
src = sources[3][1]
sr = sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=4096, Fs=sr, window=window_hanning, noverlap=2048)
X = sqrt(Pxx)
kurtosis = []
for spec in X.T:
sk = moment(spec, moment=4)
kurtosis.append(sk)
print 'James Blake - The Wilhelm Scream'
plot(kurtosis)
title('Spectral Kurtosis')
James Blake - The Wilhelm Scream
<matplotlib.text.Text at 0x10fee0950>
src = sources[4][1]
sr = sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=4096, Fs=sr, window=window_hanning, noverlap=2048)
X = sqrt(Pxx)
kurtosis = []
for spec in X.T:
sk = moment(spec, moment=4)
kurtosis.append(sk)
print 'Stars of the Lid - Central Texas'
plot(kurtosis)
title('Spectral Kurtosis')
Stars of the Lid - Central Texas
<matplotlib.text.Text at 0x10fca5ed0>
I've been trying to wrap my head about what spectral skewness is measuring. It's defined as a measure of shape of the frequency spectrum below the spectral centroid vs the shape above it. But how is shape being compared? The definition of spectral kurtosis makes more sense to me (a measure of how the distribution around the mean differs from that of a Gaussian distrution), but mathematically the two measures only differ by the power the difference between each frequency and the mean is raised to (aka the moment, 3rd vs 4th). Aha! This actually kind of makes sense: skewness measures lopsidedness, which requires both positive and negative values, thus the odd-numbered exponent. Kurtosis is measuring the overall difference of a given distribution from a normal distribution, which needs only high or low values, hence the even exponent. As I understand it, kurtosis describes the shape of a distribution with respect to the 'shoulders', or how much mass exists close to the mean vs out on the tails. This also tied to the curvature of the distribution, that is, how sharp or rounded the peak is. So as I understand it, skewness measures lopsidedness around the mean, while kurtosis attempts to measure 'tall, sharp, skinniness' vs 'short, round, squatness'.
So getting back to the actual data here, I'm really confused as to why none of the tracks exhibit any negative skewness. Only with a small window of 512 samples did I ever observe even a miniscule negative value. This would imply that the distribution is always slightly lopsided toward higher frequencies...this seems suspicious. Maybe some of the pre-processing steps are messing it up? Also, I found it interesting that kurtosis does not seem to vary significantly with different FFT window sizes.
Spectral Flux
src = sources[0][1]
sr = sources[0][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=1024, Fs=sr, window=window_hanning, noverlap=512)
X_mag = sqrt(Pxx)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
print 'Black Keys - Everlasting Light'
plot(linspace(0, times.max(), len(flux)), flux, lw=1, color='w')
title('Spectral Flux')
Black Keys - Everlasting Light
<matplotlib.text.Text at 0x13d390410>
src = sources[1][1]
sr = sources[1][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=1024, Fs=sr, window=window_hanning, noverlap=512)
X_mag = sqrt(Pxx)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
print 'Parliament - P-Funk (Wants to Get Funked Up)'
plot(linspace(0, times.max(), len(flux)), flux, lw=1, color='w')
title('Spectral Flux')
Parliament - P-Funk (Wants to Get Funked Up)
<matplotlib.text.Text at 0x13d17b610>
src = sources[2][1]
sr = sources[2][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=1024, Fs=sr, window=window_hanning, noverlap=512)
X_mag = sqrt(Pxx)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
print 'Me - Falling Elevator Blues'
plot(linspace(0, times.max(), len(flux)), flux, lw=1, color='w')
title('Spectral Flux')
Me - Falling Elevator Blues
<matplotlib.text.Text at 0x11b6eb310>
src = sources[3][1]
sr = sources[3][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=1024, Fs=sr, window=window_hanning, noverlap=512)
X_mag = sqrt(Pxx)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
print 'James Blake - The Wilhelm Scream'
plot(linspace(0, times.max(), len(flux)), flux, lw=1, color='w')
title('Spectral Flux')
James Blake - The Wilhelm Scream
<matplotlib.text.Text at 0x1079f6590>
src = sources[4][1]
sr = sources[4][0]
# Downmix
src_mono = sum(src.astype(float)/src.shape[1], axis=1).astype(src.dtype)
# Normalize
abs_max = max(abs(src_mono.min().astype(float)), abs(src_mono.max().astype(float)))
src_mono_norm = src_mono.astype(float) / abs_max
Pxx, freqs, times, im = specgram(src_mono_norm, NFFT=1024, Fs=sr, window=window_hanning, noverlap=512)
X_mag = sqrt(Pxx)
X1 = X_mag[:,:-1]/sum(X_mag[:,:-1])
X2 = X_mag[:, 1:]/sum(X_mag[:, 1:])
spec_diff = X2 - X1
flux = sum(spec_diff**2, axis=0)
twinx()
print 'Stars of the Lid - Central Texas'
plot(linspace(0, times.max(), len(flux)), flux, lw=1, color='w')
title('Spectral Flux')
Stars of the Lid - Central Texas
<matplotlib.text.Text at 0x152adb310>
It appears that when larger analysis windows are used, the spectral flux values take on an offset correlated with the overall energy of the spectrum, but with shorter windows this offset goes to zero. Since the amplitude spectrum is normalized prior to the calculation, this may just be due to there being more variation from window to window because more time has elapsed. Like many of the otherspectral features, spectral flux seems to be a good indicator for changes between parts of songs. It seems like it makes more sense to use window sizes on the smaller side for this feature to avoid the offset described above, which is kind of misleading. Instantaneous spectral flux shouldn't really be something that has a constant value underlying the peaks. Then again, maybe the offset observed when looking at larger windows is showing higher level features like chord changes, which might have a rhythm associated with them which would manifest as the offset.