from os import listdir, chdir
from essentia.standard import AudioLoader
cd ~/Desktop/Music/
/Users/adj/Desktop/Music
base_dir = '/Users/adj/Desktop/Music/'
music_list = listdir(base_dir)
def getRMS(x):
return sqrt(mean(x**2))
def downmix(x):
if x.ndim == 1:
return x #x is already mono
else:
return sum(x.astype(float), axis=1)/x.ndim
music_list = music_list[1:]
def normalize(x):
abs_max = max(abs(x.min().astype(float)), abs(x.max().astype(float)))
return x.astype(float) / abs_max
def windowed_rms(input_sig, win_sizes = [512, 1024, 2048, 4096], hop=None):
rms_windows = []
for win_size in win_sizes:
if not hop:
hop = win_size/2
window_start = arange(0, len(input_sig) - win_size, hop)
rms = []
for start in window_start:
w = input_sig[start: start+win_size].astype(float)
rms_inst = getRMS(w)
rms.append(rms_inst)
rms_windows.append(rms)
return rms_windows, win_sizes
polonaise = music_list[16]
polonaise
'02 Military Polonaise.m4a'
july = music_list[12]
july
'02 First of July.m4a'
campanella = music_list[22]
campanella
'03 La Campanella Etude.m4a'
seashore = music_list[21]
seashore
'03 III_ Seashore.m4a'
filtering = music_list[-10]
filtering
'62 Sonogram Filtering.m4a'
loader = AudioLoader(filename = polonaise)
song, sr, nchnls = loader()
polonaise = downmix(song)
polonaise = normalize(polonaise)
plot(polonaise)
[<matplotlib.lines.Line2D at 0x10c587d90>]
loader = AudioLoader(filename = campanella)
song, sr, nchnls = loader()
campanella = downmix(song)
campanella = normalize(campanella)
plot(campanella)
[<matplotlib.lines.Line2D at 0x10c5e8bd0>]
loader = AudioLoader(filename = july)
song, sr, nchnls = loader()
july = downmix(song)
july = normalize(july)
plot(july)
[<matplotlib.lines.Line2D at 0x10c600c50>]
loader = AudioLoader(filename = seashore)
song, sr, nchnls = loader()
seashore = downmix(song)
seashore = normalize(seashore)
plot(seashore)
[<matplotlib.lines.Line2D at 0x10c69ea90>]
loader = AudioLoader(filename = filtering)
song, sr, nchnls = loader()
filtering = downmix(song)
filtering = normalize(filtering)
plot(filtering)
[<matplotlib.lines.Line2D at 0x10c622490>]
music = [polonaise, campanella, july, seashore, filtering]
<s2 Temporal Centroid
for song in music:
L = 0.1
win_len = sr * L
w_rms, win_lens = windowed_rms(song, win_sizes=[win_len])
sample_dur_secs = len(song)/float(sr)
time = linspace(0, sample_dur_secs, len(w_rms[0]))
tc = sum(w_rms[0]*time)/sum(w_rms[0])
print tc, "Temporal Centroid"
plot(linspace(0, sample_dur_secs, len(song)), song, alpha=0.5)
vlines(tc, -40000, 40000, lw=4)
grid()
show()
164.805354789 Temporal Centroid
163.214972153 Temporal Centroid
137.429838631 Temporal Centroid
58.8143900863 Temporal Centroid
14.2174055114 Temporal Centroid
#We can see that, despite the diversity of the genres (a couple classical solo piano pieces, a 21st century percussion quartet,
#a country song, and a sample of human speech from 'Microsound', most of the temporal centroids are very close to the center of the track.
#Probably the most interesting result was for the second one, 'La Campanella', which has such a dramatically loud and flamboyant
#ending and such a quiet and timid beginning that its centroid is skewed toward the end a bit more than usual.
def windowed_zcr(sig_in, winsize, hop, sr = 1.0):
l = len(sig_in)
win_start = arange(0, l - winsize, hop)
zcr = zeros((len(win_start)))
for i, start in enumerate(win_start):
sl = sig_in[start: start + winsize].astype(float)
zcr[i] = (sr/float(winsize)) * sum(sl[:-1]*sl[1:] < 0)
times = win_start + winsize/2
return times/float(sr), zcr
for song in music:
times, zcr = windowed_zcr(song, 0.1 * sr, 0.05 * sr, sr)
plot(times, zcr)
show()
#These are all quite distinct! The first two, both being solo piano classical pieces, seem to bear the closest resemblance, although
#we see that the second piece ('La Campanella') encompasses a broader range, up to almost 6000. The country song goes all the way up
#to 18000, which isn't so easy for me to understand---perhaps there is some noise that includes high frequencies. It also seems
#to be the most time-varying. We see that the percussion piece overall has some lower values, perhaps because many of the sounds
#are inharmonic, complex tones rather than pitches. I'm not sure how to interpret the last track ('Sonogram filtering'), which is
#actually human speech being modified.
Pxx, freqs, times, im = specgram(music[0]);
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
legend((p1 + p2), ('spread', 'centroid'), loc='best')
grid()
Pxx, freqs, times, im = specgram(music[1]);
X = sqrt(Pxx)
centroid = []
for spec in X.T:
sc = sum(spec*freqs)/sum(spec)
centroid.append(sc)
spread = []
for spec in X.T:
ss = var(spec)
spread.append(ss)
p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')
legend((p1 + p2), ('spread', 'centroid'), loc='best')
grid()
#Even though these two (the 'Military Polonaise' and 'La Campanella' are very similar genres, both being
#solo piano Romantic pieces, the spread/centroid seems quite different! In the second, 'La Campanella',
#for the most part the spread value is under the centroid value, whereas in the Polonaise, it is above
#equally often. 'La Campanella' is full of some very high-pitched (and probably bright in timbre) ostinato
#notes, representing the bells, whereas the Polonaise overall is a bit darker and in a lower register,
#so perhaps this is the reason.
Pxx, freqs, times, im = specgram(music[-1]); #the filtered human speech
Pxx2, freqs2, times2, im2 = specgram(music[-2]); #the percussion quartet
X = sqrt(Pxx)
entropy = []
for spec in X.T:
bands = spec / sum(spec, axis=0)
entropy.append(- sum(bands*log(bands)))
plot(entropy)
figure()
X = sqrt(Pxx2)
entropy = []
for spec in X.T:
bands = spec / sum(spec, axis=0)
entropy.append(- sum(bands*log2(bands)))
plot(entropy)
[<matplotlib.lines.Line2D at 0x107977c50>]
#I would have expected a much more stark difference! Not really sure how to interpret the results...
Pxx, freqs, times, im = specgram(music[2], NFFT=2048, noverlap=1024); #the country music track
Pxx2, freqs2, times2, im2 = specgram(music[-1], NFFT=2048, noverlap=1024); #the human speech again
cutoff=0.85
rolloff = []
X = sqrt(Pxx)
for spec in X.T:
where_greater = find(cumsum(spec) >= cutoff*sum(spec))
rf = where_greater[0]/float(len(spec))
rolloff.append(rf)
plot(rolloff)
figure()
X = sqrt(Pxx2)
for spec in X.T:
where_greater = find(cumsum(spec) >= cutoff*sum(spec))
rf = where_greater[0]/float(len(spec))
rolloff.append(rf)
plot(rolloff)
[<matplotlib.lines.Line2D at 0x113d05bd0>]
#I had expected maybe the results to be somewhat similar since both tracks have vocals, so at least this isn't
#too surprising.
Overall, my results were somewhat mixed, perhaps as a function of my lack of understanding the suitable paramteters or the correct interpretation of these features. The most diverse results seemed to be the zero-crossing rate, although again I do not know exactly how to interpret that (my best guess was in terms of pitched, harmonic sounds vs. complex, inharmonic sounds). The two solo piano pieces, which were closest in genre, did exhibit some relevant similarities, but seemed interestingly distinct in spectral centroid/spread. It would be interesting to look at more examples of solo piano music---perhaps it could be possible to try to distinguish between the Classical and Romantic era based on some feature like spectral centroid!