Preprocessing

In [46]:

from os import listdir, chdir
from essentia.standard import AudioLoader

In [2]:

cd ~/Desktop/Music/

/Users/adj/Desktop/Music

In [3]:

base_dir = '/Users/adj/Desktop/Music/'
music_list = listdir(base_dir)

In [4]:

def getRMS(x):
    return sqrt(mean(x**2))

In [5]:

def downmix(x):
    if x.ndim == 1:
        return x     #x is already mono
    else:
        return sum(x.astype(float), axis=1)/x.ndim

In [6]:

music_list = music_list[1:]

In [7]:

def normalize(x):
    abs_max = max(abs(x.min().astype(float)), abs(x.max().astype(float)))
    return x.astype(float) / abs_max

In [8]:

def windowed_rms(input_sig, win_sizes = [512, 1024, 2048, 4096], hop=None):
    rms_windows = []
    for win_size in win_sizes:
        if not hop:
            hop = win_size/2 
        window_start = arange(0, len(input_sig) - win_size, hop)
        rms = []
        for start in window_start:
            w = input_sig[start: start+win_size].astype(float)
            rms_inst = getRMS(w)
            rms.append(rms_inst)
        rms_windows.append(rms)
    return rms_windows, win_sizes

In [9]:

polonaise = music_list[16]

In [10]:

polonaise

Out[10]:

'02 Military Polonaise.m4a'

In [11]:

july = music_list[12]

In [12]:

july

Out[12]:

'02 First of July.m4a'

In [13]:

campanella = music_list[22]

In [14]:

campanella

Out[14]:

'03 La Campanella Etude.m4a'

In [15]:

seashore = music_list[21]

In [16]:

seashore

Out[16]:

'03 III_ Seashore.m4a'

In [19]:

filtering = music_list[-10]

In [20]:

filtering

Out[20]:

'62 Sonogram Filtering.m4a'

In [28]:

loader = AudioLoader(filename = polonaise)
song, sr, nchnls = loader()

In [29]:

polonaise = downmix(song)
polonaise = normalize(polonaise)
plot(polonaise)
    

Out[29]:

[<matplotlib.lines.Line2D at 0x10c587d90>]

In [31]:

loader = AudioLoader(filename = campanella)
song, sr, nchnls = loader()
campanella = downmix(song)
campanella = normalize(campanella)
plot(campanella)

Out[31]:

[<matplotlib.lines.Line2D at 0x10c5e8bd0>]

In [32]:

loader = AudioLoader(filename = july)
song, sr, nchnls = loader()
july = downmix(song)
july = normalize(july)
plot(july)

Out[32]:

[<matplotlib.lines.Line2D at 0x10c600c50>]

In [33]:

loader = AudioLoader(filename = seashore)
song, sr, nchnls = loader()
seashore = downmix(song)
seashore = normalize(seashore)
plot(seashore)

Out[33]:

[<matplotlib.lines.Line2D at 0x10c69ea90>]

In [34]:

loader = AudioLoader(filename = filtering)
song, sr, nchnls = loader()
filtering = downmix(song)
filtering = normalize(filtering)
plot(filtering)

Out[34]:

[<matplotlib.lines.Line2D at 0x10c622490>]

In [45]:

music = [polonaise, campanella, july, seashore, filtering]

Temporal Centroids

<s2 Temporal Centroid

In [42]:

for song in music:
    
    L = 0.1
    win_len = sr * L
    
    w_rms, win_lens = windowed_rms(song, win_sizes=[win_len])
    
    sample_dur_secs = len(song)/float(sr)
    time = linspace(0, sample_dur_secs, len(w_rms[0]))
    
    tc = sum(w_rms[0]*time)/sum(w_rms[0])
    print tc, "Temporal Centroid"
    plot(linspace(0, sample_dur_secs, len(song)), song, alpha=0.5)
    vlines(tc, -40000, 40000, lw=4)
    grid()
    show()

164.805354789 Temporal Centroid

163.214972153 Temporal Centroid

137.429838631 Temporal Centroid

58.8143900863 Temporal Centroid

14.2174055114 Temporal Centroid

In [38]:

#We can see that, despite the diversity of the genres (a couple classical solo piano pieces, a 21st century percussion quartet,
#a country song, and a sample of human speech from 'Microsound', most of the temporal centroids are very close to the center of the track.
#Probably the most interesting result was for the second one, 'La Campanella', which has such a dramatically loud and flamboyant
#ending and such a quiet and timid beginning that its centroid is skewed toward the end a bit more than usual.

Zero-crossing rate

In [47]:

def windowed_zcr(sig_in, winsize, hop, sr = 1.0):
    l = len(sig_in)
    win_start = arange(0, l - winsize, hop)
    zcr = zeros((len(win_start)))
    for i, start in enumerate(win_start):
        sl = sig_in[start: start + winsize].astype(float)
        zcr[i] = (sr/float(winsize)) * sum(sl[:-1]*sl[1:] < 0)

    times = win_start + winsize/2
    return times/float(sr), zcr

In [49]:

for song in music:   
    times, zcr = windowed_zcr(song, 0.1 * sr, 0.05 * sr, sr)
    plot(times, zcr)
    show()

In [50]:

#These are all quite distinct! The first two, both being solo piano classical pieces, seem to bear the closest resemblance, although
#we see that the second piece ('La Campanella') encompasses a broader range, up to almost 6000. The country song goes all the way up
#to 18000, which isn't so easy for me to understand---perhaps there is some noise that includes high frequencies. It also seems
#to be the most time-varying. We see that the percussion piece overall has some lower values, perhaps because many of the sounds
#are inharmonic, complex tones rather than pitches. I'm not sure how to interpret the last track ('Sonogram filtering'), which is
#actually human speech being modified.

Spectral Centroid and Spread

In [63]:

Pxx, freqs, times, im = specgram(music[0]);

In [68]:

X = sqrt(Pxx)

centroid = []
for spec in X.T:
    sc = sum(spec*freqs)/sum(spec)
    centroid.append(sc)
    
spread = []
for spec in X.T:
    ss = var(spec)
    spread.append(ss)

p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')

legend((p1 + p2), ('spread', 'centroid'), loc='best')
grid()

In [69]:

Pxx, freqs, times, im = specgram(music[1]);

In [70]:

X = sqrt(Pxx)

centroid = []
for spec in X.T:
    sc = sum(spec*freqs)/sum(spec)
    centroid.append(sc)
    
spread = []
for spec in X.T:
    ss = var(spec)
    spread.append(ss)

p1 = plot(spread)
twinx()
p2 = plot(centroid, 'g')

legend((p1 + p2), ('spread', 'centroid'), loc='best')
grid()

In [73]:

#Even though these two (the 'Military Polonaise' and 'La Campanella' are very similar genres, both being 
#solo piano Romantic pieces, the spread/centroid seems quite different! In the second, 'La Campanella',
#for the most part the spread value is under the centroid value, whereas in the Polonaise, it is above 
#equally often. 'La Campanella' is full of some very high-pitched (and probably bright in timbre) ostinato
#notes, representing the bells, whereas the Polonaise overall is a bit darker and in a lower register,
#so perhaps this is the reason.

Spectral entropy

In [76]:

Pxx, freqs, times, im = specgram(music[-1]);     #the filtered human speech

In [77]:

Pxx2, freqs2, times2, im2 = specgram(music[-2]);     #the percussion quartet

In [78]:

X = sqrt(Pxx)

entropy = []
for spec in X.T:
    bands = spec / sum(spec, axis=0)
    entropy.append(- sum(bands*log(bands)))
    
plot(entropy)

figure()
X = sqrt(Pxx2)

entropy = []
for spec in X.T:
    bands = spec / sum(spec, axis=0)
    entropy.append(- sum(bands*log2(bands)))

plot(entropy)

Out[78]:

[<matplotlib.lines.Line2D at 0x107977c50>]

In [79]:

#I would have expected a much more stark difference! Not really sure how to interpret the results...

Spectral Roll-off

In [84]:

Pxx, freqs, times, im = specgram(music[2], NFFT=2048, noverlap=1024);     #the country music track

In [85]:

Pxx2, freqs2, times2, im2 = specgram(music[-1], NFFT=2048, noverlap=1024);     #the human speech again

In [86]:

cutoff=0.85
rolloff = []

X = sqrt(Pxx)

for spec in X.T:
    where_greater = find(cumsum(spec) >= cutoff*sum(spec))
    rf = where_greater[0]/float(len(spec))
    rolloff.append(rf)
    
plot(rolloff)
    
figure()

X = sqrt(Pxx2)

for spec in X.T:
    where_greater = find(cumsum(spec) >= cutoff*sum(spec))
    rf = where_greater[0]/float(len(spec))
    rolloff.append(rf)

plot(rolloff)

Out[86]:

[<matplotlib.lines.Line2D at 0x113d05bd0>]

In [87]:

#I had expected maybe the results to be somewhat similar since both tracks have vocals, so at least this isn't
#too surprising.

Conclusions

Overall, my results were somewhat mixed, perhaps as a function of my lack of understanding the suitable paramteters or the correct interpretation of these features. The most diverse results seemed to be the zero-crossing rate, although again I do not know exactly how to interpret that (my best guess was in terms of pitched, harmonic sounds vs. complex, inharmonic sounds). The two solo piano pieces, which were closest in genre, did exhibit some relevant similarities, but seemed interestingly distinct in spectral centroid/spread. It would be interesting to look at more examples of solo piano music---perhaps it could be possible to try to distinguish between the Classical and Romantic era based on some feature like spectral centroid!