rcParams['figure.figsize'] = (16, 4) #wide graphs by default
Tzanetakis, G., Ermolinskyi, A., & Cook, P. (2003). Pitch histograms in audio and symbolic music information retrieval. Journal of New Music Research. Retrieved from http://www.tandfonline.com/doi/abs/10.1076/jnmr.32.2.143.16743
The accuracy and relevance of this measure depends on the pitch detection algorithm used.
The above article proposes using an enhanced version of the autocorrelation method that can eliminate octave-shift errors.
from essentia.standard import *
# Load the track as a mono float array (Essentia MonoLoader; default 44.1 kHz).
loader = MonoLoader(filename='sources/Bob Marley - Buffalo Soldier.mp3')
audio = loader()
def windowed_f0(input_sig, win_size, hop=None, sr=1.0, maxlags=None, fmax=4000.0):
    """Estimate a fundamental-frequency contour by windowed autocorrelation.

    Slides a win_size-sample window across input_sig (step = hop, default
    win_size/2), autocorrelates each window with matplotlib's acorr, and
    converts the strongest lag peak to a frequency.

    Parameters
    ----------
    input_sig : 1-D array of audio samples
    win_size  : analysis window length in samples
    hop       : hop size in samples (defaults to win_size/2)
    sr        : sample rate in Hz (default 1.0, i.e. lag units == samples)
    maxlags   : maximum autocorrelation lag (defaults to win_size/4)
    fmax      : highest frequency of interest; lags shorter than sr/fmax
                samples are excluded from the peak search

    Returns
    -------
    (times, f0) : window-centre times in seconds and per-window f0 in Hz

    NOTE(review): acorr() draws into the current axes as a side effect, hence
    the clf() before returning.  round(L) used as a slice bound relies on
    Python 2 semantics (int-compatible); under Python 3 it would need int().
    """
    if not hop:
        hop = win_size/2
    if not maxlags:
        maxlags = win_size/4
    # Minimum lag (in samples) corresponding to fmax; shorter lags are ignored.
    L = float(sr)/fmax
    window_start = arange(0, len(input_sig) - win_size, hop)
    acorrfs = []
    for start in window_start:
        w = input_sig[start: start+win_size]
        # acorr returns lags -maxlags..maxlags; column maxlags is lag 0.
        lags, acorr_inst, lines, line = acorr(w, maxlags=maxlags)
        acorrfs.append(acorr_inst)
    times = (window_start + win_size/2)/float(sr)
    # Peak search over the negative-lag half only, excluding columns whose
    # lag magnitude is below L (i.e. frequencies above fmax).
    apeaks = argmax(array(acorrfs)[:,:maxlags - round(L)], axis=1)
    # Column index -> lag magnitude (maxlags - apeaks) -> frequency.
    f0 = float(sr)/(maxlags - apeaks)
    clf()
    return times, f0
# Track f0 over the first 441000 samples (10 s at 44.1 kHz): 2048-sample
# windows, 1024-sample hop, 512 max autocorrelation lags.
times, f0 = windowed_f0(audio[:441000], 2048, 1024, 44100, 512)
plot(times, f0)
[<matplotlib.lines.Line2D at 0x2def7e10>]
def midi2Hz(midinote, tuning=440.0):
    """Convert a MIDI note number to its frequency in Hz.

    Equal temperament, referenced to `tuning` Hz at MIDI note 69 (A4).
    """
    semitones_from_a4 = midinote - 69
    return tuning * 2.0 ** (semitones_from_a4 / 12.0)
num_freqs = 8*12 # eight octaves from C0
# Quantization grid: frequencies of MIDI notes 24..119 (C0 upward).
quant_freqs = [midi2Hz(i + 24) for i in range(num_freqs)]
def quantize_freq(freq_list, quant_freqs, quant_offset=24):
    """Quantize each frequency to the index of the nearest grid frequency.

    Parameters
    ----------
    freq_list    : sequence of frequencies in Hz (e.g. a frame-wise f0 track)
    quant_freqs  : ascending sequence of quantization frequencies in Hz
    quant_offset : value added to every quantized index (default 24 maps
                   index 0 of the 8-octave C0 grid above to MIDI note 24)

    Returns
    -------
    Array (same length as freq_list) of nearest-grid indices plus
    quant_offset.  Frequencies at or below the first grid entry — including
    unvoiced frames reported as 0 Hz — and frequencies above the whole grid
    both quantize to index 0, matching the original behavior.
    """
    grid = asarray(quant_freqs)
    quantized = zeros_like(freq_list)
    for i in range(len(freq_list)):
        arg = argwhere(grid > freq_list[i])
        # Fix: argwhere rows are length-1 arrays; using them directly as list
        # indices is a TypeError on modern NumPy.  Extract a plain int first.
        if arg.size == 0 or int(arg[0, 0]) == 0:
            quantized[i] = 0
        else:
            k = int(arg[0, 0])
            # Pick whichever neighbour (above or below) is closer in Hz.
            if grid[k] - freq_list[i] > freq_list[i] - grid[k - 1]:
                quantized[i] = k - 1
            else:
                quantized[i] = k
    return quantized + quant_offset
# Quantize the autocorrelation-based f0 track to MIDI note numbers and show
# its pitch histogram (MIDI notes 40..99).
f0_q = quantize_freq(f0, quant_freqs)
plot(f0_q)
[<matplotlib.lines.Line2D at 0x3034fd10>]
hist(f0_q, arange(40, 100));
# Re-estimate pitch with Essentia's spectral YIN (PitchYinFFT), which also
# returns a per-frame confidence value.
pitches = []
pitchconfs = []
w = Windowing(type = 'hann')
spectrum = Spectrum() # FFT() would return the complex FFT, here we just want the magnitude spectrum
pitchyin = PitchYinFFT()
for frame in FrameGenerator(audio, frameSize = 2048, hopSize = 1024):
    pitch, pconfidence = pitchyin(spectrum(w(frame)))
    pitches.append(pitch)
    pitchconfs.append(pconfidence)
plot(pitches)
[<matplotlib.lines.Line2D at 0x231d1750>]
plot(pitchconfs)
[<matplotlib.lines.Line2D at 0x2decf850>]
# Zoom into 100 frames, overlaying pitch and confidence on twin y-axes.
plot(pitches[1000:1100])
twinx()
plot(pitchconfs[1000:1100], 'g')
[<matplotlib.lines.Line2D at 0x2f54c410>]
# Pitch histogram of the YIN-based track, for comparison with the above.
f0_q = quantize_freq(pitches, quant_freqs)
plot(f0_q)
[<matplotlib.lines.Line2D at 0x2f562c10>]
hist(f0_q, arange(40, 100));
# Build pitch histograms (MIDI notes 40..100) for four contrasting tracks.
# The per-bin counts (n_wonder, n_dylan, n_marley, n_palestrina) are kept
# for the folded pitch-class plots below.
loader = essentia.standard.MonoLoader(filename = 'sources/Stevie Wonder - Superstition.mp3')
audio = loader()
f0 = []
pitchconfs = []
for frame in FrameGenerator(audio, frameSize = 2048, hopSize = 1024):
    pitch, pconfidence = pitchyin(spectrum(w(frame)))
    f0.append(pitch)
    pitchconfs.append(pconfidence)
f0_q = quantize_freq(f0, quant_freqs)
n_wonder, bins, patches = hist(f0_q, arange(40, 101));
loader = essentia.standard.MonoLoader(filename = 'sources/Bob Dylan - Canadee-I-O.mp3')
dylan = loader()
f0 = []
pitchconfs = []
for frame in FrameGenerator(dylan, frameSize = 2048, hopSize = 1024):
    pitch, pconfidence = pitchyin(spectrum(w(frame)))
    f0.append(pitch)
    pitchconfs.append(pconfidence)
f0_q = quantize_freq(f0, quant_freqs)
n_dylan, bins, patches = hist(f0_q, arange(40, 101));
loader = essentia.standard.MonoLoader(filename = 'sources/Bob Marley - Buffalo Soldier.mp3')
marley = loader()
f0 = []
pitchconfs = []
for frame in FrameGenerator(marley, frameSize = 2048, hopSize = 1024):
    pitch, pconfidence = pitchyin(spectrum(w(frame)))
    f0.append(pitch)
    pitchconfs.append(pconfidence)
f0_q = quantize_freq(f0, quant_freqs)
n_marley, bins, patches = hist(f0_q, arange(40, 101));
loader = essentia.standard.MonoLoader(filename = 'sources/Palestrina-Gloria.mp3')
palestrina = loader()
f0 = []
pitchconfs = []
for frame in FrameGenerator(palestrina, frameSize = 2048, hopSize = 1024):
    pitch, pconfidence = pitchyin(spectrum(w(frame)))
    f0.append(pitch)
    pitchconfs.append(pconfidence)
f0_q = quantize_freq(f0, quant_freqs)
n_palestrina, bins, patches = hist(f0_q, arange(40, 101));
# Confidence track of the last (Palestrina) run.
plot(pitchconfs)
[<matplotlib.lines.Line2D at 0x4025efd0>]
# Collapse the 60-bin note histogram (5 octaves x 12 semitones) into
# 12 pitch classes.
pclasses = n_wonder.reshape(5, 12).sum(axis=0)
bar(arange(12), pclasses)
<Container object of 12 artists>
Folded pitch histograms are generally organized by fifths (C -> G -> D ... etc.)
# Reorder the pitch classes along the circle of fifths: (i*5) mod 12 walks
# the classes in fifth steps.
pclasses = n_wonder.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Superstition)')
<matplotlib.text.Text at 0x3d2efe90>
# Same folded histogram for the other three tracks.
pclasses = n_dylan.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Dylan)')
<matplotlib.text.Text at 0x255e1a10>
pclasses = n_marley.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Marley)')
<matplotlib.text.Text at 0x1dc73b90>
pclasses = n_palestrina.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Palestrina)')
<matplotlib.text.Text at 0x1c721950>
Similar to the pitch histogram, but the notes are counted directly from the FFT bins.
# Chromagram: energy accumulated per pitch class directly from spectral
# bins (librosa), as opposed to the f0-based histograms above.
loader = MonoLoader(filename='sources/Stevie Wonder - Superstition.mp3')
superstition = loader()
import librosa
cm = librosa.feature.chromagram(superstition, 44100, hop_length=1024)
librosa.display.specshow(cm, sr=44100, hop_length=1024, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
title('Chromagram')
colorbar()
tight_layout()
# Sum over time to get total energy per pitch class.
bar(arange(12), sum(cm, axis=1))
<Container object of 12 artists>
cm.shape
(12, 11457)
cm = sum(cm, axis=1)
# Rotate the bin order by 3 — presumably to align chroma bin 0 with the
# pitch-histogram reference; verify against librosa's chroma bin labeling.
cm = r_[cm[3:],cm[:3]]
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(cm[pc])
    pcs.append(pc)
# NOTE(review): foldedpcs is computed but the bar plot uses cm (unfolded);
# bar(arange(12), foldedpcs) was possibly intended.
bar(arange(12), cm)
xticks(arange(12) + 0.5, pcs);
# Folded pitch histogram (from YIN) for 'Superstition', for side-by-side
# comparison with the chroma-based plot above.
pclasses = n_wonder.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Superstition)')
<matplotlib.text.Text at 0x2622d650>
# Chromagram and per-pitch-class totals for the Palestrina recording.
cm = librosa.feature.chromagram(palestrina, 44100, hop_length=1024)
librosa.display.specshow(cm, sr=44100, hop_length=1024, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
title('Chromagram')
colorbar()
tight_layout()
cm = sum(cm, axis=1)
cm = r_[cm[3:],cm[:3]]
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(cm[pc])
    pcs.append(pc)
bar(arange(12), cm)
xticks(arange(12) + 0.5, pcs);
pclasses = n_palestrina.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Palestrina)')
<matplotlib.text.Text at 0x29b77a50>
# Same comparison for the Dylan recording.
cm = librosa.feature.chromagram(dylan, 44100, hop_length=1024)
librosa.display.specshow(cm, sr=44100, hop_length=1024, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
title('Chromagram')
colorbar()
tight_layout()
bar(arange(12), sum(cm, axis=1))
<Container object of 12 artists>
cm = sum(cm, axis=1)
cm = r_[cm[3:],cm[:3]]
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(cm[pc])
    pcs.append(pc)
bar(arange(12), cm)
xticks(arange(12) + 0.5, pcs);
pclasses = n_dylan.reshape(5, 12).sum(axis=0)
foldedpcs = []
pcs = []
for i in range(12):
    pc = (i * 5)%12
    foldedpcs.append(pclasses[pc])
    pcs.append(pc)
bar(arange(12), foldedpcs)
xticks(arange(12) + 0.5, pcs);
title('Folded pitch histogram (Dylan)')
<matplotlib.text.Text at 0x39f202d0>
def windowed_rms(input_sig, win_size, hop=None, sr=1.0):
    """Compute a windowed RMS (energy) envelope of a signal.

    Parameters
    ----------
    input_sig : 1-D array of audio samples
    win_size  : analysis window length in samples
    hop       : hop size in samples; defaults to half the window
    sr        : sample rate in Hz, used only for the time axis

    Returns
    -------
    (times, rms) : window-centre times in seconds and per-window RMS values
    """
    if not hop:
        # Bug fix: original read 'winsize/2' (undefined name), so omitting
        # hop raised NameError.  // keeps the hop an integer.
        hop = win_size // 2
    rms = []
    window_start = arange(0, len(input_sig) - win_size, hop)
    for start in window_start:
        w = input_sig[start: start+win_size].astype(float)
        rms_inst = sqrt(mean(w**2))
        rms.append(rms_inst)
    times = (window_start + win_size/2)/float(sr)
    return times, rms
# RMS envelope of 'Superstition' (4096-sample windows, 512-sample hop).
times, super_rms = windowed_rms(superstition, 4096, 512, 44100)
plot(times, super_rms)
[<matplotlib.lines.Line2D at 0x2f9da810>]
from scipy.signal import decimate
# Downsample the envelope 3x before computing self-similarity.
rms_dec = decimate(super_rms, 3)
plot(rms_dec)
[<matplotlib.lines.Line2D at 0x1b6c6690>]
plot(rms_dec[:100])
[<matplotlib.lines.Line2D at 0x30022110>]
# Self-similarity matrix: Pearson correlation between every pair of
# 200-frame envelope windows, hop 10 (O(n^2) pairs).
ss_win_size = 200
ss_hop = 10
in_sig = rms_dec
win_start = arange(0, len(in_sig)- ss_win_size, ss_hop)
ssm = []
for starti in win_start:
    for startj in win_start:
        wini = in_sig[starti: starti+ss_win_size]
        winj = in_sig[startj: startj+ss_win_size]
        ssm.append(corrcoef(wini, winj)[0, 1])
ssm = array(ssm).reshape(len(win_start), len(win_start))
ssm.shape
(744, 744)
imshow(ssm, cmap='gray')
#xticks(linspace(0, 754, 8),linspace(0, times[-1], 8).astype(int))
#ylim((0, 44))
gcf().set_figheight(8)
# Same envelope self-similarity analysis for the Dylan track ...
ss_win_size = 200
ss_hop = 10
times, super_rms = windowed_rms(dylan, 4096, 512, 44100)
rms_dec = decimate(super_rms, 3)
in_sig = rms_dec
win_start = arange(0, len(in_sig)- ss_win_size, ss_hop)
ssm = []
for starti in win_start:
    for startj in win_start:
        wini = in_sig[starti: starti+ss_win_size]
        winj = in_sig[startj: startj+ss_win_size]
        ssm.append(corrcoef(wini, winj)[0, 1])
ssm = array(ssm).reshape(len(win_start), len(win_start))
imshow(ssm, cmap='gray')
#xticks(linspace(0, ssm.shape[0], 8),linspace(0, times[-1], 8).astype(int))
#ylim((0, 44))
gcf().set_figheight(8)
# ... and for the Palestrina track, this time with a time axis in seconds.
ss_win_size = 200
ss_hop = 10
times, super_rms = windowed_rms(palestrina, 4096, 512, 44100)
rms_dec = decimate(super_rms, 3)
in_sig = rms_dec
win_start = arange(0, len(in_sig)- ss_win_size, ss_hop)
ssm = []
for starti in win_start:
    for startj in win_start:
        wini = in_sig[starti: starti+ss_win_size]
        winj = in_sig[startj: startj+ss_win_size]
        ssm.append(corrcoef(wini, winj)[0, 1])
ssm = array(ssm).reshape(len(win_start), len(win_start))
imshow(ssm, cmap='gray')
xticks(linspace(0, ssm.shape[0], 8),linspace(0, times[-1], 8).astype(int))
#ylim((0, 44))
gcf().set_figheight(8)
Self-similarity matrices can be applied to any feature.
Foote, J. (1999). Visualizing music and audio using self-similarity. Proceedings of the Seventh ACM International Conference on Multimedia (Part 1) - MULTIMEDIA ’99, 77–80. doi:10.1145/319463.319472
# MFCC demo in Essentia's standard (imperative) mode; inspect the
# algorithm's built-in documentation with help().
loader = MonoLoader(filename = 'sources/Dire Straits - Walk of life.mp3')
dire = loader()
mfcc = MFCC()
help(MFCC)
Help on class Algo in module essentia.standard: class Algo(Algorithm) | MFCC | | | Inputs: | | [vector_real] spectrum - the audio spectrum | | | Outputs: | | [vector_real] bands - the energies in mel bands | [vector_real] mfcc - the mel frequency cepstrum coefficients | | | Parameters: | | highFrequencyBound: | real ∈ (0,inf) (default = 11000) | the upper bound of the frequency range [Hz] | | lowFrequencyBound: | real ∈ [0,inf) (default = 0) | the lower bound of the frequency range [Hz] | | numberBands: | integer ∈ [1,inf) (default = 40) | the number of mel-bands in the filter | | numberCoefficients: | integer ∈ [1,inf) (default = 13) | the number of output mel coefficients | | sampleRate: | real ∈ (0,inf) (default = 44100) | the sampling rate of the audio signal [Hz] | | | Description: | | This algorithm computes the mel-frequency cepstrum coefficients. | As there is no standard implementation, the MFCC-FB40 is used by default: | - filterbank of 40 bands from 0 to 11000Hz | - take the log value of the spectrum energy in each mel band | - DCT of the 40 bands down to 13 mel coefficients | There is a paper describing various MFCC implementations [1]. | | This algorithm depends on the algorithms MelBands and DCT and therefore | inherits their parameter restrictions. An exception is thrown if any of these | restrictions are not met. The input "spectrum" is passed to the MelBands | algorithm and thus imposes MelBands' input requirements. Exceptions are | inherited by MelBands as well as by DCT. | | References: | [1] T. Ganchev, N. Fakotakis, and G. Kokkinakis, "Comparative evaluation | of various MFCC implementations on the speaker verification task," in | International Conference on Speach and Computer (SPECOM’05), 2005, vol. | 1, | pp. 191–194. 
| | [2] Mel-frequency cepstrum - Wikipedia, the free encyclopedia, | http://en.wikipedia.org/wiki/Mel_frequency_cepstral_coefficient | | Method resolution order: | Algo | Algorithm | __builtin__.object | | Methods defined here: | | __call__(self, *args) | | __init__(self, **kwargs) | | __str__(self) | | compute(self, *args) | | configure(self, **kwargs) | | ---------------------------------------------------------------------- | Data descriptors defined here: | | __dict__ | dictionary for instance variables (if defined) | | __weakref__ | list of weak references to the object (if defined) | | ---------------------------------------------------------------------- | Data and other attributes defined here: | | __struct__ = {'description': 'This algorithm computes the mel-frequenc... | | ---------------------------------------------------------------------- | Methods inherited from Algorithm: | | __compute__(...) | compute the algorithm | | __configure__(...) | Configure the algorithm | | getDoc(...) | Returns the doc string for the algorithm | | getStruct(...) | Returns the doc struct for the algorithm | | inputNames(...) | Returns the names of the inputs of the algorithm. | | inputType(...) | Returns the type of the input given by its name | | name(...) | Returns the name of the algorithm. | | outputNames(...) | Returns the names of the outputs of the algorithm. | | paramType(...) | Returns the type of the parameter given by its name | | paramValue(...) | Returns the value of the parameter or None if not yet configured | | parameterNames(...) | Returns the names of the parameters for this algorithm. | | reset(...) | Reset the algorithm to its initial state (if any). | | ---------------------------------------------------------------------- | Data and other attributes inherited from Algorithm: | | __new__ = <built-in method __new__ of type object> | T.__new__(S, ...) -> a new object with type S, a subtype of T
# Magnitude spectrum of a single 1024-sample frame starting at 5 s.
w = Windowing(type = 'hann')
spectrum = Spectrum() # FFT() would return the complex FFT, here we just want the magnitude spectrum
frame = dire[5*44100 : 5*44100 + 1024]
spec = spectrum(w(frame))
plot(spec)
[<matplotlib.lines.Line2D at 0x1b8cbb10>]
# Frame-wise MFCCs, transposed so rows are coefficients and columns frames.
mfccs = []
for frame in FrameGenerator(dire, frameSize = 1024, hopSize = 512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    mfccs.append(mfcc_coeffs)
mfccs = array(mfccs).T
# The first coefficient row is skipped for display.
imshow(mfccs[1:,:], aspect='auto')
<matplotlib.image.AxesImage at 0x1b8ed8d0>
from essentia.streaming import *
# This will overwrite the previous "standard" mode imports!
Starts the same way:
loader = MonoLoader(filename = 'sources/Dire Straits - Walk of life.mp3')
frameCutter = FrameCutter(frameSize = 1024, hopSize = 512)
w = Windowing(type = 'hann')
spec = Spectrum()
mfcc = MFCC()
But then things are "connected" in a graph
# In streaming mode, >> connects one algorithm's output to the next's input.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> mfcc.spectrum
<essentia.streaming._StreamConnector instance at 0x1b8f9b00>
And then run:
essentia.run(loader)
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) <ipython-input-83-5a68facf7b1d> in <module>() ----> 1 essentia.run(loader) /usr/local/lib/python2.7/dist-packages/essentia/__init__.py in run(gen) 145 if isinstance(gen, VectorInput) and not gen.connections.values()[0]: 146 raise EssentiaError('VectorInput is not connected to anything...') --> 147 return _essentia.run(gen) 148 149 log.debug(EPython, 'Successfully imported essentia python module (log fully available and synchronized with the C++ one)') RuntimeError: MFCC::bands is not connected to any sink...
Oops! We must remember to connect the outputs too!
# Every algorithm output must be connected before running; route the MFCC
# outputs into a Pool under named descriptors.
pool = essentia.Pool()
mfcc.bands >> (pool, 'lowlevel.mfcc_bands')
mfcc.mfcc >> (pool, 'lowlevel.mfcc')
essentia.run(loader)
print 'Pool contains %d frames of MFCCs' % len(pool['lowlevel.mfcc'])
Pool contains 21752 frames of MFCCs
# Display the MFCCs (first coefficient dropped) and the mel-band energies.
imshow(pool['lowlevel.mfcc'].T[1:,:], aspect = 'auto')
figure()
imshow(pool['lowlevel.mfcc_bands'].T, aspect = 'auto', interpolation = 'nearest')
<matplotlib.image.AxesImage at 0x368e6d0>
Graphs can be reconfigured:
# Reroute the mfcc output from the pool to a text-file sink, then reset
# the network and run it again.
mfcc.mfcc.disconnect((pool, 'lowlevel.mfcc'))
fileout = FileOutput(filename = 'mfccframes.txt')
mfcc.mfcc >> fileout
# reset the network otherwise the loader in particular will not do anything useful
essentia.reset(loader)
# and rerun it!
essentia.run(loader)
# Self-similarity over the first 3000 MFCC frames: dot product between each
# pair of coefficient vectors (first coefficient dropped).
mfcc_coefs = pool['lowlevel.mfcc'][:3000,1:]
mfcc_coefs.shape
(3000, 12)
ssm = []
for mfcc_i in mfcc_coefs:
    for mfcc_j in mfcc_coefs:
        cc = sum(mfcc_i * mfcc_j)
        ssm.append(cc)
ssm = array(ssm).reshape(len(mfcc_coefs), len(mfcc_coefs))
imshow(ssm)
<matplotlib.image.AxesImage at 0x4443fd0>
ssm = array(ssm).reshape(len(mfcc_coefs), len(mfcc_coefs))
imshow(ssm[:3000, :3000], cmap='gray', interpolation='nearest')
<matplotlib.image.AxesImage at 0x447fb90>
# Streaming-mode MFCC self-similarity matrices for two more tracks:
# Marley ...
loader = MonoLoader(filename = 'sources/Bob Marley - Buffalo Soldier.mp3')
frameCutter = FrameCutter(frameSize = 1024, hopSize = 512)
w = Windowing(type = 'hann')
spec = Spectrum()
mfcc = MFCC()
pool = essentia.Pool()
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> mfcc.spectrum
mfcc.bands >> (pool, 'lowlevel.mfcc_bands')
mfcc.mfcc >> (pool, 'lowlevel.mfcc')
essentia.run(loader)
mfcc_coefs = pool['lowlevel.mfcc'][:3000,1:]
ssm = []
for mfcc_i in mfcc_coefs:
    for mfcc_j in mfcc_coefs:
        cc = sum(mfcc_i * mfcc_j)
        ssm.append(cc)
ssm = array(ssm).reshape(len(mfcc_coefs), len(mfcc_coefs))
imshow(ssm[:3000, :3000], cmap='gray', interpolation='nearest')
<matplotlib.image.AxesImage at 0x1ae72950>
# ... and Messiaen.
loader = MonoLoader(filename = 'sources/Messiaen-Turangalila4.mp3')
frameCutter = FrameCutter(frameSize = 1024, hopSize = 512)
w = Windowing(type = 'hann')
spec = Spectrum()
mfcc = MFCC()
pool = essentia.Pool()
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> mfcc.spectrum
mfcc.bands >> (pool, 'lowlevel.mfcc_bands')
mfcc.mfcc >> (pool, 'lowlevel.mfcc')
essentia.run(loader)
mfcc_coefs = pool['lowlevel.mfcc'][:3000,1:]
ssm = []
for mfcc_i in mfcc_coefs:
    for mfcc_j in mfcc_coefs:
        cc = sum(mfcc_i * mfcc_j)
        ssm.append(cc)
ssm = array(ssm).reshape(len(mfcc_coefs), len(mfcc_coefs))
imshow(ssm[:3000, :3000], cmap='gray', interpolation='nearest')
<matplotlib.image.AxesImage at 0x1a51d790>
from essentia import *
from essentia.standard import *
# Frame-wise spectral centroid, scaled by 44100/2 (Nyquist) to Hz — assumes
# Centroid returns a normalized value; then aggregated with PoolAggregator.
filename = 'sources/Isaac Hayes - Out Of The Ghetto.mp3'
audio = MonoLoader(filename = filename)()
# create the pool and the necessary algorithms
pool = Pool()
w = Windowing()
spec = Spectrum()
centroid = Centroid()
# compute the centroid for all frames in our audio and add it to the pool
for frame in FrameGenerator(audio, frameSize = 1024, hopSize = 512):
    c = centroid(spec(w(frame))) * 44100/2
    pool.add('lowlevel.centroid', c)
aggrpool = PoolAggregator(defaultStats = [ 'mean', 'var' ])(pool)
aggrpool.descriptorNames()
['lowlevel.centroid.mean', 'lowlevel.centroid.var']
aggrpool['lowlevel.centroid.mean']
2627.591796875
aggrpool['lowlevel.centroid.var']
2143515.0
# Standard deviation from the variance (IPython's _ is the last result).
sqrt(_)
1464.0747931714418
Proposed by Tzanetakis and Cook to model the time needed to identify higher level features (e.g. identify instruments or genres)
# Texture windows: mean and variance of the frame-wise centroid over
# 1-second chunks (86 frames at 512-sample hop, 44.1 kHz).
centroid = pool['lowlevel.centroid']
plot(centroid)
[<matplotlib.lines.Line2D at 0x1a527190>]
centroid.shape
(29938,)
tex_win_time = 1.0 #seconds
hopsize = 512
sr = 44100
tex_win_size = int(tex_win_time * sr/hopsize)
tex_win_size
86
win_start = arange(0, len(centroid), tex_win_size)
centroid_tex_mean = []
centroid_tex_var = []
for start in win_start:
    win = centroid[start: start + tex_win_size]
    centroid_tex_mean.append(mean(win))
    centroid_tex_var.append(var(win))
# Overlay mean and variance on twin y-axes.
plot(centroid_tex_mean)
twinx()
plot(centroid_tex_var, 'g')
[<matplotlib.lines.Line2D at 0x2c26e290>]
# Repeat the texture-window analysis end-to-end for the Messiaen excerpt.
filename = 'sources/Messiaen-Turangalila4.mp3'
tex_win_time = 1.0 #seconds
hopsize = 512
sr = 44100
tex_win_size = int(tex_win_time * sr/hopsize)
tex_win_size
audio = MonoLoader(filename = filename, sampleRate=sr)()
# create the pool and the necessary algorithms
pool = Pool()
w = Windowing()
spec = Spectrum()
centroid = Centroid()
# compute the centroid for all frames in our audio and add it to the pool
for frame in FrameGenerator(audio, frameSize = 1024, hopSize = hopsize):
    c = centroid(spec(w(frame))) * sr/2
    pool.add('lowlevel.centroid', c)
centroid = pool['lowlevel.centroid']
win_start = arange(0, len(centroid), tex_win_size)
centroid_tex_mean = []
centroid_tex_var = []
for start in win_start:
    win = centroid[start: start + tex_win_size]
    centroid_tex_mean.append(mean(win))
    centroid_tex_var.append(var(win))
plot(centroid_tex_mean)
twinx()
plot(centroid_tex_var, 'g')
[<matplotlib.lines.Line2D at 0x26834c10>]
"The first five peaks of the autocorrelation function are detected and their corresponding periodicities in beats per minute (bpm) are calculated and added in a “beat” histogram"
G. Tzanetakis and P. Cook. Musical genre classification of audio signals. IEEE Transactions on Speech and Audio Processing, 10(5):293–302, 2002
from IPython.core.display import Image
# Display the beat-histogram illustrations (Tzanetakis & Cook, 2002).
Image(filename='Beat_histogram.png')
Essentially finding "sub-tempos" in the autocorrelation function to find other significant rhythmic subdivisions, and their relationships.
Image(filename='beat_hist.png')
from sklearn import mixture
# Gaussian-mixture toy example: two 2-D clusters, one sheared by C and one
# shifted to (20, 20).
n_samples = 300
# generate random sample, two components
random.seed(0)
C = array([[0., -0.7], [3.5, .7]])
X_train = r_[np.dot(np.random.randn(n_samples, 2), C),
np.random.randn(n_samples, 2) + np.array([20, 20])]
scatter(X_train[:, 0], X_train[:, 1], .8)
<matplotlib.collections.PathCollection at 0x4ef4190>
# Fit a 2-component GMM and contour its density over a grid.
# NOTE(review): GMM.eval was renamed score_samples in sklearn 0.14 and
# removed in 0.16 (see the DeprecationWarning below).
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(X_train)
x = linspace(-20.0, 30.0)
y = linspace(-20.0, 40.0)
X, Y = meshgrid(x, y)
XX = c_[X.ravel(), Y.ravel()]
Z = np.log(-clf.eval(XX)[0])
Z = Z.reshape(X.shape)
CS = contour(X, Y, Z)
CB = colorbar(CS, shrink=0.8, extend='both')
scatter(X_train[:, 0], X_train[:, 1], .8)
axis('tight')
/usr/local/lib/python2.7/dist-packages/scikit_learn-0.14.1-py2.7-linux-x86_64.egg/sklearn/utils/__init__.py:93: DeprecationWarning: Function eval is deprecated; GMM.eval was renamed to GMM.score_samples in 0.14 and will be removed in 0.16. warnings.warn(msg, category=DeprecationWarning)
(-22.5, 32.5, -23.0, 43.0)
# Classify individual points with the fitted mixture.
clf.predict([(0, 10)])
array([0])
clf.predict([(25, 20)])
array([1])
clf.predict([(-20, 20)]), clf.predict([(-20, 30)])
(array([0]), array([1]))
# Partition a few test points by predicted component.
points = [(-20, 20), (-20, 30), (25, 20), (0, 10)]
points0 = []
points1 = []
for p in points:
    cls = clf.predict([p])
    if cls[0] == 1:
        points1.append(p)
    else:
        points0.append(p)
print points1
print points0
[(-20, 30), (25, 20)] [(-20, 20), (0, 10)]
# Refit and replot the density with the classified points overlaid.
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(X_train)
x = linspace(-20.0, 30.0)
y = linspace(-20.0, 40.0)
X, Y = meshgrid(x, y)
XX = c_[X.ravel(), Y.ravel()]
Z = np.log(-clf.eval(XX)[0])
Z = Z.reshape(X.shape)
CS = contour(X, Y, Z)
# NOTE(review): zip(points0) yields one-element tuples, not separate x/y
# sequences -- zip(*points0) was probably intended; verify the plot.
scatter(*zip(points0))
scatter(*zip(points1), color='g')
axis('tight');
/usr/local/lib/python2.7/dist-packages/scikit_learn-0.14.1-py2.7-linux-x86_64.egg/sklearn/utils/__init__.py:93: DeprecationWarning: Function eval is deprecated; GMM.eval was renamed to GMM.score_samples in 0.14 and will be removed in 0.16. warnings.warn(msg, category=DeprecationWarning)
# Posterior probability of each mixture component for one point.
clf.predict_proba([(25, 20)])
array([[ 3.15136965e-103, 1.00000000e+000]])
# Minimal SVM example: fit an SVC on two points and classify new ones.
from sklearn import svm
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC()
clf.fit(X, y)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
clf.predict([[2., 2.]])
array([1])
clf.predict([[0., 0.1]])
array([0])
By Andrés Cabrera mantaraya36@gmail.com
For course MAT 240E at UCSB
This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/