"""Temporal-centroid and zero-crossing-rate exploration of a few audio tracks.

NOTE(review): the plotting names used below (figure, plot, vlines, grid, show)
are not imported in this file; presumably it is run inside a pylab namespace
(e.g. IPython started with --pylab) -- confirm.
"""
from os import listdir, chdir

import numpy as np
from essentia.standard import AudioLoader

base_dir = '/Users/adj/Desktop/Music/'
# The tracks are opened by bare file name further down, so the working
# directory has to be the music folder.  (This replaces an IPython
# ``cd ~/Desktop/Music/`` magic, which is not valid Python.)
chdir(base_dir)
music_list = listdir(base_dir)


def getRMS(x):
    """Return the root-mean-square value of ``x``.

    The input is converted to float first so that squaring integer samples
    cannot overflow the integer dtype.
    """
    samples = np.asarray(x, dtype=float)
    return np.sqrt(np.mean(samples ** 2))


def downmix(x):
    """Collapse a multi-channel signal to mono.

    A 1-D array is returned unchanged.  Otherwise the values are averaged
    over axis 1 (treated as the channel axis), i.e. the sum is divided by the
    number of channels rather than by the number of array dimensions.
    """
    if x.ndim == 1:
        return x  # x is already mono
    return x.astype(float).mean(axis=1)


# Drop the first directory entry.
# NOTE(review): presumably a hidden file such as .DS_Store.  os.listdir
# returns entries in arbitrary order, so this slice and the fixed indices
# used below depend on this machine's listing order -- confirm.
music_list = music_list[1:]


def normalize(x):
    """Scale ``x`` to float values whose largest magnitude is 1.

    An all-zero signal is returned as a float copy instead of being divided
    by zero.
    """
    samples = np.asarray(x, dtype=float)
    abs_max = np.abs(samples).max()
    if abs_max == 0:
        return samples.copy()
    return samples / abs_max


def windowed_rms(input_sig, win_sizes=None, hop=None):
    """Compute frame-wise RMS of ``input_sig`` for several window sizes.

    Parameters:
        input_sig: 1-D sequence of samples.
        win_sizes: window lengths in samples; defaults to
            [512, 1024, 2048, 4096].  Non-integer lengths are truncated.
        hop: step between window starts in samples.  When omitted (or 0),
            half of *each* window size is used.

    Returns:
        (rms_windows, win_sizes) where ``rms_windows[i]`` is the list of RMS
        values for ``win_sizes[i]``.

    Raises:
        ValueError: if a window size is smaller than one sample.
    """
    if win_sizes is None:
        win_sizes = [512, 1024, 2048, 4096]
    samples = np.asarray(input_sig, dtype=float)  # convert once, not per frame

    rms_windows = []
    for win_size in win_sizes:
        frame_len = int(win_size)
        if frame_len < 1:
            raise ValueError("window size must be at least 1 sample")
        # Resolve the hop per window size; assigning to ``hop`` itself would
        # freeze the first window's half-size for every later window.
        step = int(hop) if hop else max(frame_len // 2, 1)
        # "+ 1" keeps the last window that still fits completely.
        starts = np.arange(0, len(samples) - frame_len + 1, step)
        rms_windows.append(
            [getRMS(samples[start:start + frame_len]) for start in starts]
        )
    return rms_windows, win_sizes


def windowed_zcr(sig_in, winsize, hop, sr=1.0):
    """Compute the frame-wise zero-crossing rate of ``sig_in``.

    Parameters:
        sig_in: 1-D sequence of samples.
        winsize: window length in samples (truncated to an integer).
        hop: step between window starts in samples (truncated to an integer).
        sr: sample rate used to scale the crossing count and the frame times.

    Returns:
        (times, zcr): the centre of each window divided by ``sr``, and the
        number of sign changes in each window scaled by ``sr / winsize``.

    Raises:
        ValueError: if ``winsize`` or ``hop`` is smaller than one sample.
    """
    samples = np.asarray(sig_in, dtype=float)
    frame_len = int(winsize)
    step = int(hop)
    if frame_len < 1 or step < 1:
        raise ValueError("winsize and hop must be at least 1 sample")

    # "+ 1" keeps the last window that still fits completely.
    win_start = np.arange(0, len(samples) - frame_len + 1, step)
    scale = sr / float(frame_len)
    zcr = np.zeros(len(win_start))
    for i, start in enumerate(win_start):
        frame = samples[start:start + frame_len]
        # A negative product of neighbouring samples marks a sign change.
        zcr[i] = scale * np.count_nonzero(frame[:-1] * frame[1:] < 0)
    times = win_start + frame_len / 2.0
    return times / float(sr), zcr


def _load_mono(filename):
    """Load ``filename`` and return (normalized mono signal, sample rate)."""
    outputs = AudioLoader(filename=filename)()
    # Index instead of unpacking a fixed number of values so that extra
    # loader outputs do not break the call.
    audio, sample_rate = outputs[0], outputs[1]
    return normalize(downmix(audio)), sample_rate


# File names of the tracks to analyse (see the ordering note above).
polonaise = music_list[16]
july = music_list[12]
campanella = music_list[22]
seashore = music_list[21]
filtering = music_list[-10]

music = []
sample_rates = []  # one entry per track, parallel to ``music``
for track_file in [polonaise, campanella, july, seashore, filtering]:
    signal, sr = _load_mono(track_file)
    music.append(signal)
    sample_rates.append(sr)
    figure()  # one figure per waveform instead of piling onto one axes
    plot(signal)

# From here on the names refer to the loaded signals, not the file names.
polonaise, campanella, july, seashore, filtering = music

L = 0.1  # analysis window length in seconds
for song, song_sr in zip(music, sample_rates):
    win_len = int(round(song_sr * L))
    w_rms, win_lens = windowed_rms(song, win_sizes=[win_len])
    sample_dur_secs = len(song) / float(song_sr)
    frame_rms = np.asarray(w_rms[0])
    time = np.linspace(0, sample_dur_secs, len(frame_rms))
    # RMS-weighted mean of the frame times.
    tc = (frame_rms * time).sum() / frame_rms.sum()
    print("%s Temporal Centroid" % tc)
    figure()
    plot(np.linspace(0, sample_dur_secs, len(song)), song, alpha=0.5)
    # The signals are normalized to [-1, 1], so the marker spans that range.
    vlines(tc, -1, 1, lw=4)
    grid()
    show()

#We can see that, despite the diversity of the genres (a couple classical solo piano pieces, a 21st century percussion quartet,
#a country song, and a sample of human speech from 'Microsound'), most of the temporal centroids are very close to the center of the track.
#Probably the most interesting result was for the second one, 'La Campanella', which has such a dramatically loud and flamboyant
#ending and such a quiet and timid beginning that its centroid is skewed toward the end a bit more than usual.

for song, song_sr in zip(music, sample_rates):
    times, zcr = windowed_zcr(song, 0.1 * song_sr, 0.05 * song_sr, song_sr)
    plot(times, zcr)
    show()

#These are all quite distinct! The first two, both being solo piano classical pieces, seem to bear the closest resemblance, although
#we see that the second piece ('La Campanella') encompasses a broader range, up to almost 6000. The country song goes all the way up
#to 18000, which isn't so easy for me to understand---perhaps there is some noise that includes high frequencies. It also seems
#to be the most time-varying. We see that the percussion piece overall has some lower values, perhaps because many of the sounds
#are inharmonic, complex tones rather than pitches. I'm not sure how to interpret the last track ('Sonogram filtering'), which is
#actually human speech being modified.
def _spectral_centroid(magnitudes, freqs):
    """Return the magnitude-weighted mean frequency of each spectrogram column.

    Columns whose magnitudes sum to zero yield NaN instead of dividing by zero.
    """
    centroids = []
    for frame in magnitudes.T:
        total = frame.sum()
        if total > 0:
            centroids.append((frame * freqs).sum() / total)
        else:
            centroids.append(float('nan'))
    return centroids


def _magnitude_variance(magnitudes):
    """Return the variance of the magnitude values of each spectrogram column.

    NOTE(review): this is the plain variance across bins, plotted as 'spread'
    below; it is not weighted by frequency around the centroid -- confirm
    that this is the intended measure.
    """
    return [frame.var() for frame in magnitudes.T]


def _spectral_entropy(magnitudes):
    """Return the entropy, in bits, of each normalized spectrogram column.

    Each column is scaled to sum to 1.  Zero-valued bins are skipped (they
    contribute nothing to the sum) so they cannot turn the result into NaN;
    a column that sums to zero yields NaN.
    """
    entropies = []
    for frame in magnitudes.T:
        total = frame.sum()
        if not total > 0:
            entropies.append(float('nan'))
            continue
        bands = frame[frame > 0] / total
        entropies.append(-(bands * log2(bands)).sum())
    return entropies


def _plot_spread_and_centroid(spread, centroid):
    """Plot spread and centroid on twin y-axes with a shared legend."""
    spread_lines = plot(spread)
    twinx()
    centroid_lines = plot(centroid, 'g')
    legend((spread_lines + centroid_lines), ('spread', 'centroid'), loc='best')
    grid()
    return spread_lines, centroid_lines


# Spread and centroid for the first two tracks, each on its own figure.
for track in music[:2]:
    figure()
    Pxx, freqs, times, im = specgram(track)
    X = sqrt(Pxx)  # power -> magnitude
    centroid = _spectral_centroid(X, freqs)
    spread = _magnitude_variance(X)
    p1, p2 = _plot_spread_and_centroid(spread, centroid)

#Even though these two (the 'Military Polonaise' and 'La Campanella') are very similar genres, both being
#solo piano Romantic pieces, the spread/centroid seems quite different! In the second, 'La Campanella',
#for the most part the spread value is under the centroid value, whereas in the Polonaise, it is above
#equally often. 'La Campanella' is full of some very high-pitched (and probably bright in timbre) ostinato
#notes, representing the bells, whereas the Polonaise overall is a bit darker and in a lower register,
#so perhaps this is the reason.

figure()
Pxx, freqs, times, im = specgram(music[-1])  # the filtered human speech
Pxx2, freqs2, times2, im2 = specgram(music[-2])  # the percussion quartet

# Both tracks use the same logarithm base (bits) so the curves are comparable.
X = sqrt(Pxx)
entropy = _spectral_entropy(X)
plot(entropy)

figure()
X = sqrt(Pxx2)
entropy = _spectral_entropy(X)
plot(entropy)

#I would have expected a much more stark difference! Not really sure how to interpret the results...
def _spectral_rolloff(magnitudes, cutoff):
    """Return the rolloff position of each spectrogram column.

    For every column this is the index of the first bin at which the
    cumulative magnitude reaches ``cutoff`` times the column total, divided
    by the number of bins.  A fresh list is built on every call, so results
    from different tracks are never mixed.
    """
    rolloff = []
    for frame in magnitudes.T:
        reached = frame.cumsum() >= cutoff * frame.sum()
        # argmax on a boolean array gives the index of the first True
        # (and 0 when nothing is True, e.g. for a frame containing NaN).
        first_bin = reached.argmax()
        rolloff.append(first_bin / float(len(frame)))
    return rolloff


figure()
Pxx, freqs, times, im = specgram(music[2], NFFT=2048, noverlap=1024)  # the country music track
Pxx2, freqs2, times2, im2 = specgram(music[-1], NFFT=2048, noverlap=1024)  # the human speech again

cutoff = 0.85

X = sqrt(Pxx)  # power -> magnitude
rolloff = _spectral_rolloff(X, cutoff)
plot(rolloff)

figure()
X = sqrt(Pxx2)
# Recomputed from scratch: the second plot shows only the second track.
rolloff = _spectral_rolloff(X, cutoff)
plot(rolloff)

#I had expected maybe the results to be somewhat similar since both tracks have vocals, so at least this isn't
#too surprising.