import os

import numpy as np
import scipy.stats
from sklearn.metrics import roc_auc_score, average_precision_score

import pmf

# NOTE(review): the plotting calls used in this script (hist, bar, figure,
# subplot, axvline, title) are not imported anywhere in this file; presumably
# it is run in an IPython/pylab session where they are in scope -- confirm.


def _read_lines(path):
    """Return the whitespace-stripped lines of the text file at *path*."""
    # Text mode keeps the entries as native strings, so the '%s' filename
    # formatting and the tid[2:5] slicing used below behave the same on
    # Python 2 and Python 3 (binary mode would yield bytes on Python 3).
    with open(path, 'r') as f:
        return [line.strip() for line in f]


def _track_dir(tid):
    """Return the feature directory of track *tid*.

    The directory is 'vq_hist' followed by one sub-directory per character
    of tid[2:5].
    """
    return os.path.join('vq_hist', '/'.join(tid[2:5]))


def _load_bot(tid):
    """Load the '<tid>_BoT.npy' array (the per-track tag vector)."""
    return np.load(os.path.join(_track_dir(tid), '%s_BoT.npy' % tid))


def _load_vq(tid, K):
    """Load the '<tid>_K<K>.npy' array and flatten it to 1-D."""
    return np.load(os.path.join(_track_dir(tid), '%s_K%d.npy' % (tid, K))).ravel()


train_tracks = _read_lines('train_tracks.txt')
test_tracks = _read_lines('test_tracks.txt')
tags = _read_lines('voc.txt')


def construct_pred_mask(tags_predicted, predictat):
    """Mark the `predictat` highest-scoring tags of every sample.

    Parameters
    ----------
    tags_predicted : 2-D array (n_samples, n_tags) of tag scores.
    predictat : number of top-scoring tags to keep per sample.

    Returns
    -------
    Boolean array with the shape of `tags_predicted`; True at the positions
    of the `predictat` largest scores in each row.
    """
    n_samples, n_tags = tags_predicted.shape
    # Sorting the negated scores gives a descending ranking per row.
    rankings = np.argsort(-tags_predicted, axis=1)[:, :predictat]
    tags_predicted_binary = np.zeros_like(tags_predicted, dtype=bool)
    # Row indices broadcast against the (n_samples, predictat) column indices.
    rows = np.arange(n_samples)[:, np.newaxis]
    tags_predicted_binary[rows, rankings] = True
    return tags_predicted_binary


def per_tag_prec_recall(tags_predicted_binary, tags_true_binary):
    """Compute per-tag precision and recall from two boolean masks.

    Parameters
    ----------
    tags_predicted_binary : boolean array (n_samples, n_tags), predicted tags.
    tags_true_binary : boolean array (n_samples, n_tags), ground-truth tags.

    Returns
    -------
    prec : array with one precision value per tag. Tags that were never
        predicted get 0 (the denominator is padded with np.spacing(1)).
    recall : array of recall values, restricted to the tags that occur at
        least once in the ground truth (so it may be shorter than `prec`).
    """
    hits = np.logical_and(tags_predicted_binary, tags_true_binary).sum(axis=0)
    # np.spacing(1) keeps the division defined for never-predicted tags.
    prec = hits / (tags_predicted_binary.sum(axis=0) + np.spacing(1))
    tags_true_count = tags_true_binary.sum(axis=0).astype(float)
    idx = (tags_true_count > 0)
    recall = hits[idx] / tags_true_count[idx]
    return prec, recall


def aroc_ap(tags_true_binary, tags_predicted):
    """Compute per-tag ROC AUC and average precision.

    Parameters
    ----------
    tags_true_binary : boolean array (n_samples, n_tags), ground-truth tags.
    tags_predicted : array (n_samples, n_tags) of tag scores.

    Returns
    -------
    auc, aprec : two lists with one entry per evaluated tag. Tags whose
        ground truth contains a single class (no positive sample, or only
        positive samples) are skipped, because ROC AUC is undefined there.
    """
    n_samples, n_tags = tags_true_binary.shape
    auc = list()
    aprec = list()
    for i in range(n_tags):
        y_true = tags_true_binary[:, i]
        n_pos = np.count_nonzero(y_true)
        if n_pos == 0 or n_pos == n_samples:
            # Only one class present: the ROC curve cannot be computed.
            continue
        auc.append(roc_auc_score(y_true, tags_predicted[:, i]))
        aprec.append(average_precision_score(y_true, tags_predicted[:, i]))
    return auc, aprec


def print_out_metrics(tags_true_binary, tags_predicted, predictat):
    """Print precision, recall, F-score, AROC and AP for a set of predictions.

    Precision, recall, AROC and AP are printed as the mean over tags followed
    by the standard error (std / sqrt(count)) in parentheses. Precision and
    recall are computed after keeping the `predictat` top-scoring tags per
    sample; the F-score is computed from the mean precision and mean recall.

    Parameters
    ----------
    tags_true_binary : boolean array (n_samples, n_tags), ground-truth tags.
    tags_predicted : array (n_samples, n_tags) of tag scores.
    predictat : number of top-scoring tags treated as predicted per sample.
    """
    tags_predicted_binary = construct_pred_mask(tags_predicted, predictat)
    prec, recall = per_tag_prec_recall(tags_predicted_binary, tags_true_binary)
    mprec, mrecall = np.mean(prec), np.mean(recall)

    # np.sqrt is used explicitly: a bare `sqrt` is not defined in this module.
    print('Precision = %.3f (%.3f)' % (mprec, np.std(prec) / np.sqrt(prec.size)))
    print('Recall = %.3f (%.3f)' % (mrecall, np.std(recall) / np.sqrt(recall.size)))
    print('F-score = %.3f' % (2 * mprec * mrecall / (mprec + mrecall)))

    auc, aprec = aroc_ap(tags_true_binary, tags_predicted)
    print('AROC = %.3f (%.3f)' % (np.mean(auc), np.std(auc) / np.sqrt(len(auc))))
    print('AP = %.3f (%.3f)' % (np.mean(aprec), np.std(aprec) / np.sqrt(len(aprec))))


# take tracks with at least 20 tags
test_tracks_selected = list()
selected_bots = list()
for tid in test_tracks:
    bot = _load_bot(tid)
    if (bot > 0).sum() >= 20:
        test_tracks_selected.append(tid)
        selected_bots.append(bot)
# Stack once at the end: this avoids re-copying the growing matrix for every
# selected track and yields a 2-D array even when a single track is selected.
y_test = np.vstack(selected_bots) if selected_bots else None

# Distribution of the number of tags per selected test track.
hist(np.sum((y_test > 0), axis=1), bins=50)

K = 512
n_subset = 10000
np.random.seed(98765)
train_tracks_subset = np.random.choice(train_tracks, size=n_subset, replace=False)

# Each training row is the K-bin VQ histogram followed by the binarized tags.
D = K + len(tags)
X = np.empty((n_subset, D), dtype=np.int16)
for (i, tid) in enumerate(train_tracks_subset):
    vq = _load_vq(tid, K)
    bot = _load_bot(tid)
    bot[bot > 0] = 1
    X[i] = np.hstack((vq, bot))

bar(np.arange(D), X[1000])

# Test rows contain the VQ histogram only; the tags are what gets predicted.
X_test = np.empty((len(test_tracks_selected), K), dtype=np.int16)
for (i, tid) in enumerate(test_tracks_selected):
    X_test[i] = _load_vq(tid, K)

n_components = 100
coder = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
coder.fit(X)

# randomly plot 30 "topics"
indices = np.random.choice(n_components, size=30, replace=False)
figure(figsize=(45, 15))
for i in range(30):
    subplot(10, 3, i + 1)
    topic = coder.Eb[indices[i]].copy()
    # properly normalize the BoT dimensions for visualization purposes
    topic[K:] /= topic[K:].max()
    topic[K:] *= topic[:K].max()
    bar(np.arange(D), topic)
    axvline(x=K, color='red')
    title('Component #%d' % indices[i])
#savefig('dict.eps')

# Build a tagger that only sees the first K (VQ) columns of the learned
# components and use it to encode the test histograms.
tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
tagger.set_components(coder.gamma_b[:, :K], coder.rho_b)
Et = tagger.transform(X_test)
# Evaluate the batch coder: normalize the activations of each test track,
# then project them onto the tag part (columns K onward) of the components.
Et /= Et.sum(axis=1, keepdims=True)
tags_predicted = Et.dot(coder.Eb[:, K:])

print('%s %s' % (tags_predicted.min(), tags_predicted.max()))

# Subtract div_factor times the per-tag mean score from every prediction.
div_factor = 3
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)

predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)

# Same experiment with the online (stochastic) coder on the in-memory subset.
n_components = 100
online_coder = pmf.OnlinePoissonMF(n_components=n_components, batch_size=500, n_pass=1,
                                   random_state=98765, verbose=True)
online_coder.fit(X, est_total=len(train_tracks))

plot(online_coder.bound)

# Entropy of every learned component, plotted in decreasing order.
ents = np.zeros((n_components, ))
for k in range(n_components):
    ents[k] = scipy.stats.entropy(online_coder.Eb[k])
idx = np.argsort(-ents)
plot(ents[idx], '-o')

tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
tagger.set_components(online_coder.gamma_b[:, :K], online_coder.rho_b[:, :K])

Et = tagger.transform(X_test)
Et /= Et.sum(axis=1, keepdims=True)
tags_predicted = Et.dot(online_coder.Eb[:, K:])
n_samples, n_tags = tags_predicted.shape

print('%s %s' % (tags_predicted.min(), tags_predicted.max()))

div_factor = 3
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)

predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)


# very naive implementation of out-of-core fit for stochastic PMF
def ooc_fit(obj, train_tracks, K, n_feats):
    """Fit `obj` on mini-batches that are read from disk one at a time.

    Parameters
    ----------
    obj : model object exposing `batch_size`, `n_pass`, `shuffle`,
        `_init_components`, `set_learning_rate`, `partial_fit` and
        `_stoch_bound` (used here with pmf.OnlinePoissonMF).
    train_tracks : sequence of track ids; the features of a track are read
        from '<tid>_K<K>.npy' and '<tid>_BoT.npy' under 'vq_hist'.
    K : codebook size that appears in the VQ-histogram filename.
    n_feats : length of one feature row (VQ histogram followed by the tags).

    Returns
    -------
    The same `obj`, after fitting. `obj.bound` is reset to a list holding the
    value of `obj._stoch_bound` for every processed mini-batch.
    """
    n_samples = len(train_tracks)
    obj._scale = float(n_samples) / obj.batch_size
    obj._init_components(n_feats)
    obj.bound = list()
    for count in range(obj.n_pass):
        print('Iteration %d: passing through the data...' % count)
        indices = np.arange(n_samples)
        if obj.shuffle:
            np.random.shuffle(indices)
        # Mini-batch numbering starts at 1 and restarts on every pass.
        for (i, istart) in enumerate(range(0, n_samples, obj.batch_size), 1):
            print('Mini-batch %d:' % i)
            # The last mini-batch may be shorter than batch_size.
            iend = min(istart + obj.batch_size, n_samples)
            obj.set_learning_rate(iter=i)
            mini_batch = np.zeros((iend - istart, n_feats))
            for s in range(iend - istart):
                tid = train_tracks[indices[istart + s]]
                #print '\tRead in track: %s' % tid
                tdir = os.path.join('vq_hist', '/'.join(tid[2:5]))
                vq = np.load(os.path.join(tdir, '%s_K%d.npy' % (tid, K))).ravel()
                bot = np.load(os.path.join(tdir, '%s_BoT.npy' % tid))
                # Binarize the tag vector before appending it to the histogram.
                bot[bot > 0] = 1
                mini_batch[s] = np.hstack((vq, bot))
            obj.partial_fit(mini_batch)
            obj.bound.append(obj._stoch_bound(mini_batch))
    return obj


# Online coder fitted out-of-core on the full training set.
n_components = 100
batch_size = 1000
online_coder_full = pmf.OnlinePoissonMF(n_components=n_components, batch_size=batch_size,
                                        n_pass=1, random_state=98765, verbose=True)

online_coder_full = ooc_fit(online_coder_full, train_tracks, K, D)

plot(online_coder_full.bound)

ents = np.zeros((n_components, ))
for k in range(n_components):
    ents[k] = scipy.stats.entropy(online_coder_full.Eb[k])
idx = np.argsort(-ents)
plot(ents[idx], '-o')

tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
tagger.set_components(online_coder_full.gamma_b[:, :K], online_coder_full.rho_b[:, :K])

Et = tagger.transform(X_test)
Et /= Et.sum(axis=1, keepdims=True)
tags_predicted = Et.dot(online_coder_full.Eb[:, K:])

print('%s %s' % (tags_predicted.min(), tags_predicted.max()))

div_factor = 3
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)

predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)