"""Turn the Last.fm tags of the Million Song Dataset into bag-of-tags vectors.

The script runs top to bottom (it has no ``main`` guard) and does three things:

1. Builds a tag vocabulary from ``unique_tags.txt``: tags with at least 1000
   counts, minus a hand-written blacklist, merged under a sanitized
   (lower-cased, stemmed, punctuation-free) form.
2. Writes, for every track of the artists listed in ``artists_train.txt`` /
   ``artists_test.txt``, one line ``<track>\\t<idx>:<val> <idx>:<val> ...`` to
   ``tracks_tag_train.num`` / ``tracks_tag_test.num``.
3. Expands each of those lines into a dense vector saved as
   ``vq_hist/<c>/<c>/<c>/<track>_BoT.npy``.

The file handling (byte-mode files split on ``'\\t'``) follows the original
Python 2 code.
"""
try:
    import cPickle as pickle
except ImportError:  # Python 3 has no cPickle module
    import pickle
import json
import os
import re
import sqlite3
from contextlib import closing

import nltk
import numpy as np
from nltk.stem import PorterStemmer

MSD_DIR = u'/q/boar/boar-p9/MillionSong/'
MSD_LFM_ROOT = os.path.join(MSD_DIR, 'Lastfm')
MSD_ADD = os.path.join(MSD_DIR, 'AdditionalFiles')

tags_dbfile = os.path.join(MSD_LFM_ROOT, 'lastfm_tags.db')
uniq_tag_f = os.path.join(MSD_LFM_ROOT, 'unique_tags.txt')
md_dbfile = 'track_metadata.db'


# shameless steal from
# https://github.com/bmcfee/hypergraph_playlist/blob/master/buildTagmatrix.py
def getVocab(dbc):
    """Return every value of ``tags.tag`` as a list, in query order.

    :param dbc: open sqlite3 connection to the Last.fm tags database.
    """
    cur = dbc.cursor()
    cur.execute('''SELECT tag FROM tags''')
    return [term for (term,) in cur]


def getTrackRows(dbc):
    """Map every value of ``tids.tid`` to its 1-based position in query order.

    :param dbc: open sqlite3 connection to the Last.fm tags database.
    :return: dict ``{track_id: row_number}`` with row numbers starting at 1.
    """
    cur = dbc.cursor()
    cur.execute('''SELECT tid FROM tids''')
    return {track: i for (i, (track,)) in enumerate(cur, 1)}


# ``with connection:`` only wraps a transaction; ``closing`` makes sure the
# connection itself is released once the lookup tables are in memory.
with closing(sqlite3.connect(tags_dbfile)) as dbc:
    vocab = getVocab(dbc)
    tid = getTrackRows(dbc)


def tid_to_dir(base_dir, tid, ext='.h5'):
    """Return ``base_dir/<c3>/<c4>/<c5>/<tid><ext>``.

    The three sub-directories are characters 2, 3 and 4 (0-based) of ``tid``.
    """
    return os.path.join(base_dir, '/'.join(tid[2:5]), tid + ext)


stemmer = PorterStemmer()

# Compiled once; the patterns are the ones ``sanitize`` has always used.
_AND_RE = re.compile('(&| n )')
_NON_ALNUM_RE = re.compile(r'(\W|_)+')


def sanitize(tag):
    """Collapse a raw tag into its canonical vocabulary key.

    The tag is lower-cased, tokenized with ``nltk.word_tokenize``, each token
    is Porter-stemmed, the stems are re-joined with single spaces, ``&`` and
    `` n `` are replaced by ``and``, and finally every run of non-word
    characters or underscores is removed.
    """
    stems = [stemmer.stem(token) for token in nltk.word_tokenize(tag.lower())]
    joined = _AND_RE.sub('and', ' '.join(stems))
    return _NON_ALNUM_RE.sub('', joined)


# Raw tags that are excluded from the vocabulary.
filtered_tags = (
    # favorite/like/love/blabla
    'favorites', 'Favorite', 'Favourites', 'favourite', 'favorite songs',
    'Favourite Songs', 'favorite song', 'songs i love',
    'lovedbybeyondwithin', 'Love it', 'love at first listen', 'fav',
    'my favorite', 'top 40', 'songs I absolutely love', 'favs',
    'My Favorites', 'Favorite Artists', 'All time favourites',
    'personal favourites', 'favouritestreamable', 'favorite tracks',
    'Favorite Bands', 'like it', 'I love this song', 'rex ferric faves',
    'love to death', 'my gang 09', 'My Favourites', 'BeatbabeBop selection',
    'I Like It', 'newbest', 'top', 'IIIIIIIIII AMAZING TRACK :D IIIIIIIIII',
    'best songs of the 80s', 'LOVE LOVE LOVE', 'i love it', 'most loved',
    'favorite by this group', 'amayzes loved', 'DJPMan-loved-tracks',
    'best of 2008', 'loved', 'Makes Me Smile', '77davez-all-tracks',
    'My pop music', 'best songs ever', 'favorite by this singer', 'I like',
    'my music', 'Soundtrack Of My Life', 'UK top 40', 'Like',
    'malloy2000 playlist - top songs - classical to metal', 'loved tracks',
    'top artists', 'all time favorites', 'best songs of the 00s',
    'favourite tracks', 'Solomusika-Loved', 'all time faves',
    'british i like', 'Jills Station', 'de todo mio favoritos', 'Faves',
    'Fave', 'acclaimed music top 3000', 'top 2000', 'leapsandloved',
    'Radiotsar approved',
    # great/awesome/blabla
    'kick ass', 'wonderful', 'excellent', 'Great Lyricists', 'badass',
    'awesomeness', 'great song', 'Awesome', 'cool', 'amazing', 'good',
    'nice', 'sweet', 'best', 'FUCKING AWESOME', 'lovely', 'Good Stuff',
    'brilliant', 'feel good', 'perfect', 'all the best', 'cute', 'the best',
    '<3', 'interesting', 'feelgood', 'pretty', 'i feel good', 'good shit',
    'good music', 'good song', 'great songs', 'yeah', 'best song ever',
    'wow', 'worship', 'makes me happy', 'ok', 'damned good', 'underrated',
    'Perfection', 'super',
    # rating
    '1', '3', '4', '5', '4 Stars', '3 stars', '4 Star', '3 star', '3-star',
    # year
    '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
    '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
    '2010', '00s', '10s', '1950s', '1960s', '1970s', '1980s', '1990s',
    '2000s', '20th Century', '21st century', "50's", '50s', "60's", '60s',
    '60s Gold', "70's", '70s', "80's", '80s', '80s Pop', '80s rock', "90's",
    '90s', '90s Rock',
    # descriptive
    'songwriter', 'singer-songwriter', 'cover', 'covers', 'seen live',
    'heard on Pandora', 'title is a full sentence', 'Retro', 'Miscellaneous',
    'collection', 'billboard number ones', 'ost', 'cover song',
    'singer songwriter', 'new', 'download', 'over 5 minutes long',
    'Soundtracks', 'under two minutes', 'albums I own', 'cover songs',
    'Radio', 'heard on last-fm', 'Soundtrack',
    # I don't know what you are talking about
    'buy', 'lol', 'us', 'other', '2giveme5',
    'i am a party girl here is my soundtrack', 'names', 'Tag', 'check out',
    'f', 'test', 'out of our heads', 'me', 'I want back to the 80s',
    '9 lbs hammer', 'yes', 'streamable track wants', 'aitch', 'slgdmbestof',
    'gotanygoodmusic', 'Brems tagg radio', 'gh 3', 'Sousaphonic AOTM 201102',
    'fH Projex', 'GH10', 'Ion B radio', 'ik ben', 'quarkzangsun v1',
)
# Set view of the blacklist so the per-line membership test is O(1).
_filtered_tag_set = frozenset(filtered_tags)

# sanitized tag -> list of the raw tags that collapse onto it
stag_to_tag = dict()

# we only pick the tags with >= 1000 counts, otherwise it's just too noisy
# e.g. "writing papers to pay for the college you have gotten into" has 13
# counts
with open(uniq_tag_f, 'rb') as f:
    for line in f:
        # The whole body stays inside the ``try`` on purpose: any ValueError
        # raised while parsing *or* sanitizing a line is reported and the
        # line is skipped, exactly as before.
        try:
            tag, count = line.strip().split('\t', 2)
            if int(count) >= 1000:
                if tag not in _filtered_tag_set:
                    stag_to_tag.setdefault(sanitize(tag), []).append(tag)
            else:
                # since the file is ordered by count
                break
        except ValueError as e:
            print('The following line raises the error: %s' % (e,))
            # there is one line with no tag information, but with less than
            # 1000 counts
            print(line)

tags = sorted(stag_to_tag.keys())

# NOTE(review): ``stag_to_tag`` is replaced here by the content of
# stag_to_tag.json, which nothing visible in this file writes, while ``tags``
# (and therefore ``voc_to_num``) still comes from the loop above -- confirm
# the JSON file matches the freshly computed mapping.
with open('stag_to_tag.json') as f:
    stag_to_tag = json.load(f)

# sanitized tag -> column index in the bag-of-tags vector
voc_to_num = {tag: i for (i, tag) in enumerate(tags)}


def getArtistTracks(cur, aid):
    """Yield the ``track_id`` of every row in ``songs`` for artist ``aid``.

    :param cur: sqlite3 cursor on the track metadata database; the query is
        executed on it and the results are read back from the same cursor.
    :param aid: artist id to look up.
    """
    # Bound parameter instead of string formatting, so quoting in ``aid``
    # cannot break (or alter) the statement.
    cur.execute("SELECT track_id FROM songs WHERE artist_id=?", (aid,))
    for (track,) in cur:
        yield track


def getValidTrackTags(cur, track, tid, vocab, voc_to_num):
    """Collect the in-vocabulary tag weights of one track.

    :param cur: sqlite3 cursor on the Last.fm tags database.
    :param track: track id; must be a key of ``tid``.
    :param tid: mapping track id -> 1-based row number (see ``getTrackRows``).
    :param vocab: list of raw tags; ``tid_tag.tag`` indexes it 1-based.
    :param voc_to_num: mapping sanitized tag -> column index.
    :return: dict ``{column index: weight}``. Raw tags that sanitize to the
        same key have their weights added, and such a sum is capped at 100.
    """
    cur.execute("SELECT tag, val FROM tid_tag WHERE tid = ? AND val > 0",
                (tid[track],))
    out = {}
    for (tag, val) in cur:
        stag = sanitize(vocab[tag - 1])
        if stag not in voc_to_num:
            continue
        col = voc_to_num[stag]
        if col in out:
            out[col] = min(100, out[col] + float(val))
        else:
            out[col] = float(val)
    return out


def numberize(infile, outfile, cur_md, cur_td, tid, vocab, voc_to_num):
    """Write the sparse tag representation of all tracks of the listed artists.

    :param infile: text file with one artist id per line.
    :param outfile: destination file; one line per track that has at least
        one in-vocabulary tag, formatted ``<track>\\t<idx>:<val> ...``.
    :param cur_md: cursor on the track metadata database.
    :param cur_td: cursor on the Last.fm tags database.
    :param tid: mapping track id -> row number; tracks missing from it are
        skipped.
    :param vocab: list of raw tags (see ``getValidTrackTags``).
    :param voc_to_num: mapping sanitized tag -> column index.
    """
    with open(infile, 'rb') as fr, open(outfile, 'wb') as fw:
        for line in fr:
            aid = line.strip()
            for track in getArtistTracks(cur_md, aid):
                if track not in tid:
                    continue
                out = getValidTrackTags(cur_td, track, tid, vocab, voc_to_num)
                if not out:
                    continue
                pairs = ' '.join('%d:%.1f' % pair for pair in out.items())
                fw.write('%s\t%s\n' % (track, pairs))


# turn the whole MSD tags to numbers
with closing(sqlite3.connect(md_dbfile)) as conn_md, \
        closing(sqlite3.connect(tags_dbfile)) as conn_td:
    cur_md = conn_md.cursor()
    cur_td = conn_td.cursor()
    # artists_train.txt and artists_test.txt can be obtained from
    # https://github.com/tbertinmahieux/MSongsDB/tree/master/Tasks_Demos/Tagging
    numberize('artists_train.txt', 'tracks_tag_train.num', cur_md, cur_td,
              tid, vocab, voc_to_num)
    numberize('artists_test.txt', 'tracks_tag_test.num', cur_md, cur_td,
              tid, vocab, voc_to_num)


def densify_and_save(infile, ncol):
    """Expand each sparse line of ``infile`` into a dense vector on disk.

    :param infile: file written by ``numberize``.
    :param ncol: length of every dense vector.

    For every line a zero vector of length ``ncol`` (dtype ``int16``, so the
    float weights are stored as integers) is filled from the ``idx:val``
    pairs and saved with ``np.save`` under
    ``vq_hist/<c3>/<c4>/<c5>/<track>_BoT``.
    """
    with open(infile, 'rb') as fr:
        for line in fr:
            fields = line.split('\t', 2)
            track_id = fields[0].strip()
            tdir = os.path.join('vq_hist', '/'.join(track_id[2:5]))
            # this folder should already exist
            assert os.path.exists(tdir), 'missing directory: %s' % tdir
            row = np.zeros((ncol, ), dtype=np.int16)
            for pair in fields[-1].strip().split():
                key, val = pair.split(':')
                row[int(key)] = float(val)
            np.save(os.path.join(tdir, track_id + '_BoT'), row)


densify_and_save('tracks_tag_train.num', len(tags))
densify_and_save('tracks_tag_test.num', len(tags))