#!/usr/bin/env python
# coding: utf-8

# # Clustering Musical Artists
#
# *Note: if you are visualizing this notebook directly from GitHub, some mathematical symbols might display incorrectly or not display at all. This same notebook can be rendered from nbviewer by following [this link.](http://nbviewer.jupyter.org/github/david-cortes/datascienceprojects/blob/master/machine_learning/clustering_fm_artists.ipynb)*
#
# This project consists of clustering musical artists using a dataset with the top 50 most-played artists per user for a random sample of ~360,000 Last.fm users, which can be found [here](http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html), based on the idea that artists who are preferred by the same users tend to be similar.

# [1. Loading and cleaning the data](#1)
# * [1.1 Downloading the dataset and generating a sample](#1.1)
# * [1.2 Loading and formatting the data](#1.2)
#
# [2. Establishing Artists' pairwise similarities](#2)
# * [2.1 Generating candidate pairs of artists to compare](#2.1)
# * [2.2 Converting to cosine distances](#2.2)
#
# [3. Clustering Artists](#3)
# * [Additional clustering without Spark](#3.1)
#
# [4. Checking cluster sizes and calculating cluster quality metrics](#4)
# * [4.1 Checking the sizes of the largest clusters for the different algorithms](#4.1)
# * [4.2 Calculating cluster quality metrics](#4.2)
#
# [5. Checking a sample of the results](#5)

# ## Part 1 - Loading and cleaning the data
#
# ### 1.1 - Downloading the dataset and generating a sample

# In[1]:

import os, sys

def download_data():
    import urllib, tarfile
    data_file = urllib.URLopener()
    data_file.retrieve("http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz", "lastfm-dataset-360K.tar.gz")
    data_file = tarfile.open("lastfm-dataset-360K.tar.gz", 'r:gz')
    data_file.extractall()

def generate_sample(file_path=os.getcwd()+'/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv', n=10000):
    with open('small_sample.tsv','w') as out:
        with open(file_path) as f:
            for i in range(n):
                out.write(f.readline())

#download_data()
#generate_sample()

# ### 1.2 - Loading and formatting the data

# In[8]:

import os, json

#Here I'm assuming there's a Spark context already set up under the name 'sc'
# dataset=sc.textFile(os.getcwd()+'/small_sample.tsv')
dataset=sc.textFile(os.getcwd()+'/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv')
dataset=dataset.map(lambda x: x.split('\t')).filter(lambda x: len(x[1])>30) #removing invalid artists (e.g. 'bbc radio')
dataset=dataset.filter(lambda x: x[1]!='89ad4ac3-39f7-470e-963a-56509c546377') #removing 'various artists'

#converting hash values to integers to speed up the analysis
u_list=dataset.map(lambda x: x[0]).distinct().collect()
users=dict()
for v,k in enumerate(u_list):
    users[k]=v
a_list=dataset.map(lambda x: x[1]).distinct().collect()
artists=dict()
for v,k in enumerate(a_list):
    artists[k]=v
n_users=len(u_list)
n_artists=len(a_list)
dataset=dataset.map(lambda x: (users[x[0]],artists[x[1]],x[2])) #the number of plays is not relevant here
dataset.cache()
del u_list, a_list, users, artists

print("There are ", n_users, " different users")
print("There are ", n_artists, " valid artists")

#Generating some useful files for later
#Artists in this dataset can appear under more than one name, so only one name is kept per artist ID
def build_arts_dict(dataset):
    return dict(dataset.map(lambda x: (x[1],x[2])).groupByKey().mapValues(list).mapValues(lambda x: x[0]).map(lambda x: [x[0],x[1]]).collect())

arts_dic=build_arts_dict(dataset)
with open('artists.json', 'w') as outfile:
    json.dump(arts_dic, outfile)
del arts_dic
dataset.cache()
dataset.map(lambda x: (x[0],x[1])).saveAsTextFile('processed_data')
users_per_artist=dict(dataset.map(lambda x: (x[1],x[0])).groupByKey().mapValues(len).map(list).collect())
with open('users_per_artist.json', 'w') as outfile:
    json.dump(users_per_artist, outfile)
del users_per_artist
dataset.unpersist()


# ## Part 2 - Establishing Artists' pairwise similarities
#
# Here I calculate cosine similarities among artists, considering only whether an artist appears in a user's top-played list, regardless of the number of plays. Cosine similarity penalizes according to the frequencies of artists across users, giving less of a penalty to asymmetric frequencies than, for example, Dice similarity. Only pairs with a high-enough similarity are taken into account.
#
# If we assume that each user has roughly 50 top-played artists, and each artist has an equal chance of appearing within any given user's playlist, then the expected cosine similarity between two artists, if everything happened by chance, could be approximated like this:
#
# $$expected.sim=\frac{(\frac{50}{n.artists})^2 \times n.users}{\sqrt{50}^2} \approx 0.0007$$
#
# However, since pairs of artists with such a similarity would likely not belong in the same cluster, it's a better idea to set an arbitrary threshold instead, so as to decrease the number of pairs. A cosine similarity of 0.1 would be equivalent to two artists appearing each in the playlists of 100 users and having 10 users in common; or, in a different case, to one artist appearing in 100 users' playlists and another in 50, having 7 users in common.
#
# In order to decrease the number of artist pairs to evaluate in the clustering part and make it manageable on a single machine, it's better to set a minimum requirement for common users among artists - here I set it to 7, so a pair of artists needs at least 7 users in common in order to be assigned a non-zero similarity; otherwise artists could end up in the same cluster only because a single user heard both - as well as a threshold for the cosine similarity, which I set at 4 times the expected value if everything happened at random, in order for a pair to be considered as having a certain similarity.
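# As a quick sanity check of the numbers above, here is a small sketch (not part of the pipeline) using the approximate counts quoted in the text (~360,000 users and ~160,000 artists; the exact counts are computed in Part 1 and used in Part 2.2):

# In[ ]:

from math import sqrt

#approximate counts quoted above; the exact values are derived in part 1.2
n_users_approx=360000
n_artists_approx=160000

#expected cosine similarity if each top-50 list were filled uniformly at random
expected_sim=((50.0/n_artists_approx)**2)*n_users_approx/(sqrt(50.0)**2)
print(expected_sim)    #~0.0007
print(4*expected_sim)  #the threshold used in part 2.2

#the two cosine-similarity examples mentioned above
print(10/(sqrt(100)*sqrt(100)))  #two artists with 100 users each, 10 in common -> 0.1
print(7/(sqrt(100)*sqrt(50)))    #one artist with 100 users, another with 50, 7 in common -> ~0.099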
# ### 2.1 Generating candidate pairs of artists to compare
#
# This dataset encompasses 360,000 users, each having a list of 50 top played artists, summing up to 160,000 different artists. Trying to establish similarities between all the artists would imply looping through $160,000 \times (160,000-1)/2 \approx 13,000,000,000$ pairs, so it's better to first see which artists have users in common, since of those 13 billion possible pairs, very few have at least one user in common. Doing it this way implies looping over only $360,000 \times 50 \times (50-1)/2 \approx 441,000,000$ pairs, and in the process it's possible to count how many users each pair of artists has in common, which eases the later distance calculations.

# In[2]:

import itertools
from operator import add

def ordered(pair):
    n1,n2=pair
    if n2>n1:
        return n1,n2
    else:
        return n2,n1

#group the artists listened to by each user, then count the users in common for every co-occurring pair
dataset=sc.textFile('processed_data').map(eval).groupByKey().mapValues(list).map(lambda x: x[1])
dataset=dataset.flatMap(lambda x: [(ordered(i),1) for i in itertools.combinations(x,2)]).reduceByKey(add).map(lambda x: (x[0][0],x[0][1],x[1])).filter(lambda x: x[2]>6)
dataset.cache()


# ### 2.2 Converting to cosine distances

# In[3]:

from math import sqrt
import json

#Taken from the previous results
n_artists=160162
n_users=359339

threshold=4* ( ((50.0/n_artists)**2)*n_users/(sqrt(50.0)**2) )

with open('users_per_artist.json') as file:
    users_per_artist=json.load(file)
users_per_artist={int(k):v for k,v in users_per_artist.items()}
bc_dic=sc.broadcast(users_per_artist)
del users_per_artist

#cosine similarity = common users / (sqrt(users of artist 1) * sqrt(users of artist 2))
dataset=dataset.map(lambda x: (x[0],x[1],x[2]*1.0/(sqrt(bc_dic.value[x[0]])*sqrt(bc_dic.value[x[1]])))).filter(lambda x: x[2]>threshold)
dataset.cache()
dataset.saveAsTextFile('sims')
print('There are ',dataset.count(),' non-zero pairwise distances')


# ## Part 3 - Clustering Artists
#
# Here I'll produce different clusterings with 100, 200, 500, 700 and 1000 clusters, using power iteration clustering, which provides similar (though usually slightly inferior) results to spectral clustering but runs faster and is scalable.
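# Before running the full pipeline, here is a minimal toy sketch (not part of the analysis; the `toy_sims` graph below is made up) showing the input that `PowerIterationClustering.train` expects, namely an RDD of `(src, dst, similarity)` tuples plus the desired number of clusters, and how the fitted model exposes its per-node assignments:

# In[ ]:

from pyspark.mllib.clustering import PowerIterationClustering as pic

#a tiny made-up similarity graph: nodes 0-2 are strongly tied together, as are nodes 3-4
toy_sims=sc.parallelize([(0, 1, 0.9), (0, 2, 0.8), (1, 2, 0.9), (3, 4, 0.9), (2, 3, 0.01)])
toy_model=pic.train(toy_sims, 2, maxIterations=10)
print(toy_model.assignments().map(lambda a: (a.id, a.cluster)).collect())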
# In[3]:

from pyspark.mllib.clustering import PowerIterationClustering as pic
import pandas as pd
import json

# dataset=sc.textFile('sims').map(eval)
# dataset.cache()

n_clusters=100
clusters=pic.train(dataset,n_clusters)
clusters.assignments().map(lambda x: str(x[0])+','+str(x[1])).repartition(1).saveAsTextFile('clusts100')
del clusters

n_clusters=200
clusters=pic.train(dataset,n_clusters)
clusters.assignments().map(lambda x: str(x[0])+','+str(x[1])).repartition(1).saveAsTextFile('clusts200')
del clusters

n_clusters=500
clusters=pic.train(dataset,n_clusters)
clusters.assignments().map(lambda x: str(x[0])+','+str(x[1])).repartition(1).saveAsTextFile('clusts500')
del clusters

n_clusters=700
clusters=pic.train(dataset,n_clusters)
clusters.assignments().map(lambda x: str(x[0])+','+str(x[1])).repartition(1).saveAsTextFile('clusts700')
del clusters

n_clusters=1000
clusters=pic.train(dataset,n_clusters)
clusters.assignments().map(lambda x: str(x[0])+','+str(x[1])).repartition(1).saveAsTextFile('clusts1000')
del clusters

#putting together the assignments from all the runs into a single table
dataset=pd.read_csv('clusts100/part-00000',header=None)
dataset.columns=['artist_id','cluster100']
dataset200=pd.read_csv('clusts200/part-00000',header=None)
dataset200.columns=['artist_id','cluster200']
dataset500=pd.read_csv('clusts500/part-00000',header=None)
dataset500.columns=['artist_id','cluster500']
dataset700=pd.read_csv('clusts700/part-00000',header=None)
dataset700.columns=['artist_id','cluster700']
dataset1000=pd.read_csv('clusts1000/part-00000',header=None)
dataset1000.columns=['artist_id','cluster1000']
dataset=dataset.merge(dataset200,how='outer',on='artist_id').merge(dataset500,how='outer',on='artist_id').merge(dataset700,how='outer',on='artist_id').merge(dataset1000,how='outer',on='artist_id')

with open('artists.json') as art:
    artists_dict=json.load(art)
artists_dict={int(k):v for k,v in artists_dict.items()}
dataset['artist_name']=[artists_dict[art] for art in dataset['artist_id']]
dataset.to_csv('results_all.csv',index=False)
del dataset200,dataset500,dataset700,dataset1000
print(dataset.shape[0],' artists were clustered')
dataset.head()


# Since most artists appear in only one or two users' playlists, it's unreliable (and computationally expensive) to cluster them with so little data. That's why only a fraction (around one third) of the artists were considered for the clustering process.
#
# _**Additional:** clustering with scikit-learn (DBSCAN) and igraph (Louvain modularity), both of which are non-parallel. I chose these parameters and algorithms after some manual experimentation, seeing which ones gave a reasonable spread of artists across clusters.
# These algorithms have the nice property of automatically determining the number of clusters._

# In[6]:

#Rearranging the data into a csv-like format
import re

dataset=sc.textFile('sims')
dataset=dataset.map(lambda x: re.sub('[\(\)\s]','',x))
dataset.repartition(1).saveAsTextFile('sims_csv')
dataset.unpersist()
del dataset

# In[4]:

import sklearn, igraph, scipy, scipy.sparse, re
import pandas as pd
import sklearn.cluster

dataset=pd.read_csv('sims_csv/part-00000',header=None)
dataset.columns=['art1','art2','sim']
dataset['dist']=[1-i for i in dataset['sim']]

#re-numbering the artists consecutively, as needed for the sparse matrix and the graph
present_artists=set(dataset['art1'].append(dataset['art2']).values.tolist())
new_numer_art_to_int=dict()
new_numer_int_to_art=dict()
count=0
for art in present_artists:
    new_numer_art_to_int[art]=count
    new_numer_int_to_art[count]=art
    count+=1
del present_artists, count
dataset['art1']=[new_numer_art_to_int[i] for i in dataset['art1']]
dataset['art2']=[new_numer_art_to_int[i] for i in dataset['art2']]

#symmetric sparse distance matrix for DBSCAN
I=dataset['art1'].append(dataset['art2'])
J=dataset['art2'].append(dataset['art1'])
V=dataset['dist'].append(dataset['dist'])
dataset_matrix=scipy.sparse.csr_matrix((V,(I,J)))
del I,J,V
dataset_matrix

dbsc=sklearn.cluster.DBSCAN(eps=0.775,metric='precomputed').fit_predict(dataset_matrix)
new_res=pd.Series(range(dataset_matrix.shape[0])).to_frame()
new_res.columns=['artist_id']
new_res['dbsc']=dbsc
del dbsc, dataset_matrix

#weighted graph for Louvain modularity
g=igraph.Graph(edges=dataset[['art1','art2']].values.tolist(),directed=False)
g.es['weight']=dataset['sim'].values.tolist()
del dataset
louvain_weighted=g.community_multilevel(weights=g.es['weight'])
new_res['louvain']=louvain_weighted.membership
new_res['artist_id']=[new_numer_int_to_art[i] for i in new_res['artist_id']]

results=pd.read_csv('results_all.csv',engine='python')
results=results.merge(new_res,how='left',on='artist_id')
new_res=new_res.merge(results[['artist_id','cluster100','cluster200','cluster500','cluster700','cluster1000']],how='left',on='artist_id')
cols=results.columns.tolist()
cols=cols[6:7]+cols[1:6]+cols[7:9]
results=results[cols]
results.columns=[re.sub('cluster','pic',i) for i in results.columns]
new_res.columns=[re.sub('cluster','pic',i) for i in new_res.columns]
results.to_csv('results_all.csv',index=False)
results.head()


# _Note: a cluster assignment of -1 means that the row was not assigned to any cluster. In DBSCAN most of the artists are not assigned to any cluster, so the clusters it does form should be of better quality._

# ## Part 4 - Checking cluster sizes and calculating cluster quality metrics
#
# ### 4.1 Checking the sizes of the largest clusters for the different algorithms

# In[5]:

sizes=[pd.Series(results[i].value_counts()) for i in results.columns[1:]]
sizes[5]=sizes[5][1:] #for DBSCAN, drop the largest count, which corresponds to the unassigned (-1) rows
for i in range(len(sizes)):
    sizes[i].index=range(len(sizes[i]))
sizes=pd.DataFrame(sizes).transpose()
sizes.columns=results.columns[1:]
sizes.fillna('')


# From these results, it can be seen that 1000 clusters was definitely too many, since many artists ended up in their own cluster.
#
# ### 4.2 Calculating cluster quality metrics
#
# Given the size of this dataset, it's not feasible to calculate typical clustering quality metrics such as the silhouette coefficient or the Dunn index, but some metrics for graph cuts can be used. In this case, I'll use modularity, which can be calculated very efficiently for this dataset. This metric is, however, very sensitive to singleton clusters (clusters of size 1) and favors larger clusters, so in this case it might not be the best criterion for deciding which algorithm did better, but it's a good indicator to get some idea of it. Possible values for modularity range from -0.5 to 1, with higher being better. For DBSCAN, however, this metric isn't comparable to the other algorithms, since most artists are not assigned to any cluster.
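# For reference, the (weighted) graph modularity used below is the standard quantity
#
# $$Q=\frac{1}{2m}\sum_{i,j}\left(A_{ij}-\frac{k_i k_j}{2m}\right)\delta(c_i,c_j)$$
#
# where $A_{ij}$ is the weight of the edge between artists $i$ and $j$, $k_i$ is the weighted degree of artist $i$, $m$ is the total edge weight, and $\delta(c_i,c_j)$ equals 1 when both artists are assigned to the same cluster and 0 otherwise.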
# In[6]:

import numpy as np

print('Modularity for Power Iteration Clustering with 100 clusters :',g.modularity(membership=new_res['pic100'],weights=g.es['weight']))
print('Modularity for Power Iteration Clustering with 200 clusters :',g.modularity(membership=new_res['pic200'],weights=g.es['weight']))
print('Modularity for Power Iteration Clustering with 500 clusters :',g.modularity(membership=new_res['pic500'],weights=g.es['weight']))
print('Modularity for Power Iteration Clustering with 700 clusters :',g.modularity(membership=new_res['pic700'],weights=g.es['weight']))
print('Modularity for Power Iteration Clustering with 1000 clusters :',g.modularity(membership=new_res['pic1000'],weights=g.es['weight']))
print('Modularity for Louvain Modularity (',len(set(louvain_weighted.membership)),'clusters) :',louvain_weighted.modularity)
print()
print("Results for DBSCAN:")
print("Number of clusters: ",len(set(results['dbsc']))-1)
print("Number of artists belonging to a cluster: ",len(results['dbsc'].loc[results['dbsc']!=-1]))

del g, louvain_weighted, new_res, new_numer_art_to_int, new_numer_int_to_art


# Graph modularity seems to suggest that, in this case, power iteration clustering did far worse than Louvain modularity, but as mentioned before, this metric might not be the most adequate one here, and it doesn't necessarily mean that the results are invalid. Good knowledge of the music industry and manual examination of the clusters would be needed to tell. Moreover, the clusterings obtained from power iteration clustering also come at different levels of granularity (according to the number of clusters).

# ## Part 5 - Checking a sample of the results
#
# Here I'll check some of the medium-sized clusters obtained from DBSCAN, which should be of better quality than the ones obtained from the other algorithms, since DBSCAN only assigned a fraction of the artists to a cluster.

# In[7]:

#picking five medium-sized clusters and listing the artists in each
clusters=results['dbsc'].value_counts()[10:15].index
clusters=[pd.DataFrame(results[['artist_name']].loc[results['dbsc']==i]) for i in clusters]
for i in range(len(clusters)):
    clusters[i].index=range(len(clusters[i]))
clusters=clusters[0].merge(clusters[1],left_index=True, right_index=True).merge(clusters[2],left_index=True, right_index=True).merge(clusters[3],left_index=True, right_index=True).merge(clusters[4],left_index=True, right_index=True)
clusters.columns=['cluster'+str(i) for i in range(1,len(clusters.columns)+1)]
clusters


# Trying to interpret the clusters:
# * The first cluster seems to contain mostly Turkish musicians of popular and folk music.
# * The second cluster seems to contain mostly Indian-Punjabi musicians, also of popular and folk music.
# * The third cluster seems to contain mostly German-speaking singers of pop and songs from movies.
# * The fourth cluster seems to contain mostly small Eastern European artists of alternative rock and indie music.
# * The fifth cluster seems to contain mostly country artists from the US.