DAT 13 Code Notebook - Matt Lentz¶

Data Acquisition¶

In [ ]:

from bs4 import BeautifulSoup
import urllib2
from datetime import datetime
import pandas as pd
import json


data = pd.DataFrame(columns=('date','rank','uri'))
data2 = pd.DataFrame(columns=('title','artist','popularity','uri'))


for year in range(1960,2015):
    for month in range(1,13):
        if month<10:
            month = str(0)+str(month)
        for day in range(1,32):
            if day < 10:
                day = str(0)+str(day)
            date = str(month)+'-'+str(day)+'-'+str(year)
            try:
                date1 = datetime.strptime(date, '%m-%d-%Y')
                if datetime.weekday(date1) == 5:
                    try:            
                        link = urllib2.urlopen('http://www.billboard.com/charts/%s-%s-%s/hot-100' % (year,month,day))
                        soup = BeautifulSoup(link)
                        spotsoup1 = soup.findAll('a')
                        splitsoup1 = str(spotsoup1[58]).split('\"')
                        splitsoup2= splitsoup1[3].split(':')
                        splitsoup3 = splitsoup2[2].split(',')

                        for i in range(40):
                            dic = {}
                            dic['date'] = date
                            dic['rank'] = str(i+1)
                            dic['uri'] = str(splitsoup3[i])
                            data = data.append(dic,ignore_index=True)
                            
                    except TypeError:
                        pass
            except ValueError:
                pass

In [ ]:

spoturl = 'http://ws.spotify.com/lookup/1/.json?uri=spotify:track:'

uri_list = data['uri'].unique()

for i in uri_list:
    dic2 = {}
    url = spoturl + str(i)
    response = urllib2.urlopen(url)
    json_object = json.load(response)
    dic2['title'] = json_object['track']['name']
    dic2['artist'] = json_object['track']['artists'][0]['name']
    dic2['popularity'] = json_object['track']['popularity']
    dic2['uri'] = i
    data2 = data2.append(dic2,ignore_index=True)
    
project_data = pd.merge(data,data2,on='uri')
project_data['rank'] = project_data['rank'].astype('int')
project_data['popularity'] = project_data['popularity'].astype('float')
project_data.sort(['date','rank'])
project_data.head()

In [ ]:

data3 = pd.DataFrame(columns=('danceability','duration','energy','instrumentalness','key','liveness',
                              'loudness','speechiness','tempo','time_signature','uri'))

#echonest data
echourl = 'http://developer.echonest.com/api/v4/track/profile?api_key=API=json&id=spotify:track:'
echourl2 ='&bucket=audio_summary'



for i in uri_list:
    try:
        uri = i
        url = echourl+uri+echourl2
        dic3 = {}
        response = urllib2.urlopen(url)
        json_object = json.load(response)
        dic3['danceability'] = json_object['response']['track']['audio_summary']['danceability']
        dic3['duration'] = json_object['response']['track']['audio_summary']['duration']
        dic3['energy'] = json_object['response']['track']['audio_summary']['energy']
        dic3['instrumentalness'] = json_object['response']['track']['audio_summary']['instrumentalness']
        dic3['key'] = json_object['response']['track']['audio_summary']['key']
        dic3['liveness'] = json_object['response']['track']['audio_summary']['liveness']
        dic3['loudness'] = json_object['response']['track']['audio_summary']['loudness']
        dic3['speechiness'] = json_object['response']['track']['audio_summary']['speechiness']
        dic3['tempo'] = json_object['response']['track']['audio_summary']['tempo']
        dic3['time_signature'] = json_object['response']['track']['audio_summary']['time_signature']
        dic3['uri'] = i
        data3 = data3.append(dic3,ignore_index=True)
    except KeyError:
        pass

In [ ]:

song_data=pd.merge(project_data,data3,on='uri')
song_data.to_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')

In [ ]:

spoturl = 'http://ws.spotify.com/lookup/1/.json?uri=spotify:track:'

for i in range(8033,11115):
    dic = {}
    url = spoturl + str(uri[i])
    response = urllib2.urlopen(url)
    json_object = json.load(response)
    dic['artist_uri'] = json_object['track']['artists'][0]['href']
    dic['uri'] = uri[i]
    data = data.append(dic,ignore_index=True)

In [ ]:

echourl = 'http://developer.echonest.com/api/v4/artist/terms?api_key=API&id='
echour2 = '&format=json'

num = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


for i in range(3414,3421):
    try:
        uri = artist[i]
        url = echourl+uri+echour2
        dic = {}
        dic2 = {}
        response = urllib2.urlopen(url)
        json_object = json.load(response)
        dic['genre'] = json_object['response']['terms']
        term_rank = pd.DataFrame(columns=['frequency','name','weight'])
        dic2['artist_uri'] = artist[i]
        
        for genre in dic['genre']:
            term_rank = term_rank.append(genre,ignore_index=True)
        term_rank = (term_rank.sort('frequency',ascending=False)).reset_index()
        if term_rank['name'][0]=='pop':
            if term_rank['name'][1][0] in num:
                dic2['genre'] = term_rank['name'][2]
            else:
                dic2['genre'] = term_rank['name'][1]
        elif term_rank['name'][0][0] in num:
            if term_rank['name'][1]=='pop':
                dic2['genre'] = term_rank['name'][2]
            else:
                dic2['genre'] = term_rank['name'][1]
        else:
            dic2['genre'] = term_rank['name'][0]
        genre_list = genre_list.append(dic2,ignore_index=True)
    except IndexError:
        pass

Project 1¶

In [2]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy
import seaborn as sns

songs = pd.read_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')
songs.head()

Out[2]:

	date	rank	uri	title	artist	popularity	danceability	duration	energy	instrumentalness	key	liveness	loudness	speechiness	tempo	time_signature
0	1/2/1960	1	3hvakqVpwaz4L7zN5HfTCY	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4
1	1/9/1960	2	3hvakqVpwaz4L7zN5HfTCY	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4
2	1/16/1960	2	3hvakqVpwaz4L7zN5HfTCY	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4
3	1/23/1960	2	3hvakqVpwaz4L7zN5HfTCY	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4
4	1/30/1960	3	3hvakqVpwaz4L7zN5HfTCY	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4

In [2]:

#Convert dtype for date

songs['date'] = pd.to_datetime(songs['date'],format='%m/%d/%Y')

In [3]:

songs.describe()

Out[3]:

	rank	popularity	danceability	duration	energy	instrumentalness	key	liveness	loudness	speechiness	tempo	time_signature
count	114422.000000	114422.000000	114422.000000	114422.000000	114422.000000	1.144220e+05	114422.000000	114422.000000	114422.000000	113607.000000	114422.000000	114422.000000
mean	20.497098	0.399589	0.611887	231.943356	0.630440	3.373952e-02	5.228592	0.190379	-8.468298	0.066200	120.196028	3.951277
std	11.545553	0.214628	0.150314	61.443767	0.194966	1.388380e-01	3.560654	0.163452	3.525335	0.070969	27.585767	0.273730
min	1.000000	0.000000	0.068750	61.266210	0.020085	4.090000e-15	0.000000	0.013549	-41.613000	0.022336	41.409000	1.000000
25%	10.000000	0.250000	0.514386	192.746210	0.490307	9.110000e-08	2.000000	0.088227	-10.689000	0.031694	99.948750	4.000000
50%	20.000000	0.420000	0.623672	229.559550	0.648759	8.550000e-06	5.000000	0.128692	-7.996000	0.040176	118.972500	4.000000
75%	31.000000	0.560000	0.717292	263.372880	0.788447	7.380170e-04	8.000000	0.243539	-5.803000	0.062878	134.682500	4.000000
max	40.000000	1.000000	0.984268	1367.092880	0.995899	9.702722e-01	11.000000	0.990848	-0.073000	0.891714	217.748000	5.000000

In [4]:

fig = plt.figure(figsize =(30,25))
ax1 = fig.add_subplot(2,2,1)
ax1.hist(songs.popularity)
ax1.set_xlabel('Popularity',fontsize=20)
ax1.set_title('Histogram of Current Popularity Ratings',fontsize =25)
ax2 = fig.add_subplot(2,2,2)
ax2.set_ylim(-.1,1.1)
ax2.set_ylabel('Popularity',fontsize=20)
ax2.boxplot(songs.popularity)
ax2.set_title('Boxplot of Current Popularity Ratings',fontsize =25)
ax3 = fig.add_subplot(2,2,3)
ax3 = sns.violinplot(songs.popularity)
ax3.set_ylabel('Popularity',fontsize=20)
ax3.set_title('Violin Plot of Current Popularity Ratings',fontsize =25)
ax4 = fig.add_subplot(2,2,4)
ax4 = songs.popularity.plot(kind='kde')
ax4.set_title('KDE Plot of Current Popularity Ratings',fontsize =25)
ax4.set_ylabel('Popularity',fontsize=20)
fig.suptitle('Distributions of Current Popularity Ratings',fontsize =30)

Out[4]:

<matplotlib.text.Text at 0x1662ef28>

In [5]:

pd.scatter_matrix(songs[['popularity','danceability','duration','tempo','loudness']],figsize=(20,20))
plt.suptitle('Scatterplot Matrix of Song Attributes')

Out[5]:

<matplotlib.text.Text at 0x9ca5588>

In [6]:

#What song is the duration outlier?
big = songs[['title','artist']][songs['duration']>1000]
big.drop_duplicates()

Out[6]:

	title	artist
32217	Autobahn - 2009 Remastered Version	Kraftwerk

In [7]:

#What songs currently have the highest popularity?

hits = songs[['title','artist','popularity']][songs['popularity']>.95]
hits.drop_duplicates()
#Recent songs appear to have the highest popularity scores in the set.

Out[7]:

	title	artist	popularity
113442	All of Me	John Legend	0.96
113739	Summer	Calvin Harris	0.97
113807	A Sky Full Of Stars	Coldplay	0.96
114084	Shower	Becky G	0.97
114150	All About That Bass	Meghan Trainor	1.00
114231	Don't	Ed Sheeran	0.96

In [8]:

#What songs currently have the highest popularity?

hits = songs[['title','artist','popularity']][songs['popularity']>.95]
hits.drop_duplicates()
#Recent songs appear to have the highest popularity scores in the set.

Out[8]:

	title	artist	popularity
113442	All of Me	John Legend	0.96
113739	Summer	Calvin Harris	0.97
113807	A Sky Full Of Stars	Coldplay	0.96
114084	Shower	Becky G	0.97
114150	All About That Bass	Meghan Trainor	1.00
114231	Don't	Ed Sheeran	0.96

In [9]:

songs_1 = songs[songs['rank'] == 1]
songs_10 = songs[songs['rank'] == 10]
songs_20 = songs[songs['rank'] == 20]
songs_40 = songs[songs['rank'] == 40]

In [10]:

songs_1.plot(x='date',y='popularity',figsize=(25,8))
plt.suptitle('Timeseries of Popularity for #1 Songs')
plt.ylabel('Current Popularity')

Out[10]:

<matplotlib.text.Text at 0x195f83c8>

In [11]:

fig = plt.figure(figsize =(30,25))
fig.suptitle('Timeseries Comparison for #1s, #10s, and #40s')
ax1 = fig.add_subplot(3,1,1)
ax1.set_title('#1s',fontsize = 20)
ax1 = songs_1.plot(x='date',y='popularity')
ax1.set_ylabel('Popularity')
ax2 = fig.add_subplot(3,1,2)
ax2.set_title('#10s',fontsize = 20)
ax2.set_ylabel('Popularity')
ax2 = songs_10.plot(x='date',y='popularity')
ax3 = fig.add_subplot(3,1,3)
ax3.set_title('#40s',fontsize = 20)
ax3.set_ylabel('Popularity')
ax3 = songs_40.plot(x='date',y='popularity')

Project 2¶

In [ ]:

#Get songs' highest rank and the last date it achieved that rank

unique = songs.uri.unique()
top_rank = pd.DataFrame(columns=['uri','rank','date'])


ranker = {}
for i in unique:
    ranker['uri'] = i
    unique_song = songs[songs.uri == i]
    min_rank = unique_song[unique_song['rank'] == unique_song['rank'].min()]
    min_rank = min_rank.reset_index()
    ranker['rank'] = min_rank['rank'][0]
    max_date = min_rank[min_rank['date']==min_rank['date'].max()]
    max_date = max_date.reset_index()
    ranker['date'] = max_date['date'][0]
    top_rank = top_rank.append(ranker,ignore_index=True)

top_rank.head()

In [12]:

top_songs = pd.read_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project/top_rank.csv')

In [15]:

top_songs['date'] = pd.to_datetime(top_songs['date'])

In [23]:

top_songs['time'] = top_songs.date.max() - top_songs.date
top_songs['days'] = (top_songs.time /np.timedelta64(1, 'D')).astype(int)
top_songs = top_songs.drop('Unnamed: 0',axis=1)
top_songs.head()

Out[23]:

	uri	rank	date	time	days
0	3hvakqVpwaz4L7zN5HfTCY	1	1960-01-02	19992 days	19992
1	4f8hBeMXMvssn6HtFAtblo	1	1960-01-16	19978 days	19978
2	1XRXD6RmgxtySaKbrVBfzk	3	1960-01-09	19985 days	19985
3	22TgqnP9tyLU8i0eZnbuMR	4	1960-01-02	19992 days	19992
4	6TdAcAgVw7Z8pzU2KswtvH	3	1960-01-16	19978 days	19978

In [27]:

top_songs = pd.merge(songs,top_songs,on='uri')
import matplotlib.pyplot as plt
features = ['danceability','duration','energy','instrumentalness','key','liveness','loudness','tempo','time_signature','days']

pd.scatter_matrix(top_songs[features],figsize=(20,20))

Out[27]:

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001FA5A2E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000221B8438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023EDA320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001FBB9780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022CDE470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001FB75400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023B4F5C0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023A932B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023DECA58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022D50748>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000022E37160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023F41E80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023F5F550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000240C5668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002417F0F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024211BA8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024317898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024287F98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024479AC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000245D26D8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000246B4908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000247795F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024805E48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024914978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000249A7208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025C60E10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025D24978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025E08518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025F0E208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025E3AA90>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000026072438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026179F60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026229278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026320F28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000263767B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000264D92E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002658FB38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002667F780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000267842E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000267B7710>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000268FFEF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026A1B780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026AC8320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026B9DEF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026C83BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026D458D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026D03048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026F2CB00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026FAD668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000270A2940>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000271A7630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000271F0E80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000273429B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027459240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000274F8E48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000275BB9B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000276A0550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000277A5240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000276D3940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027929470>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000027A20F98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027B292B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027C1FF60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027D0B7F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027DDB320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027EF3B70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027F9F7B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000280A6320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000280E99E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000281FDF60>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000028314710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000283C05C0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000284C7D30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002857CDA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002863D8D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000285F0A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028836C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002893D828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000289DFBE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028AED470>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000028BD3080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028CCCB70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028DE0400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028ED0EB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028F97AC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002905B710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029160400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002908CB00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029316630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000293E9198>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000294CE470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000295D4160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002965E9B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000002976F4E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029826D30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000298E1978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000299FA4E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029B3A080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029BEED30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029B6B470>]], dtype=object)

In [28]:

plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = subplot(5,2,v)
    ax1.hist(top_songs[features[i]])
    ax1.set_title(str(features[i]),fontsize=15)

In [30]:

plt.hist(top_songs['popularity'])
plt.suptitle('Top Popularity')

Out[30]:

<matplotlib.text.Text at 0x36e3e390>

In [31]:

the_1s = songs[songs['rank']==1]
plt.figure(figsize=(20,10))
the_1s.plot(x='date',y='popularity')
plt.suptitle('Current Popularity Ratings for #1 songs 1960-2014',fontsize = 25)

Out[31]:

<matplotlib.text.Text at 0x38318828>

In [32]:

from sklearn import linear_model
clf = linear_model.LinearRegression()

In [33]:

X = top_songs[features]
X = X.values

In [34]:

y = top_songs['popularity']
y = y.values

In [35]:

from sklearn.cross_validation import train_test_split
xtrain,xtest,ytrain,ytest =  train_test_split(X, y)

In [36]:

model = clf.fit(xtrain,ytrain)

In [37]:

model.score(xtrain,ytrain)

Out[37]:

0.32966268207020744

In [38]:

model.score(xtest,ytest)

Out[38]:

0.32361192116190363

In [39]:

pd.DataFrame(zip(features,model.coef_.T),columns=['Variable','Coefficient'])

Out[39]:

	Variable	Coefficient
0	danceability	0.001357
1	duration	0.000056
2	energy	-0.046787
3	instrumentalness	-0.113890
4	key	-0.001034
5	liveness	-0.057652
6	loudness	0.007267
7	tempo	-0.000007
8	time_signature	0.003422
9	days	-0.000018

In [40]:

from sklearn import feature_selection
f = feature_selection.f_regression(X,y)
pd.DataFrame(zip(features,f[1].T),columns=['Variable','P-Value'])

Out[40]:

	Variable	P-Value
0	danceability	0.000000e+00
1	duration	0.000000e+00
2	energy	0.000000e+00
3	instrumentalness	0.000000e+00
4	key	5.298700e-01
5	liveness	9.832641e-152
6	loudness	0.000000e+00
7	tempo	3.581412e-02
8	time_signature	8.023966e-199
9	days	0.000000e+00

In [41]:

new_features = ['danceability','duration','energy','instrumentalness','liveness','loudness','time_signature']

In [42]:

X = top_songs[new_features]
X = X.values

In [43]:

y = top_songs['popularity']
y = y.values

In [44]:

xtrain,xtest,ytrain,ytest =  train_test_split(X, y)

In [45]:

model = clf.fit(xtrain,ytrain)

In [46]:

model.score(xtrain,ytrain)

Out[46]:

0.18319207407832316

In [47]:

model.score(xtest,ytest)

Out[47]:

0.18412613746261552

In [48]:

pd.DataFrame(zip(new_features,model.coef_.T),columns=['Variable','Coefficient'])

Out[48]:

	Variable	Coefficient
0	danceability	0.111771
1	duration	0.000597
2	energy	-0.099886
3	instrumentalness	-0.146259
4	liveness	-0.084151
5	loudness	0.023090
6	time_signature	0.020949

In [49]:

pd.DataFrame(zip(new_features,f[1].T),columns=['Variable','P-Value'])

Out[49]:

	Variable	P-Value
0	danceability	0.000000e+00
1	duration	0.000000e+00
2	energy	0.000000e+00
3	instrumentalness	0.000000e+00
4	liveness	5.298700e-01
5	loudness	9.832641e-152
6	time_signature	0.000000e+00

In [50]:

features = ['danceability','duration','energy','instrumentalness','key','liveness','loudness','tempo','time_signature','days']
plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = subplot(5,2,v)
    ax1.scatter(top_songs[features[i]],top_songs['popularity'])
    ax1.set_title(str(features[i]),fontsize=15)

In [51]:

newer_features = ['days','loudness']

In [52]:

X = top_songs[newer_features]
X = X.values

In [53]:

y = top_songs['popularity']
y = y.values

In [54]:

model = clf.fit(xtrain,ytrain)

In [55]:

model.score(xtrain,ytrain)

Out[55]:

0.18319207407832316

In [56]:

model.score(xtest,ytest)

Out[56]:

0.18412613746261552

In [57]:

pd.DataFrame(zip(newer_features,model.coef_.T),columns=['Variable','Coefficient'])

Out[57]:

	Variable	Coefficient
0	days	0.111771
1	loudness	0.000597

In [58]:

pd.DataFrame(zip(newer_features,f[1].T),columns=['Variable','P-Value'])

Out[58]:

	Variable	P-Value
0	days	0
1	loudness	0

Final Project¶

In [3]:

songs = pd.read_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project/song_genre.csv')

In [4]:

songs['log_speech'] = np.log(songs['speechiness'])
songs['log_speech'] = songs['log_speech'] - songs['log_speech'].min() 
songs['log_instrument'] = np.log(songs['instrumentalness'])
songs['log_instrument'] = songs['log_instrument'] - songs['log_instrument'].min()
songs.head()

Out[4]:

	artist_uri	genre	uri	date	rank	title	artist	popularity	danceability	duration	energy	instrumentalness	key	liveness	loudness	speechiness	tempo	time_signature	log_speech	log_instrument
0	spotify:artist:5zNOI87gG4RttFmYAZWaxQ	rock	3hvakqVpwaz4L7zN5HfTCY	1/2/1960	1	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4	0.189548	14.067097
1	spotify:artist:5zNOI87gG4RttFmYAZWaxQ	rock	3hvakqVpwaz4L7zN5HfTCY	1/9/1960	2	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4	0.189548	14.067097
2	spotify:artist:5zNOI87gG4RttFmYAZWaxQ	rock	3hvakqVpwaz4L7zN5HfTCY	1/16/1960	2	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4	0.189548	14.067097
3	spotify:artist:5zNOI87gG4RttFmYAZWaxQ	rock	3hvakqVpwaz4L7zN5HfTCY	1/23/1960	2	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4	0.189548	14.067097
4	spotify:artist:5zNOI87gG4RttFmYAZWaxQ	rock	3hvakqVpwaz4L7zN5HfTCY	1/30/1960	3	Why	Frankie Avalon	0.27	0.422345	155.23955	0.409041	5.260000e-09	5	0.112646	-8.54	0.026997	94.986	4	0.189548	14.067097

In [5]:

genres = pd.DataFrame(songs.genre.value_counts())

In [6]:

genres = songs['genre'].unique()

In [7]:

genres

Out[7]:

array(['rock', 'country', "rock 'n roll", 'swing', 'jazz', 'choral music',
       'r&b', 'soul', 'easy listening', 'male vocalist', 'doo-wop',
       'ballad', 'rockabilly', 'blues', 'traditional pop', 'folk', 'dance',
       'christian', 'break', 'instrumental', 'punk', 'polka', 'symphony',
       'comedy', 'classic rock', '50s', 'adult contemporary', 'pop rock',
       'brill building pop', 'soundtrack', 'latin', 'club', 'big band',
       'ska', 'post-hardcore', 'reggae', 'tejano', '60s pop', '70s',
       'lounge', 'singer-songwriter', 'electronic', 'funk', 'sunshine pop',
       'house', 'hip hop', 'psychedelic', 'psychedelic rock', 'metal',
       'northern soul', 'electronica', 'soft rock', 'new age', 'disco',
       'folk rock', 'garage rock', 'motown', 'power pop', 'romantic',
       'classical', 'ccm', 'new wave', 'eurodance', 'indie', 'synthpop',
       'rap', 'boy band', '80s', 'dance pop', 'techno', 'hardcore',
       'experimental', 'alternative', 'teen pop', 'gospel', 'trance',
       'soca', 'crunk', 'opera', 'remix', 'indie pop'], dtype=object)

In [8]:

genre_list = ['rock','r&b','country','soul','hip hop','dance','rap','jazz','folk','disco','funk',
              'reggae','latin','house','electronic','blues','metal']

In [9]:

song_genre = songs[songs['genre'].isin(genre_list)]

In [10]:

song_genre['date'] = pd.to_datetime(song_genre['date'],format='%m/%d/%Y')

-c:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead

In [11]:

genre_dummy = pd.get_dummies(song_genre['genre'])

In [12]:

song_date = song_genre['date']

In [13]:

song_date = genre_dummy.join(song_date)

In [14]:

song_group = song_date.groupby('date').agg('sum')

In [15]:

song_group.plot(kind='area',figsize=(30,18),colormap ='Paired')
plt.legend(loc=2,fontsize=15,ncol=3,markerscale=100)
plt.suptitle('Billboard Top 40 by Genre (Jan 1960 - Sep 2014)',fontsize=30)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.xlabel('date',fontsize=25)

Out[15]:

<matplotlib.text.Text at 0x19708860>

In [16]:

rock = song_genre[song_genre['genre']=='rock']

In [17]:

def decade(date):
    year = str(date).split('-')[0]
    last_num = year[3]
    decade = int(year) - int(last_num)
    return decade

In [18]:

rock['decade'] = rock['date'].apply(decade)

In [19]:

rock = rock.drop(['rank','date'],axis=1)

In [20]:

rock = rock.drop_duplicates()

In [21]:

rock = rock.reset_index(drop=True)

In [22]:

features = ['danceability','duration','energy','key','liveness','loudness','tempo','time_signature','instrumentalness','speechiness']

In [23]:

plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = subplot(5,2,v)
    ax1.hist(rock[features[i]])
    ax1.set_title(str(features[i]),fontsize=15)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-35224f1c7e8c> in <module>()
      3     v = i +1
      4     ax1 = subplot(5,2,v)
----> 5     ax1.hist(rock[features[i]])
      6     ax1.set_title(str(features[i]),fontsize=15)

C:\Users\Matt\Anaconda\lib\site-packages\matplotlib\axes\_axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
   5650             # this will automatically overwrite bins,
   5651             # so that each histogram uses the same bins
-> 5652             m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
   5653             m = m.astype(float)  # causes problems later if it's an int
   5654             if mlast is None:

C:\Users\Matt\Anaconda\lib\site-packages\numpy\lib\function_base.pyc in histogram(a, bins, range, normed, weights, density)
    163         if (mn > mx):
    164             raise AttributeError(
--> 165                 'max must be larger than min in range parameter.')
    166 
    167     if not iterable(bins):

AttributeError: max must be larger than min in range parameter.

In [24]:

features = ['danceability','duration','energy','key','liveness','loudness','tempo','time_signature','log_speech','log_instrument']
plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = rock.boxplot(features[i],by='decade',ax=subplot(5,2,v))
    ax1.set_title(str(features[i]),fontsize=15)
    ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Decade',size=20)

Out[24]:

<matplotlib.text.Text at 0x21d711d0>

In [25]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split

In [26]:

rock = rock.dropna()

In [27]:

rock['loudness'] = rock['loudness']-rock['loudness'].min()

In [28]:

X = rock[features]
Y = rock['decade']

In [29]:

xtrain,xtest,ytrain,ytest = train_test_split(X,Y)

In [30]:

mnb = MultinomialNB().fit(xtrain,ytrain)

In [31]:

print 'Train: ', mnb.score(xtrain,ytrain)
print 'Test: ', mnb.score(xtest, ytest)

Train:  0.396097046414
Test:  0.384493670886

In [32]:

from sklearn import metrics

In [33]:

y_pred = mnb.predict(X)

In [34]:

decades = ['1960','1970','1980','1990','2000','2010']
print "Confusion Matrix"
matrix = pd.DataFrame(metrics.confusion_matrix(Y,y_pred),columns=decades,index=decades)
matrix

Confusion Matrix

Out[34]:

	1960	1970	1980	1990	2000	2010
1960	954	186	65	36	38	11
1970	394	309	250	236	25	3
1980	206	321	394	335	39	10
1990	62	98	168	228	30	7
2000	107	34	87	79	73	26
2010	55	36	34	28	62	30

In [35]:

plt.hist(ytrain)

Out[35]:

(array([ 992.,    0.,  911.,    0.,  955.,    0.,  442.,    0.,  310.,  182.]),
 array([ 1960.,  1965.,  1970.,  1975.,  1980.,  1985.,  1990.,  1995.,
         2000.,  2005.,  2010.]),
 <a list of 10 Patch objects>)

In [36]:

new_features = ['liveness','loudness','energy','log_speech','danceability','log_instrument']

In [37]:

X = rock[new_features]

In [38]:

X = rock[new_features]

In [39]:

xtrain,xtest,ytrain,ytest = train_test_split(X,Y)

In [40]:

mnb = MultinomialNB().fit(xtrain,ytrain)

In [41]:

print 'Train: ', mnb.score(xtrain,ytrain)
print 'Test: ', mnb.score(xtest, ytest)

Train:  0.314609704641
Test:  0.340189873418

In [42]:

from sklearn.ensemble import RandomForestClassifier

In [43]:

X = rock[features]
Y = rock['decade']
xtrain,xtest,ytrain,ytest = train_test_split(X,Y)

In [44]:

RFC = RandomForestClassifier(n_estimators=800).fit(xtrain,ytrain)

In [45]:

print 'Train: ', RFC.score(xtrain,ytrain)
print 'Test: ', RFC.score(xtest, ytest)

Train:  0.985232067511
Test:  0.549841772152

In [46]:

from sklearn.metrics import classification_report

In [47]:

y_pred = RFC.predict(xtest)
decades = ['1960','1970','1980','1990','2000','2010']
print 'Train: ', RFC.score(xtrain,ytrain)
print 'Test: ', RFC.score(xtest, ytest)
print matrix, '\n'
print 'Classification Report'
print classification_report(ytest,y_pred,target_names=decades),"\n"

Train:  0.985232067511
Test:  0.549841772152
      1960  1970  1980  1990  2000  2010
1960   954   186    65    36    38    11
1970   394   309   250   236    25     3
1980   206   321   394   335    39    10
1990    62    98   168   228    30     7
2000   107    34    87    79    73    26
2010    55    36    34    28    62    30 

Classification Report
             precision    recall  f1-score   support

       1960       0.77      0.84      0.80       324
       1970       0.46      0.41      0.43       299
       1980       0.51      0.67      0.58       325
       1990       0.27      0.10      0.15       139
       2000       0.41      0.49      0.45       113
       2010       0.48      0.22      0.30        64

avg / total       0.53      0.55      0.53      1264

In [48]:

genre = ['r&b','hip hop','country']

In [49]:

genre_subset = songs[songs['genre'].isin(genre)]

In [50]:

genre_subset = genre_subset.drop(['date','rank'],axis=1)
genre_subset = genre_subset.drop_duplicates()

In [51]:

plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = genre_subset.boxplot(features[i],by='genre',ax=subplot(5,2,v))
    ax1.set_title(str(features[i]),fontsize=15)
    ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Genre',size=20)

Out[51]:

<matplotlib.text.Text at 0x2fec4908>

In [52]:

genre_subset = genre_subset.dropna()
X = genre_subset[features]

In [53]:

def genre_num(lst):
    index = genre.index(lst)
    return index

In [54]:

genre_subset['genre_num'] = genre_subset['genre'].apply(genre_num)

In [55]:

genre_subset = genre_subset.dropna()
X = genre_subset[features]
Y = genre_subset['genre_num']
xtrain,xtest,ytrain,ytest = train_test_split(X,Y)

In [56]:

rf = RandomForestClassifier(n_estimators=800).fit(xtrain,ytrain)

In [57]:

print 'Train: ', rf.score(xtrain,ytrain)
print 'Test: ', rf.score(xtest, ytest)

Train:  1.0
Test:  0.729691876751

In [58]:

print "Confusion Matrix"
y_pred = rf.predict(xtest)
matrix = pd.DataFrame(metrics.confusion_matrix(ytest,y_pred),columns=genre,index=genre)
matrixgenre = ['r&b','hip hop','country','rock']
features = ['danceability','energy','liveness','time_signature','log_speech','log_instrument']

Confusion Matrix

Out[58]:

	r&b	hip hop	country
r&b	211	26	59
hip hop	42	83	4
country	60	2	227

In [59]:

genre = ['r&b','hip hop','country','rock']
features = ['danceability','energy','liveness','time_signature','log_speech','log_instrument']

In [60]:

genre_subset = songs[songs['genre'].isin(genre)]

In [61]:

genre_subset = genre_subset.drop(['date','rank'],axis=1)
genre_subset = genre_subset.drop_duplicates()

In [62]:

plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = genre_subset.boxplot(features[i],by='genre',ax=subplot(5,2,v))
    ax1.set_title(str(features[i]),fontsize=15)
    ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Genre',size=20)

Out[62]:

<matplotlib.text.Text at 0x383acc88>

In [63]:

for i in genre:
    cluster_genre = genre_subset[genre_subset['genre']==i]
    print 'Top 10 Songs for Genre: %s' %i
    print
    print cluster_genre[['title','artist','popularity']].sort('popularity',ascending=False).head(10)
    print

Top 10 Songs for Genre: r&b

                            title         artist  popularity
98244                   All of Me    John Legend        0.96
109074                  Bang Bang       Jessie J        0.91
107679  Wiggle (feat. Snoop Dogg)   Jason Derulo        0.90
111827     Me And My Broken Heart         Rixton        0.88
95037               Drunk in Love        Beyonc�        0.88
111037                 Break Free  Ariana Grande        0.87
111017                    Problem  Ariana Grande        0.86
106012             Don't Tell 'Em        Jeremih        0.85
111970                       2 On        Tinashe        0.82
100347                  New Flame    Chris Brown        0.82

Top 10 Songs for Genre: hip hop

                                   title                   artist  popularity
111810                           Classic                     MKTO        0.92
111660                Turn Down for What                 DJ Snake        0.91
111802                       Black Widow              Iggy Azalea        0.90
111792                             Fancy              Iggy Azalea        0.89
110791  Can't Hold Us - feat. Ray Dalton  Macklemore & Ryan Lewis        0.89
95283                              Happy        Pharrell Williams        0.87
96674                             Timber                  Pitbull        0.86
95318                    Come Get It Bae        Pharrell Williams        0.86
111779                             Fancy              Iggy Azalea        0.84
96708                     Wild Wild Love                  Pitbull        0.84

Top 10 Songs for Genre: country

                      title                artist  popularity
101556      Burnin' It Down          Jason Aldean        0.85
110642                 Dirt  Florida Georgia Line        0.84
85629         American Kids         Kenny Chesney        0.83
110624  This Is How We Roll  Florida Georgia Line        0.81
107489        Play It Again            Luke Bryan        0.80
95474      Drunk On A Plane        Dierks Bentley        0.80
109348             Beachin'             Jake Owen        0.79
106214            Bartender       Lady Antebellum        0.78
110572               Cruise  Florida Georgia Line        0.77
110516        Where It's At          Dustin Lynch        0.76

Top 10 Songs for Genre: rock

                      title      artist  popularity
94681   A Sky Full Of Stars    Coldplay        0.96
111899                 Rude      Magic!        0.94
94691                 Magic    Coldplay        0.91
105275    This Is How We Do  Katy Perry        0.90
105209           Dark Horse  Katy Perry        0.89
103259                 Maps    Maroon 5        0.87
103708         Ain't It Fun    Paramore        0.87
94655              Paradise    Coldplay        0.84
108576  When I Was Your Man  Bruno Mars        0.84
108604             Treasure  Bruno Mars        0.83

In [64]:

from sklearn.cluster import KMeans

In [65]:

features = ['danceability','energy','liveness','time_signature','log_speech','log_instrument']
genre_subset = genre_subset.dropna()
X = genre_subset[features]

In [66]:

km = KMeans(n_clusters=len(genre)).fit(X)

In [67]:

genre_subset['prediction'] = km.predict(genre_subset[features])

In [69]:

genre_subset = genre_subset.drop(['date','rank'],axis=1)
genre_subset = genre_subset.drop_duplicates()

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-69-13aaff1005ae> in <module>()
----> 1 genre_subset = genre_subset.drop(['date','rank'],axis=1)
      2 genre_subset = genre_subset.drop_duplicates()

C:\Users\Matt\Anaconda\lib\site-packages\pandas\core\generic.pyc in drop(self, labels, axis, level, inplace, **kwargs)
   1462                 new_axis = axis.drop(labels, level=level)
   1463             else:
-> 1464                 new_axis = axis.drop(labels)
   1465             dropped = self.reindex(**{axis_name: new_axis})
   1466             try:

C:\Users\Matt\Anaconda\lib\site-packages\pandas\core\index.pyc in drop(self, labels)
   1808         mask = indexer == -1
   1809         if mask.any():
-> 1810             raise ValueError('labels %s not contained in axis' % labels[mask])
   1811         return self.delete(indexer)
   1812 

ValueError: labels ['date' 'rank'] not contained in axis

In [70]:

plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = genre_subset.boxplot(features[i],by='genre',ax=subplot(5,2,v))
    ax1.set_title(str(features[i]),fontsize=15)
    ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Genre',size=20)

Out[70]:

<matplotlib.text.Text at 0x38ae8048>

In [71]:

for i in genre:
    cluster_genre = genre_subset[genre_subset['genre']==i]
    print 'Top 10 Songs for Genre: %s' %i
    print
    print cluster_genre[['title','artist','popularity']].sort('popularity',ascending=False).head(10)
    print

Top 10 Songs for Genre: r&b

                         title         artist  popularity
98244                All of Me    John Legend        0.96
109074               Bang Bang       Jessie J        0.91
111827  Me And My Broken Heart         Rixton        0.88
95037            Drunk in Love        Beyonc�        0.88
111037              Break Free  Ariana Grande        0.87
111017                 Problem  Ariana Grande        0.86
106012          Don't Tell 'Em        Jeremih        0.85
111970                    2 On        Tinashe        0.82
100347               New Flame    Chris Brown        0.82
104010                   Na Na     Trey Songz        0.81

Top 10 Songs for Genre: hip hop

                                   title                   artist  popularity
111810                           Classic                     MKTO        0.92
111660                Turn Down for What                 DJ Snake        0.91
111802                       Black Widow              Iggy Azalea        0.90
111792                             Fancy              Iggy Azalea        0.89
110791  Can't Hold Us - feat. Ray Dalton  Macklemore & Ryan Lewis        0.89
95283                              Happy        Pharrell Williams        0.87
96674                             Timber                  Pitbull        0.86
95318                    Come Get It Bae        Pharrell Williams        0.86
111779                             Fancy              Iggy Azalea        0.84
96708                     Wild Wild Love                  Pitbull        0.84

Top 10 Songs for Genre: country

                      title                artist  popularity
101556      Burnin' It Down          Jason Aldean        0.85
110642                 Dirt  Florida Georgia Line        0.84
85629         American Kids         Kenny Chesney        0.83
110624  This Is How We Roll  Florida Georgia Line        0.81
95474      Drunk On A Plane        Dierks Bentley        0.80
107489        Play It Again            Luke Bryan        0.80
109348             Beachin'             Jake Owen        0.79
106214            Bartender       Lady Antebellum        0.78
110572               Cruise  Florida Georgia Line        0.77
110516        Where It's At          Dustin Lynch        0.76

Top 10 Songs for Genre: rock

                      title      artist  popularity
94681   A Sky Full Of Stars    Coldplay        0.96
111899                 Rude      Magic!        0.94
94691                 Magic    Coldplay        0.91
105275    This Is How We Do  Katy Perry        0.90
105209           Dark Horse  Katy Perry        0.89
103708         Ain't It Fun    Paramore        0.87
103259                 Maps    Maroon 5        0.87
94655              Paradise    Coldplay        0.84
108576  When I Was Your Man  Bruno Mars        0.84
108604             Treasure  Bruno Mars        0.83

In [72]:

plt.figure(figsize=(20,20))
for i in range(len(features)):
    v = i +1
    ax1 = genre_subset.boxplot(features[i],by='prediction',ax=subplot(5,2,v))
    ax1.set_title(str(features[i]),fontsize=15)
    ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Predicted Genre',size=20)

Out[72]:

<matplotlib.text.Text at 0x368e4dd8>

In [73]:

for i in genre_subset['prediction'].unique():
    cluster_genre = genre_subset[genre_subset['prediction']==i]
    print 'Top 10 Songs for Cluster Genre %s' %i
    print
    print cluster_genre[['title','artist','popularity']].sort('popularity',ascending=False).head(10)
    print

Top 10 Songs for Cluster Genre 0

                                   title                   artist  popularity
111899                              Rude                   Magic!        0.94
109074                         Bang Bang                 Jessie J        0.91
105275                 This Is How We Do               Katy Perry        0.90
110791  Can't Hold Us - feat. Ray Dalton  Macklemore & Ryan Lewis        0.89
111792                             Fancy              Iggy Azalea        0.89
103259                              Maps                 Maroon 5        0.87
96674                             Timber                  Pitbull        0.86
106012                    Don't Tell 'Em                  Jeremih        0.85
108576               When I Was Your Man               Bruno Mars        0.84
110750          Thrift Shop - feat. Wanz  Macklemore & Ryan Lewis        0.83

Top 10 Songs for Cluster Genre 3

                         title             artist  popularity
98244                All of Me        John Legend        0.96
111810                 Classic               MKTO        0.92
105209              Dark Horse         Katy Perry        0.89
111827  Me And My Broken Heart             Rixton        0.88
95283                    Happy  Pharrell Williams        0.87
111779                   Fancy        Iggy Azalea        0.84
96708           Wild Wild Love            Pitbull        0.84
96337              No Mediocre               T.I.        0.83
105960       Really Don't Care        Demi Lovato        0.82
100347               New Flame        Chris Brown        0.82

Top 10 Songs for Cluster Genre 1

                      title           artist  popularity
94681   A Sky Full Of Stars         Coldplay        0.96
94691                 Magic         Coldplay        0.91
111660   Turn Down for What         DJ Snake        0.91
111802          Black Widow      Iggy Azalea        0.90
95037         Drunk in Love          Beyonc�        0.88
110920                 Sail       AWOLNATION        0.82
108409                human  Christina Perri        0.81
99648         Feel Good Inc         Gorillaz        0.79
69958         Enter Sandman        Metallica        0.79
94680     Princess of China         Coldplay        0.79

Top 10 Songs for Cluster Genre 2

                   title                artist  popularity
111037        Break Free         Ariana Grande        0.87
103708      Ain't It Fun              Paramore        0.87
95318    Come Get It Bae     Pharrell Williams        0.86
111017           Problem         Ariana Grande        0.86
101556   Burnin' It Down          Jason Aldean        0.85
94655           Paradise              Coldplay        0.84
110642              Dirt  Florida Georgia Line        0.84
108604          Treasure            Bruno Mars        0.83
85629      American Kids         Kenny Chesney        0.83
111946  Come With Me Now                KONGOS        0.82

In [ ]: