from bs4 import BeautifulSoup
import urllib2
from datetime import datetime
import pandas as pd
import json
data = pd.DataFrame(columns=('date','rank','uri'))
data2 = pd.DataFrame(columns=('title','artist','popularity','uri'))
for year in range(1960,2015):
for month in range(1,13):
if month<10:
month = str(0)+str(month)
for day in range(1,32):
if day < 10:
day = str(0)+str(day)
date = str(month)+'-'+str(day)+'-'+str(year)
try:
date1 = datetime.strptime(date, '%m-%d-%Y')
if datetime.weekday(date1) == 5:
try:
link = urllib2.urlopen('http://www.billboard.com/charts/%s-%s-%s/hot-100' % (year,month,day))
soup = BeautifulSoup(link)
spotsoup1 = soup.findAll('a')
splitsoup1 = str(spotsoup1[58]).split('\"')
splitsoup2= splitsoup1[3].split(':')
splitsoup3 = splitsoup2[2].split(',')
for i in range(40):
dic = {}
dic['date'] = date
dic['rank'] = str(i+1)
dic['uri'] = str(splitsoup3[i])
data = data.append(dic,ignore_index=True)
except TypeError:
pass
except ValueError:
pass
spoturl = 'http://ws.spotify.com/lookup/1/.json?uri=spotify:track:'
uri_list = data['uri'].unique()
for i in uri_list:
dic2 = {}
url = spoturl + str(i)
response = urllib2.urlopen(url)
json_object = json.load(response)
dic2['title'] = json_object['track']['name']
dic2['artist'] = json_object['track']['artists'][0]['name']
dic2['popularity'] = json_object['track']['popularity']
dic2['uri'] = i
data2 = data2.append(dic2,ignore_index=True)
project_data = pd.merge(data,data2,on='uri')
project_data['rank'] = project_data['rank'].astype('int')
project_data['popularity'] = project_data['popularity'].astype('float')
project_data.sort(['date','rank'])
project_data.head()
data3 = pd.DataFrame(columns=('danceability','duration','energy','instrumentalness','key','liveness',
'loudness','speechiness','tempo','time_signature','uri'))
#echonest data
echourl = 'http://developer.echonest.com/api/v4/track/profile?api_key=API=json&id=spotify:track:'
echourl2 ='&bucket=audio_summary'
for i in uri_list:
try:
uri = i
url = echourl+uri+echourl2
dic3 = {}
response = urllib2.urlopen(url)
json_object = json.load(response)
dic3['danceability'] = json_object['response']['track']['audio_summary']['danceability']
dic3['duration'] = json_object['response']['track']['audio_summary']['duration']
dic3['energy'] = json_object['response']['track']['audio_summary']['energy']
dic3['instrumentalness'] = json_object['response']['track']['audio_summary']['instrumentalness']
dic3['key'] = json_object['response']['track']['audio_summary']['key']
dic3['liveness'] = json_object['response']['track']['audio_summary']['liveness']
dic3['loudness'] = json_object['response']['track']['audio_summary']['loudness']
dic3['speechiness'] = json_object['response']['track']['audio_summary']['speechiness']
dic3['tempo'] = json_object['response']['track']['audio_summary']['tempo']
dic3['time_signature'] = json_object['response']['track']['audio_summary']['time_signature']
dic3['uri'] = i
data3 = data3.append(dic3,ignore_index=True)
except KeyError:
pass
song_data=pd.merge(project_data,data3,on='uri')
song_data.to_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')
spoturl = 'http://ws.spotify.com/lookup/1/.json?uri=spotify:track:'
for i in range(8033,11115):
dic = {}
url = spoturl + str(uri[i])
response = urllib2.urlopen(url)
json_object = json.load(response)
dic['artist_uri'] = json_object['track']['artists'][0]['href']
dic['uri'] = uri[i]
data = data.append(dic,ignore_index=True)
echourl = 'http://developer.echonest.com/api/v4/artist/terms?api_key=API&id='
echour2 = '&format=json'
num = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
for i in range(3414,3421):
try:
uri = artist[i]
url = echourl+uri+echour2
dic = {}
dic2 = {}
response = urllib2.urlopen(url)
json_object = json.load(response)
dic['genre'] = json_object['response']['terms']
term_rank = pd.DataFrame(columns=['frequency','name','weight'])
dic2['artist_uri'] = artist[i]
for genre in dic['genre']:
term_rank = term_rank.append(genre,ignore_index=True)
term_rank = (term_rank.sort('frequency',ascending=False)).reset_index()
if term_rank['name'][0]=='pop':
if term_rank['name'][1][0] in num:
dic2['genre'] = term_rank['name'][2]
else:
dic2['genre'] = term_rank['name'][1]
elif term_rank['name'][0][0] in num:
if term_rank['name'][1]=='pop':
dic2['genre'] = term_rank['name'][2]
else:
dic2['genre'] = term_rank['name'][1]
else:
dic2['genre'] = term_rank['name'][0]
genre_list = genre_list.append(dic2,ignore_index=True)
except IndexError:
pass
import pandas as pd
import matplotlib.pyplot as plt
import numpy
import seaborn as sns
songs = pd.read_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')
songs.head()
date | rank | uri | title | artist | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1/2/1960 | 1 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
1 | 1/9/1960 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
2 | 1/16/1960 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
3 | 1/23/1960 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
4 | 1/30/1960 | 3 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
#Convert dtype for date
songs['date'] = pd.to_datetime(songs['date'],format='%m/%d/%Y')
songs.describe()
rank | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 114422.000000 | 114422.000000 | 114422.000000 | 114422.000000 | 114422.000000 | 1.144220e+05 | 114422.000000 | 114422.000000 | 114422.000000 | 113607.000000 | 114422.000000 | 114422.000000 |
mean | 20.497098 | 0.399589 | 0.611887 | 231.943356 | 0.630440 | 3.373952e-02 | 5.228592 | 0.190379 | -8.468298 | 0.066200 | 120.196028 | 3.951277 |
std | 11.545553 | 0.214628 | 0.150314 | 61.443767 | 0.194966 | 1.388380e-01 | 3.560654 | 0.163452 | 3.525335 | 0.070969 | 27.585767 | 0.273730 |
min | 1.000000 | 0.000000 | 0.068750 | 61.266210 | 0.020085 | 4.090000e-15 | 0.000000 | 0.013549 | -41.613000 | 0.022336 | 41.409000 | 1.000000 |
25% | 10.000000 | 0.250000 | 0.514386 | 192.746210 | 0.490307 | 9.110000e-08 | 2.000000 | 0.088227 | -10.689000 | 0.031694 | 99.948750 | 4.000000 |
50% | 20.000000 | 0.420000 | 0.623672 | 229.559550 | 0.648759 | 8.550000e-06 | 5.000000 | 0.128692 | -7.996000 | 0.040176 | 118.972500 | 4.000000 |
75% | 31.000000 | 0.560000 | 0.717292 | 263.372880 | 0.788447 | 7.380170e-04 | 8.000000 | 0.243539 | -5.803000 | 0.062878 | 134.682500 | 4.000000 |
max | 40.000000 | 1.000000 | 0.984268 | 1367.092880 | 0.995899 | 9.702722e-01 | 11.000000 | 0.990848 | -0.073000 | 0.891714 | 217.748000 | 5.000000 |
fig = plt.figure(figsize =(30,25))
ax1 = fig.add_subplot(2,2,1)
ax1.hist(songs.popularity)
ax1.set_xlabel('Popularity',fontsize=20)
ax1.set_title('Histogram of Current Popularity Ratings',fontsize =25)
ax2 = fig.add_subplot(2,2,2)
ax2.set_ylim(-.1,1.1)
ax2.set_ylabel('Popularity',fontsize=20)
ax2.boxplot(songs.popularity)
ax2.set_title('Boxplot of Current Popularity Ratings',fontsize =25)
ax3 = fig.add_subplot(2,2,3)
ax3 = sns.violinplot(songs.popularity)
ax3.set_ylabel('Popularity',fontsize=20)
ax3.set_title('Violin Plot of Current Popularity Ratings',fontsize =25)
ax4 = fig.add_subplot(2,2,4)
ax4 = songs.popularity.plot(kind='kde')
ax4.set_title('KDE Plot of Current Popularity Ratings',fontsize =25)
ax4.set_ylabel('Popularity',fontsize=20)
fig.suptitle('Distributions of Current Popularity Ratings',fontsize =30)
<matplotlib.text.Text at 0x1662ef28>
pd.scatter_matrix(songs[['popularity','danceability','duration','tempo','loudness']],figsize=(20,20))
plt.suptitle('Scatterplot Matrix of Song Attributes')
<matplotlib.text.Text at 0x9ca5588>
#What song is the duration outlier?
big = songs[['title','artist']][songs['duration']>1000]
big.drop_duplicates()
title | artist | |
---|---|---|
32217 | Autobahn - 2009 Remastered Version | Kraftwerk |
#What songs currently have the highest popularity?
hits = songs[['title','artist','popularity']][songs['popularity']>.95]
hits.drop_duplicates()
#Recent songs appear to have the highest popularity scores in the set.
title | artist | popularity | |
---|---|---|---|
113442 | All of Me | John Legend | 0.96 |
113739 | Summer | Calvin Harris | 0.97 |
113807 | A Sky Full Of Stars | Coldplay | 0.96 |
114084 | Shower | Becky G | 0.97 |
114150 | All About That Bass | Meghan Trainor | 1.00 |
114231 | Don't | Ed Sheeran | 0.96 |
#What songs currently have the highest popularity?
hits = songs[['title','artist','popularity']][songs['popularity']>.95]
hits.drop_duplicates()
#Recent songs appear to have the highest popularity scores in the set.
title | artist | popularity | |
---|---|---|---|
113442 | All of Me | John Legend | 0.96 |
113739 | Summer | Calvin Harris | 0.97 |
113807 | A Sky Full Of Stars | Coldplay | 0.96 |
114084 | Shower | Becky G | 0.97 |
114150 | All About That Bass | Meghan Trainor | 1.00 |
114231 | Don't | Ed Sheeran | 0.96 |
songs_1 = songs[songs['rank'] == 1]
songs_10 = songs[songs['rank'] == 10]
songs_20 = songs[songs['rank'] == 20]
songs_40 = songs[songs['rank'] == 40]
songs_1.plot(x='date',y='popularity',figsize=(25,8))
plt.suptitle('Timeseries of Popularity for #1 Songs')
plt.ylabel('Current Popularity')
<matplotlib.text.Text at 0x195f83c8>
fig = plt.figure(figsize =(30,25))
fig.suptitle('Timeseries Comparison for #1s, #10s, and #40s')
ax1 = fig.add_subplot(3,1,1)
ax1.set_title('#1s',fontsize = 20)
ax1 = songs_1.plot(x='date',y='popularity')
ax1.set_ylabel('Popularity')
ax2 = fig.add_subplot(3,1,2)
ax2.set_title('#10s',fontsize = 20)
ax2.set_ylabel('Popularity')
ax2 = songs_10.plot(x='date',y='popularity')
ax3 = fig.add_subplot(3,1,3)
ax3.set_title('#40s',fontsize = 20)
ax3.set_ylabel('Popularity')
ax3 = songs_40.plot(x='date',y='popularity')
#Get songs' highest rank and the last date it achieved that rank
unique = songs.uri.unique()
top_rank = pd.DataFrame(columns=['uri','rank','date'])
ranker = {}
for i in unique:
ranker['uri'] = i
unique_song = songs[songs.uri == i]
min_rank = unique_song[unique_song['rank'] == unique_song['rank'].min()]
min_rank = min_rank.reset_index()
ranker['rank'] = min_rank['rank'][0]
max_date = min_rank[min_rank['date']==min_rank['date'].max()]
max_date = max_date.reset_index()
ranker['date'] = max_date['date'][0]
top_rank = top_rank.append(ranker,ignore_index=True)
top_rank.head()
top_songs = pd.read_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project/top_rank.csv')
top_songs['date'] = pd.to_datetime(top_songs['date'])
top_songs['time'] = top_songs.date.max() - top_songs.date
top_songs['days'] = (top_songs.time /np.timedelta64(1, 'D')).astype(int)
top_songs = top_songs.drop('Unnamed: 0',axis=1)
top_songs.head()
uri | rank | date | time | days | |
---|---|---|---|---|---|
0 | 3hvakqVpwaz4L7zN5HfTCY | 1 | 1960-01-02 | 19992 days | 19992 |
1 | 4f8hBeMXMvssn6HtFAtblo | 1 | 1960-01-16 | 19978 days | 19978 |
2 | 1XRXD6RmgxtySaKbrVBfzk | 3 | 1960-01-09 | 19985 days | 19985 |
3 | 22TgqnP9tyLU8i0eZnbuMR | 4 | 1960-01-02 | 19992 days | 19992 |
4 | 6TdAcAgVw7Z8pzU2KswtvH | 3 | 1960-01-16 | 19978 days | 19978 |
top_songs = pd.merge(songs,top_songs,on='uri')
import matplotlib.pyplot as plt
features = ['danceability','duration','energy','instrumentalness','key','liveness','loudness','tempo','time_signature','days']
pd.scatter_matrix(top_songs[features],figsize=(20,20))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001FA5A2E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000221B8438>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023EDA320>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001FBB9780>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022CDE470>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000001FB75400>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023B4F5C0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023A932B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023DECA58>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022D50748>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000022E37160>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023F41E80>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000023F5F550>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000240C5668>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002417F0F0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024211BA8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024317898>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024287F98>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024479AC8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000245D26D8>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000246B4908>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000247795F8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024805E48>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000024914978>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000249A7208>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025C60E10>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025D24978>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025E08518>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025F0E208>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000025E3AA90>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000026072438>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026179F60>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026229278>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026320F28>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000263767B8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000264D92E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002658FB38>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002667F780>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000267842E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000267B7710>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000268FFEF0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026A1B780>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026AC8320>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026B9DEF0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026C83BE0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026D458D0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026D03048>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026F2CB00>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000026FAD668>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000270A2940>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000271A7630>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000271F0E80>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000273429B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027459240>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000274F8E48>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000275BB9B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000276A0550>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000277A5240>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000276D3940>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027929470>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000027A20F98>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027B292B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027C1FF60>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027D0B7F0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027DDB320>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027EF3B70>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000027F9F7B8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000280A6320>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000280E99E8>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000281FDF60>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000028314710>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000283C05C0>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000284C7D30>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002857CDA0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002863D8D0>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000285F0A90>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028836C18>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002893D828>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000289DFBE0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028AED470>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000028BD3080>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028CCCB70>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028DE0400>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028ED0EB8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000028F97AC8>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002905B710>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029160400>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002908CB00>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029316630>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000293E9198>], [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000294CE470>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000295D4160>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002965E9B0>, <matplotlib.axes._subplots.AxesSubplot object at 0x000000002976F4E0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029826D30>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000298E1978>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000299FA4E0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029B3A080>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029BEED30>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000029B6B470>]], dtype=object)
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = subplot(5,2,v)
ax1.hist(top_songs[features[i]])
ax1.set_title(str(features[i]),fontsize=15)
plt.hist(top_songs['popularity'])
plt.suptitle('Top Popularity')
<matplotlib.text.Text at 0x36e3e390>
the_1s = songs[songs['rank']==1]
plt.figure(figsize=(20,10))
the_1s.plot(x='date',y='popularity')
plt.suptitle('Current Popularity Ratings for #1 songs 1960-2014',fontsize = 25)
<matplotlib.text.Text at 0x38318828>
from sklearn import linear_model
clf = linear_model.LinearRegression()
X = top_songs[features]
X = X.values
y = top_songs['popularity']
y = y.values
from sklearn.cross_validation import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X, y)
model = clf.fit(xtrain,ytrain)
model.score(xtrain,ytrain)
0.32966268207020744
model.score(xtest,ytest)
0.32361192116190363
pd.DataFrame(zip(features,model.coef_.T),columns=['Variable','Coefficient'])
Variable | Coefficient | |
---|---|---|
0 | danceability | 0.001357 |
1 | duration | 0.000056 |
2 | energy | -0.046787 |
3 | instrumentalness | -0.113890 |
4 | key | -0.001034 |
5 | liveness | -0.057652 |
6 | loudness | 0.007267 |
7 | tempo | -0.000007 |
8 | time_signature | 0.003422 |
9 | days | -0.000018 |
from sklearn import feature_selection
f = feature_selection.f_regression(X,y)
pd.DataFrame(zip(features,f[1].T),columns=['Variable','P-Value'])
Variable | P-Value | |
---|---|---|
0 | danceability | 0.000000e+00 |
1 | duration | 0.000000e+00 |
2 | energy | 0.000000e+00 |
3 | instrumentalness | 0.000000e+00 |
4 | key | 5.298700e-01 |
5 | liveness | 9.832641e-152 |
6 | loudness | 0.000000e+00 |
7 | tempo | 3.581412e-02 |
8 | time_signature | 8.023966e-199 |
9 | days | 0.000000e+00 |
new_features = ['danceability','duration','energy','instrumentalness','liveness','loudness','time_signature']
X = top_songs[new_features]
X = X.values
y = top_songs['popularity']
y = y.values
xtrain,xtest,ytrain,ytest = train_test_split(X, y)
model = clf.fit(xtrain,ytrain)
model.score(xtrain,ytrain)
0.18319207407832316
model.score(xtest,ytest)
0.18412613746261552
pd.DataFrame(zip(new_features,model.coef_.T),columns=['Variable','Coefficient'])
Variable | Coefficient | |
---|---|---|
0 | danceability | 0.111771 |
1 | duration | 0.000597 |
2 | energy | -0.099886 |
3 | instrumentalness | -0.146259 |
4 | liveness | -0.084151 |
5 | loudness | 0.023090 |
6 | time_signature | 0.020949 |
pd.DataFrame(zip(new_features,f[1].T),columns=['Variable','P-Value'])
Variable | P-Value | |
---|---|---|
0 | danceability | 0.000000e+00 |
1 | duration | 0.000000e+00 |
2 | energy | 0.000000e+00 |
3 | instrumentalness | 0.000000e+00 |
4 | liveness | 5.298700e-01 |
5 | loudness | 9.832641e-152 |
6 | time_signature | 0.000000e+00 |
features = ['danceability','duration','energy','instrumentalness','key','liveness','loudness','tempo','time_signature','days']
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = subplot(5,2,v)
ax1.scatter(top_songs[features[i]],top_songs['popularity'])
ax1.set_title(str(features[i]),fontsize=15)
newer_features = ['days','loudness']
X = top_songs[newer_features]
X = X.values
y = top_songs['popularity']
y = y.values
model = clf.fit(xtrain,ytrain)
model.score(xtrain,ytrain)
0.18319207407832316
model.score(xtest,ytest)
0.18412613746261552
pd.DataFrame(zip(newer_features,model.coef_.T),columns=['Variable','Coefficient'])
Variable | Coefficient | |
---|---|---|
0 | days | 0.111771 |
1 | loudness | 0.000597 |
pd.DataFrame(zip(newer_features,f[1].T),columns=['Variable','P-Value'])
Variable | P-Value | |
---|---|---|
0 | days | 0 |
1 | loudness | 0 |
songs = pd.read_csv('C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project/song_genre.csv')
songs['log_speech'] = np.log(songs['speechiness'])
songs['log_speech'] = songs['log_speech'] - songs['log_speech'].min()
songs['log_instrument'] = np.log(songs['instrumentalness'])
songs['log_instrument'] = songs['log_instrument'] - songs['log_instrument'].min()
songs.head()
artist_uri | genre | uri | date | rank | title | artist | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | log_speech | log_instrument | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | spotify:artist:5zNOI87gG4RttFmYAZWaxQ | rock | 3hvakqVpwaz4L7zN5HfTCY | 1/2/1960 | 1 | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 | 0.189548 | 14.067097 |
1 | spotify:artist:5zNOI87gG4RttFmYAZWaxQ | rock | 3hvakqVpwaz4L7zN5HfTCY | 1/9/1960 | 2 | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 | 0.189548 | 14.067097 |
2 | spotify:artist:5zNOI87gG4RttFmYAZWaxQ | rock | 3hvakqVpwaz4L7zN5HfTCY | 1/16/1960 | 2 | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 | 0.189548 | 14.067097 |
3 | spotify:artist:5zNOI87gG4RttFmYAZWaxQ | rock | 3hvakqVpwaz4L7zN5HfTCY | 1/23/1960 | 2 | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 | 0.189548 | 14.067097 |
4 | spotify:artist:5zNOI87gG4RttFmYAZWaxQ | rock | 3hvakqVpwaz4L7zN5HfTCY | 1/30/1960 | 3 | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 | 0.189548 | 14.067097 |
genres = pd.DataFrame(songs.genre.value_counts())
genres = songs['genre'].unique()
genres
array(['rock', 'country', "rock 'n roll", 'swing', 'jazz', 'choral music', 'r&b', 'soul', 'easy listening', 'male vocalist', 'doo-wop', 'ballad', 'rockabilly', 'blues', 'traditional pop', 'folk', 'dance', 'christian', 'break', 'instrumental', 'punk', 'polka', 'symphony', 'comedy', 'classic rock', '50s', 'adult contemporary', 'pop rock', 'brill building pop', 'soundtrack', 'latin', 'club', 'big band', 'ska', 'post-hardcore', 'reggae', 'tejano', '60s pop', '70s', 'lounge', 'singer-songwriter', 'electronic', 'funk', 'sunshine pop', 'house', 'hip hop', 'psychedelic', 'psychedelic rock', 'metal', 'northern soul', 'electronica', 'soft rock', 'new age', 'disco', 'folk rock', 'garage rock', 'motown', 'power pop', 'romantic', 'classical', 'ccm', 'new wave', 'eurodance', 'indie', 'synthpop', 'rap', 'boy band', '80s', 'dance pop', 'techno', 'hardcore', 'experimental', 'alternative', 'teen pop', 'gospel', 'trance', 'soca', 'crunk', 'opera', 'remix', 'indie pop'], dtype=object)
genre_list = ['rock','r&b','country','soul','hip hop','dance','rap','jazz','folk','disco','funk',
'reggae','latin','house','electronic','blues','metal']
song_genre = songs[songs['genre'].isin(genre_list)]
song_genre['date'] = pd.to_datetime(song_genre['date'],format='%m/%d/%Y')
-c:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead
genre_dummy = pd.get_dummies(song_genre['genre'])
song_date = song_genre['date']
song_date = genre_dummy.join(song_date)
song_group = song_date.groupby('date').agg('sum')
song_group.plot(kind='area',figsize=(30,18),colormap ='Paired')
plt.legend(loc=2,fontsize=15,ncol=3,markerscale=100)
plt.suptitle('Billboard Top 40 by Genre (Jan 1960 - Sep 2014)',fontsize=30)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.xlabel('date',fontsize=25)
<matplotlib.text.Text at 0x19708860>
rock = song_genre[song_genre['genre']=='rock']
def decade(date):
year = str(date).split('-')[0]
last_num = year[3]
decade = int(year) - int(last_num)
return decade
rock['decade'] = rock['date'].apply(decade)
rock = rock.drop(['rank','date'],axis=1)
rock = rock.drop_duplicates()
rock = rock.reset_index(drop=True)
features = ['danceability','duration','energy','key','liveness','loudness','tempo','time_signature','instrumentalness','speechiness']
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = subplot(5,2,v)
ax1.hist(rock[features[i]])
ax1.set_title(str(features[i]),fontsize=15)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-23-35224f1c7e8c> in <module>() 3 v = i +1 4 ax1 = subplot(5,2,v) ----> 5 ax1.hist(rock[features[i]]) 6 ax1.set_title(str(features[i]),fontsize=15) C:\Users\Matt\Anaconda\lib\site-packages\matplotlib\axes\_axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs) 5650 # this will automatically overwrite bins, 5651 # so that each histogram uses the same bins -> 5652 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs) 5653 m = m.astype(float) # causes problems later if it's an int 5654 if mlast is None: C:\Users\Matt\Anaconda\lib\site-packages\numpy\lib\function_base.pyc in histogram(a, bins, range, normed, weights, density) 163 if (mn > mx): 164 raise AttributeError( --> 165 'max must be larger than min in range parameter.') 166 167 if not iterable(bins): AttributeError: max must be larger than min in range parameter.
features = ['danceability','duration','energy','key','liveness','loudness','tempo','time_signature','log_speech','log_instrument']
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = rock.boxplot(features[i],by='decade',ax=subplot(5,2,v))
ax1.set_title(str(features[i]),fontsize=15)
ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Decade',size=20)
<matplotlib.text.Text at 0x21d711d0>
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
rock = rock.dropna()
rock['loudness'] = rock['loudness']-rock['loudness'].min()
X = rock[features]
Y = rock['decade']
xtrain,xtest,ytrain,ytest = train_test_split(X,Y)
mnb = MultinomialNB().fit(xtrain,ytrain)
print 'Train: ', mnb.score(xtrain,ytrain)
print 'Test: ', mnb.score(xtest, ytest)
Train: 0.396097046414 Test: 0.384493670886
from sklearn import metrics
y_pred = mnb.predict(X)
decades = ['1960','1970','1980','1990','2000','2010']
print "Confusion Matrix"
matrix = pd.DataFrame(metrics.confusion_matrix(Y,y_pred),columns=decades,index=decades)
matrix
Confusion Matrix
1960 | 1970 | 1980 | 1990 | 2000 | 2010 | |
---|---|---|---|---|---|---|
1960 | 954 | 186 | 65 | 36 | 38 | 11 |
1970 | 394 | 309 | 250 | 236 | 25 | 3 |
1980 | 206 | 321 | 394 | 335 | 39 | 10 |
1990 | 62 | 98 | 168 | 228 | 30 | 7 |
2000 | 107 | 34 | 87 | 79 | 73 | 26 |
2010 | 55 | 36 | 34 | 28 | 62 | 30 |
plt.hist(ytrain)
(array([ 992., 0., 911., 0., 955., 0., 442., 0., 310., 182.]), array([ 1960., 1965., 1970., 1975., 1980., 1985., 1990., 1995., 2000., 2005., 2010.]), <a list of 10 Patch objects>)
new_features = ['liveness','loudness','energy','log_speech','danceability','log_instrument']
X = rock[new_features]
X = rock[new_features]
xtrain,xtest,ytrain,ytest = train_test_split(X,Y)
mnb = MultinomialNB().fit(xtrain,ytrain)
print 'Train: ', mnb.score(xtrain,ytrain)
print 'Test: ', mnb.score(xtest, ytest)
Train: 0.314609704641 Test: 0.340189873418
from sklearn.ensemble import RandomForestClassifier
X = rock[features]
Y = rock['decade']
xtrain,xtest,ytrain,ytest = train_test_split(X,Y)
RFC = RandomForestClassifier(n_estimators=800).fit(xtrain,ytrain)
print 'Train: ', RFC.score(xtrain,ytrain)
print 'Test: ', RFC.score(xtest, ytest)
Train: 0.985232067511 Test: 0.549841772152
from sklearn.metrics import classification_report
y_pred = RFC.predict(xtest)
decades = ['1960','1970','1980','1990','2000','2010']
print 'Train: ', RFC.score(xtrain,ytrain)
print 'Test: ', RFC.score(xtest, ytest)
print matrix, '\n'
print 'Classification Report'
print classification_report(ytest,y_pred,target_names=decades),"\n"
Train: 0.985232067511 Test: 0.549841772152 1960 1970 1980 1990 2000 2010 1960 954 186 65 36 38 11 1970 394 309 250 236 25 3 1980 206 321 394 335 39 10 1990 62 98 168 228 30 7 2000 107 34 87 79 73 26 2010 55 36 34 28 62 30 Classification Report precision recall f1-score support 1960 0.77 0.84 0.80 324 1970 0.46 0.41 0.43 299 1980 0.51 0.67 0.58 325 1990 0.27 0.10 0.15 139 2000 0.41 0.49 0.45 113 2010 0.48 0.22 0.30 64 avg / total 0.53 0.55 0.53 1264
genre = ['r&b','hip hop','country']
genre_subset = songs[songs['genre'].isin(genre)]
genre_subset = genre_subset.drop(['date','rank'],axis=1)
genre_subset = genre_subset.drop_duplicates()
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = genre_subset.boxplot(features[i],by='genre',ax=subplot(5,2,v))
ax1.set_title(str(features[i]),fontsize=15)
ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Genre',size=20)
<matplotlib.text.Text at 0x2fec4908>
genre_subset = genre_subset.dropna()
X = genre_subset[features]
def genre_num(lst):
index = genre.index(lst)
return index
genre_subset['genre_num'] = genre_subset['genre'].apply(genre_num)
genre_subset = genre_subset.dropna()
X = genre_subset[features]
Y = genre_subset['genre_num']
xtrain,xtest,ytrain,ytest = train_test_split(X,Y)
rf = RandomForestClassifier(n_estimators=800).fit(xtrain,ytrain)
print 'Train: ', rf.score(xtrain,ytrain)
print 'Test: ', rf.score(xtest, ytest)
Train: 1.0 Test: 0.729691876751
print "Confusion Matrix"
y_pred = rf.predict(xtest)
matrix = pd.DataFrame(metrics.confusion_matrix(ytest,y_pred),columns=genre,index=genre)
matrixgenre = ['r&b','hip hop','country','rock']
features = ['danceability','energy','liveness','time_signature','log_speech','log_instrument']
Confusion Matrix
r&b | hip hop | country | |
---|---|---|---|
r&b | 211 | 26 | 59 |
hip hop | 42 | 83 | 4 |
country | 60 | 2 | 227 |
genre = ['r&b','hip hop','country','rock']
features = ['danceability','energy','liveness','time_signature','log_speech','log_instrument']
genre_subset = songs[songs['genre'].isin(genre)]
genre_subset = genre_subset.drop(['date','rank'],axis=1)
genre_subset = genre_subset.drop_duplicates()
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = genre_subset.boxplot(features[i],by='genre',ax=subplot(5,2,v))
ax1.set_title(str(features[i]),fontsize=15)
ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Genre',size=20)
<matplotlib.text.Text at 0x383acc88>
for i in genre:
cluster_genre = genre_subset[genre_subset['genre']==i]
print 'Top 10 Songs for Genre: %s' %i
print
print cluster_genre[['title','artist','popularity']].sort('popularity',ascending=False).head(10)
print
Top 10 Songs for Genre: r&b title artist popularity 98244 All of Me John Legend 0.96 109074 Bang Bang Jessie J 0.91 107679 Wiggle (feat. Snoop Dogg) Jason Derulo 0.90 111827 Me And My Broken Heart Rixton 0.88 95037 Drunk in Love Beyonc� 0.88 111037 Break Free Ariana Grande 0.87 111017 Problem Ariana Grande 0.86 106012 Don't Tell 'Em Jeremih 0.85 111970 2 On Tinashe 0.82 100347 New Flame Chris Brown 0.82 Top 10 Songs for Genre: hip hop title artist popularity 111810 Classic MKTO 0.92 111660 Turn Down for What DJ Snake 0.91 111802 Black Widow Iggy Azalea 0.90 111792 Fancy Iggy Azalea 0.89 110791 Can't Hold Us - feat. Ray Dalton Macklemore & Ryan Lewis 0.89 95283 Happy Pharrell Williams 0.87 96674 Timber Pitbull 0.86 95318 Come Get It Bae Pharrell Williams 0.86 111779 Fancy Iggy Azalea 0.84 96708 Wild Wild Love Pitbull 0.84 Top 10 Songs for Genre: country title artist popularity 101556 Burnin' It Down Jason Aldean 0.85 110642 Dirt Florida Georgia Line 0.84 85629 American Kids Kenny Chesney 0.83 110624 This Is How We Roll Florida Georgia Line 0.81 107489 Play It Again Luke Bryan 0.80 95474 Drunk On A Plane Dierks Bentley 0.80 109348 Beachin' Jake Owen 0.79 106214 Bartender Lady Antebellum 0.78 110572 Cruise Florida Georgia Line 0.77 110516 Where It's At Dustin Lynch 0.76 Top 10 Songs for Genre: rock title artist popularity 94681 A Sky Full Of Stars Coldplay 0.96 111899 Rude Magic! 0.94 94691 Magic Coldplay 0.91 105275 This Is How We Do Katy Perry 0.90 105209 Dark Horse Katy Perry 0.89 103259 Maps Maroon 5 0.87 103708 Ain't It Fun Paramore 0.87 94655 Paradise Coldplay 0.84 108576 When I Was Your Man Bruno Mars 0.84 108604 Treasure Bruno Mars 0.83
from sklearn.cluster import KMeans
features = ['danceability','energy','liveness','time_signature','log_speech','log_instrument']
genre_subset = genre_subset.dropna()
X = genre_subset[features]
km = KMeans(n_clusters=len(genre)).fit(X)
genre_subset['prediction'] = km.predict(genre_subset[features])
genre_subset = genre_subset.drop(['date','rank'],axis=1)
genre_subset = genre_subset.drop_duplicates()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-69-13aaff1005ae> in <module>() ----> 1 genre_subset = genre_subset.drop(['date','rank'],axis=1) 2 genre_subset = genre_subset.drop_duplicates() C:\Users\Matt\Anaconda\lib\site-packages\pandas\core\generic.pyc in drop(self, labels, axis, level, inplace, **kwargs) 1462 new_axis = axis.drop(labels, level=level) 1463 else: -> 1464 new_axis = axis.drop(labels) 1465 dropped = self.reindex(**{axis_name: new_axis}) 1466 try: C:\Users\Matt\Anaconda\lib\site-packages\pandas\core\index.pyc in drop(self, labels) 1808 mask = indexer == -1 1809 if mask.any(): -> 1810 raise ValueError('labels %s not contained in axis' % labels[mask]) 1811 return self.delete(indexer) 1812 ValueError: labels ['date' 'rank'] not contained in axis
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = genre_subset.boxplot(features[i],by='genre',ax=subplot(5,2,v))
ax1.set_title(str(features[i]),fontsize=15)
ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Genre',size=20)
<matplotlib.text.Text at 0x38ae8048>
for i in genre:
cluster_genre = genre_subset[genre_subset['genre']==i]
print 'Top 10 Songs for Genre: %s' %i
print
print cluster_genre[['title','artist','popularity']].sort('popularity',ascending=False).head(10)
print
Top 10 Songs for Genre: r&b title artist popularity 98244 All of Me John Legend 0.96 109074 Bang Bang Jessie J 0.91 111827 Me And My Broken Heart Rixton 0.88 95037 Drunk in Love Beyonc� 0.88 111037 Break Free Ariana Grande 0.87 111017 Problem Ariana Grande 0.86 106012 Don't Tell 'Em Jeremih 0.85 111970 2 On Tinashe 0.82 100347 New Flame Chris Brown 0.82 104010 Na Na Trey Songz 0.81 Top 10 Songs for Genre: hip hop title artist popularity 111810 Classic MKTO 0.92 111660 Turn Down for What DJ Snake 0.91 111802 Black Widow Iggy Azalea 0.90 111792 Fancy Iggy Azalea 0.89 110791 Can't Hold Us - feat. Ray Dalton Macklemore & Ryan Lewis 0.89 95283 Happy Pharrell Williams 0.87 96674 Timber Pitbull 0.86 95318 Come Get It Bae Pharrell Williams 0.86 111779 Fancy Iggy Azalea 0.84 96708 Wild Wild Love Pitbull 0.84 Top 10 Songs for Genre: country title artist popularity 101556 Burnin' It Down Jason Aldean 0.85 110642 Dirt Florida Georgia Line 0.84 85629 American Kids Kenny Chesney 0.83 110624 This Is How We Roll Florida Georgia Line 0.81 95474 Drunk On A Plane Dierks Bentley 0.80 107489 Play It Again Luke Bryan 0.80 109348 Beachin' Jake Owen 0.79 106214 Bartender Lady Antebellum 0.78 110572 Cruise Florida Georgia Line 0.77 110516 Where It's At Dustin Lynch 0.76 Top 10 Songs for Genre: rock title artist popularity 94681 A Sky Full Of Stars Coldplay 0.96 111899 Rude Magic! 0.94 94691 Magic Coldplay 0.91 105275 This Is How We Do Katy Perry 0.90 105209 Dark Horse Katy Perry 0.89 103708 Ain't It Fun Paramore 0.87 103259 Maps Maroon 5 0.87 94655 Paradise Coldplay 0.84 108576 When I Was Your Man Bruno Mars 0.84 108604 Treasure Bruno Mars 0.83
plt.figure(figsize=(20,20))
for i in range(len(features)):
v = i +1
ax1 = genre_subset.boxplot(features[i],by='prediction',ax=subplot(5,2,v))
ax1.set_title(str(features[i]),fontsize=15)
ax1.set_xlabel('')
plt.suptitle('Feature Boxplots by Predicted Genre',size=20)
<matplotlib.text.Text at 0x368e4dd8>
for i in genre_subset['prediction'].unique():
cluster_genre = genre_subset[genre_subset['prediction']==i]
print 'Top 10 Songs for Cluster Genre %s' %i
print
print cluster_genre[['title','artist','popularity']].sort('popularity',ascending=False).head(10)
print
Top 10 Songs for Cluster Genre 0 title artist popularity 111899 Rude Magic! 0.94 109074 Bang Bang Jessie J 0.91 105275 This Is How We Do Katy Perry 0.90 110791 Can't Hold Us - feat. Ray Dalton Macklemore & Ryan Lewis 0.89 111792 Fancy Iggy Azalea 0.89 103259 Maps Maroon 5 0.87 96674 Timber Pitbull 0.86 106012 Don't Tell 'Em Jeremih 0.85 108576 When I Was Your Man Bruno Mars 0.84 110750 Thrift Shop - feat. Wanz Macklemore & Ryan Lewis 0.83 Top 10 Songs for Cluster Genre 3 title artist popularity 98244 All of Me John Legend 0.96 111810 Classic MKTO 0.92 105209 Dark Horse Katy Perry 0.89 111827 Me And My Broken Heart Rixton 0.88 95283 Happy Pharrell Williams 0.87 111779 Fancy Iggy Azalea 0.84 96708 Wild Wild Love Pitbull 0.84 96337 No Mediocre T.I. 0.83 105960 Really Don't Care Demi Lovato 0.82 100347 New Flame Chris Brown 0.82 Top 10 Songs for Cluster Genre 1 title artist popularity 94681 A Sky Full Of Stars Coldplay 0.96 94691 Magic Coldplay 0.91 111660 Turn Down for What DJ Snake 0.91 111802 Black Widow Iggy Azalea 0.90 95037 Drunk in Love Beyonc� 0.88 110920 Sail AWOLNATION 0.82 108409 human Christina Perri 0.81 99648 Feel Good Inc Gorillaz 0.79 69958 Enter Sandman Metallica 0.79 94680 Princess of China Coldplay 0.79 Top 10 Songs for Cluster Genre 2 title artist popularity 111037 Break Free Ariana Grande 0.87 103708 Ain't It Fun Paramore 0.87 95318 Come Get It Bae Pharrell Williams 0.86 111017 Problem Ariana Grande 0.86 101556 Burnin' It Down Jason Aldean 0.85 94655 Paradise Coldplay 0.84 110642 Dirt Florida Georgia Line 0.84 108604 Treasure Bruno Mars 0.83 85629 American Kids Kenny Chesney 0.83 111946 Come With Me Now KONGOS 0.82