import pandas as pd
from datetime import timedelta
import numpy as np
# Load the weekly chart history (one row per song per chart week).
# The path is a raw string: in a normal literal the "\U" in "C:\Users" starts a
# unicode escape on Python 3 and the file fails to parse at all.
songs = pd.read_csv(r'C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')
# Chart dates are stored as month/day/four-digit-year text; parse them once so
# later cells can do date arithmetic.
songs['date'] = pd.to_datetime(songs['date'], format='%m/%d/%Y')
songs.head()
date | rank | uri | title | artist | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1960-01-02 | 1 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
1 | 1960-01-09 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
2 | 1960-01-16 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
3 | 1960-01-23 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
4 | 1960-01-30 | 3 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
# Build `top_rank`: one row per song (keyed by `uri`) holding its best
# (numerically lowest) chart rank and the most recent date on which it held
# that rank.  Rows follow the order in which each uri first appears in `songs`.
#
# This is done with grouped aggregations instead of appending one row per song
# inside a Python loop: DataFrame.append copies the whole frame on every call
# (quadratic in the number of songs) and no longer exists in pandas 2.x.
unique = songs.uri.unique()

# Best rank reached by each song.
peak_rank = songs.groupby('uri')['rank'].min()

# Keep only the chart weeks on which a song sat at its own best rank, then take
# the latest of those weeks.
at_peak = songs[songs['rank'] == songs['uri'].map(peak_rank)]
peak_date = at_peak.groupby('uri')['date'].max()

# Re-align both results to first-appearance order and assemble the table.
top_rank = pd.DataFrame(
    {
        'uri': unique,
        'rank': peak_rank.reindex(unique).values,
        'date': peak_date.reindex(unique).values,
    },
    columns=['uri', 'rank', 'date'],
)
top_rank.head()
uri | rank | date | |
---|---|---|---|
0 | 3hvakqVpwaz4L7zN5HfTCY | 1 | 1960-01-02 |
1 | 4f8hBeMXMvssn6HtFAtblo | 1 | 1960-01-16 |
2 | 1XRXD6RmgxtySaKbrVBfzk | 3 | 1960-01-09 |
3 | 22TgqnP9tyLU8i0eZnbuMR | 4 | 1960-01-02 |
4 | 6TdAcAgVw7Z8pzU2KswtvH | 3 | 1960-01-16 |
# Strip the per-week columns so that each song collapses to a single row of
# track-level attributes.
songs2 = songs.drop(['date', 'rank'], axis=1).drop_duplicates()

# Attach each song's best rank and the date it was last held.
top_songs = songs2.merge(top_rank, on='uri')

# 'time' is the gap between a song's peak date and the latest peak date in the
# table; 'days' is that same gap as a whole number of days.
top_songs['time'] = top_songs['date'].max() - top_songs['date']
top_songs['days'] = (top_songs['time'] / np.timedelta64(1, 'D')).astype(int)
top_songs.head()
uri | title | artist | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | rank | date | time | days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.540 | 0.026997 | 94.986 | 4 | 1 | 1960-01-02 | 19992 days | 19992 |
1 | 4f8hBeMXMvssn6HtFAtblo | El Paso | Marty Robbins | 0.54 | 0.653175 | 259.30621 | 0.452602 | 3.060000e-05 | 2 | 0.162871 | -9.710 | 0.030023 | 106.347 | 3 | 1 | 1960-01-16 | 19978 days | 19978 |
2 | 1XRXD6RmgxtySaKbrVBfzk | The Big Hurt | Miss Toni Fisher | 0.00 | 0.604731 | 134.09288 | 0.522267 | 1.030000e-05 | 0 | 0.074466 | -8.812 | 0.030300 | 123.961 | 4 | 3 | 1960-01-09 | 19985 days | 19985 |
3 | 22TgqnP9tyLU8i0eZnbuMR | It's Time To Cry | Paul Anka | 0.00 | 0.404905 | 144.27955 | 0.375461 | 2.610000e-06 | 3 | 0.381637 | -13.209 | 0.031920 | 111.225 | 3 | 4 | 1960-01-02 | 19992 days | 19992 |
4 | 6TdAcAgVw7Z8pzU2KswtvH | Way Down Yonder in New Orleans | Freddie Cannon | 0.15 | 0.451357 | 151.85288 | 0.836398 | 1.130000e-06 | 8 | 0.107817 | -5.761 | 0.050233 | 142.824 | 4 | 3 | 1960-01-16 | 19978 days | 19978 |
import matplotlib.pyplot as plt
# Columns of `top_songs` explored in the plots below and used as regression
# inputs.
features = [
    'danceability', 'duration', 'energy', 'instrumentalness', 'key',
    'liveness', 'loudness', 'tempo', 'time_signature', 'days',
]
# Pairwise scatter plots of every feature against every other feature.
# Newer pandas exposes scatter_matrix only under `pd.plotting`; older releases
# only have it at the top level, so fall back to `pd` itself when the
# `plotting` namespace is missing.
getattr(pd, 'plotting', pd).scatter_matrix(top_songs[features], figsize=(20, 20))
array([[<matplotlib.axes.AxesSubplot object at 0x00000000098044E0>, <matplotlib.axes.AxesSubplot object at 0x000000000C609D68>, <matplotlib.axes.AxesSubplot object at 0x000000000E9E5208>, <matplotlib.axes.AxesSubplot object at 0x0000000011EB12B0>, <matplotlib.axes.AxesSubplot object at 0x000000000C40B390>, <matplotlib.axes.AxesSubplot object at 0x0000000011CF11D0>, <matplotlib.axes.AxesSubplot object at 0x0000000011DAEB38>, <matplotlib.axes.AxesSubplot object at 0x0000000011EFE278>, <matplotlib.axes.AxesSubplot object at 0x0000000011FF7A58>, <matplotlib.axes.AxesSubplot object at 0x00000000120587B8>], [<matplotlib.axes.AxesSubplot object at 0x00000000121630B8>, <matplotlib.axes.AxesSubplot object at 0x00000000121955F8>, <matplotlib.axes.AxesSubplot object at 0x00000000129BF4A8>, <matplotlib.axes.AxesSubplot object at 0x0000000012A3AC88>, <matplotlib.axes.AxesSubplot object at 0x0000000012B28F28>, <matplotlib.axes.AxesSubplot object at 0x0000000012C3C6D8>, <matplotlib.axes.AxesSubplot object at 0x0000000012C96208>, <matplotlib.axes.AxesSubplot object at 0x0000000012D9EE48>, <matplotlib.axes.AxesSubplot object at 0x0000000012E1F5C0>, <matplotlib.axes.AxesSubplot object at 0x0000000012F9B128>], [<matplotlib.axes.AxesSubplot object at 0x0000000013054908>, <matplotlib.axes.AxesSubplot object at 0x00000000130F7978>, <matplotlib.axes.AxesSubplot object at 0x000000001320B128>, <matplotlib.axes.AxesSubplot object at 0x0000000012A634A8>, <matplotlib.axes.AxesSubplot object at 0x0000000012371A58>, <matplotlib.axes.AxesSubplot object at 0x0000000011A43278>, <matplotlib.axes.AxesSubplot object at 0x0000000011AEB7B8>, <matplotlib.axes.AxesSubplot object at 0x0000000011A69748>, <matplotlib.axes.AxesSubplot object at 0x000000000F3A7D68>, <matplotlib.axes.AxesSubplot object at 0x000000000BA5E9E8>], [<matplotlib.axes.AxesSubplot object at 0x000000000BA03D68>, <matplotlib.axes.AxesSubplot object at 0x000000000F211A58>, <matplotlib.axes.AxesSubplot object at 0x000000001197C278>, 
<matplotlib.axes.AxesSubplot object at 0x000000000BBEF9E8>, <matplotlib.axes.AxesSubplot object at 0x000000000C30D978>, <matplotlib.axes.AxesSubplot object at 0x000000000C37E0B8>, <matplotlib.axes.AxesSubplot object at 0x000000000EB27CF8>, <matplotlib.axes.AxesSubplot object at 0x000000000E4F7EF0>, <matplotlib.axes.AxesSubplot object at 0x000000000E6C7D68>, <matplotlib.axes.AxesSubplot object at 0x000000000E7D0588>], [<matplotlib.axes.AxesSubplot object at 0x000000000C02ACF8>, <matplotlib.axes.AxesSubplot object at 0x000000000C132C88>, <matplotlib.axes.AxesSubplot object at 0x000000000ECF53C8>, <matplotlib.axes.AxesSubplot object at 0x000000000EDBD048>, <matplotlib.axes.AxesSubplot object at 0x000000000ED12A58>, <matplotlib.axes.AxesSubplot object at 0x000000000EFBA0B8>, <matplotlib.axes.AxesSubplot object at 0x000000000F0B3898>, <matplotlib.axes.AxesSubplot object at 0x000000001331C048>, <matplotlib.axes.AxesSubplot object at 0x000000001338EF98>, <matplotlib.axes.AxesSubplot object at 0x000000000C336F98>], [<matplotlib.axes.AxesSubplot object at 0x0000000013510CF8>, <matplotlib.axes.AxesSubplot object at 0x000000001354E390>, <matplotlib.axes.AxesSubplot object at 0x00000000136FEC88>, <matplotlib.axes.AxesSubplot object at 0x000000000D18C4A8>, <matplotlib.axes.AxesSubplot object at 0x000000000D271748>, <matplotlib.axes.AxesSubplot object at 0x000000000D3B8EB8>, <matplotlib.axes.AxesSubplot object at 0x000000000D4129E8>, <matplotlib.axes.AxesSubplot object at 0x000000000D523668>, <matplotlib.axes.AxesSubplot object at 0x000000000D559DA0>, <matplotlib.axes.AxesSubplot object at 0x000000000D723908>], [<matplotlib.axes.AxesSubplot object at 0x000000000D7A7128>, <matplotlib.axes.AxesSubplot object at 0x000000000D88F198>, <matplotlib.axes.AxesSubplot object at 0x000000000D994908>, <matplotlib.axes.AxesSubplot object at 0x000000000DA2F438>, <matplotlib.axes.AxesSubplot object at 0x000000000DB020B8>, <matplotlib.axes.AxesSubplot object at 0x000000000DBBA7F0>, 
<matplotlib.axes.AxesSubplot object at 0x000000000DCEF358>, <matplotlib.axes.AxesSubplot object at 0x000000000DD64B38>, <matplotlib.axes.AxesSubplot object at 0x000000000DE6FBA8>, <matplotlib.axes.AxesSubplot object at 0x000000000DF83358>], [<matplotlib.axes.AxesSubplot object at 0x000000000DFCDE48>, <matplotlib.axes.AxesSubplot object at 0x000000000E0E0AC8>, <matplotlib.axes.AxesSubplot object at 0x000000001046D240>, <matplotlib.axes.AxesSubplot object at 0x0000000010553D68>, <matplotlib.axes.AxesSubplot object at 0x000000001065D588>, <matplotlib.axes.AxesSubplot object at 0x00000000106ED278>, <matplotlib.axes.AxesSubplot object at 0x00000000108356D8>, <matplotlib.axes.AxesSubplot object at 0x00000000107B1400>, <matplotlib.axes.AxesSubplot object at 0x00000000109E2240>, <matplotlib.axes.AxesSubplot object at 0x0000000010A9AB00>], [<matplotlib.axes.AxesSubplot object at 0x0000000010B4E2E8>, <matplotlib.axes.AxesSubplot object at 0x0000000010C49AC8>, <matplotlib.axes.AxesSubplot object at 0x0000000010D1D128>, <matplotlib.axes.AxesSubplot object at 0x0000000010DE0518>, <matplotlib.axes.AxesSubplot object at 0x0000000010D9D128>, <matplotlib.axes.AxesSubplot object at 0x0000000010FD00F0>, <matplotlib.axes.AxesSubplot object at 0x000000001108B9B0>, <matplotlib.axes.AxesSubplot object at 0x000000001113A198>, <matplotlib.axes.AxesSubplot object at 0x0000000011245978>, <matplotlib.axes.AxesSubplot object at 0x00000000112EAF98>], [<matplotlib.axes.AxesSubplot object at 0x00000000137E33C8>, <matplotlib.axes.AxesSubplot object at 0x0000000011377470>, <matplotlib.axes.AxesSubplot object at 0x00000000139C3F60>, <matplotlib.axes.AxesSubplot object at 0x0000000013ACD860>, <matplotlib.axes.AxesSubplot object at 0x0000000013B82048>, <matplotlib.axes.AxesSubplot object at 0x0000000013C37828>, <matplotlib.axes.AxesSubplot object at 0x0000000013D5EE48>, <matplotlib.axes.AxesSubplot object at 0x0000000013E30278>, <matplotlib.axes.AxesSubplot object at 0x0000000013D9FE48>, 
<matplotlib.axes.AxesSubplot object at 0x0000000013FE1E10>]], dtype=object)
# One histogram per feature, laid out on a two-column grid.
plt.figure(figsize=(20, 20))
# Enough rows to hold every feature (5 rows for the 10 features used here).
n_rows = (len(features) + 1) // 2
for position, feature in enumerate(features, start=1):
    # `subplot` must be reached through pyplot: the bare name is only defined
    # when the session was started with %pylab.
    ax1 = plt.subplot(n_rows, 2, position)
    ax1.hist(top_songs[feature])
    ax1.set_title(str(feature), fontsize=15)
# Distribution of the regression target.
# NOTE(review): no new figure is created here, so when this runs in the same
# figure as the loop above the histogram is drawn on the current (last) axes.
# This was presumably a separate notebook cell - confirm.
plt.hist(top_songs['popularity'])
plt.suptitle('Top Popularity')
<matplotlib.text.Text at 0x13862048>
# Chart weeks on which the song was ranked number one.
the_1s = songs[songs['rank'] == 1]
# DataFrame.plot opens its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure behind and the
# requested size is never applied.  Hand the size to `plot` directly instead.
the_1s.plot(x='date', y='popularity', figsize=(20, 10))
plt.suptitle('Current Popularity Ratings for #1 songs 1960-2014', fontsize=25)
<matplotlib.text.Text at 0x19d177f0>
from sklearn import linear_model

# train_test_split lives in sklearn.model_selection in current scikit-learn;
# sklearn.cross_validation is the older home and has since been removed.
# Try the current location first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Ordinary least-squares regression of popularity on every column in `features`.
clf = linear_model.LinearRegression()
X = top_songs[features].values
y = top_songs['popularity'].values

# Random train/test split with the library defaults; no seed is set, so the
# scores change from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
model = clf.fit(xtrain, ytrain)
# R^2 on the data the model was fitted to.
model.score(xtrain, ytrain)
0.3330785279427978
# R^2 on the held-out rows, for comparison with the training score.
model.score(X=xtest, y=ytest)
0.32410511104218653
# Fitted slope for each input column, listed in the same order as `features`.
pd.DataFrame(
    {'Variable': features, 'Coefficient': model.coef_},
    columns=['Variable', 'Coefficient'],
)
Variable | Coefficient | |
---|---|---|
0 | danceability | 0.009144 |
1 | duration | 0.000103 |
2 | energy | -0.057294 |
3 | instrumentalness | -0.081905 |
4 | key | -0.001232 |
5 | liveness | -0.048610 |
6 | loudness | 0.007830 |
7 | tempo | -0.000059 |
8 | time_signature | 0.006686 |
9 | days | -0.000018 |
from sklearn import feature_selection

# Univariate F-test of each column of X against y.  `f` is the pair
# (F statistics, p-values), both ordered like the columns of X.
f = feature_selection.f_regression(X, y)
pd.DataFrame(
    {'Variable': features, 'P-Value': f[1]},
    columns=['Variable', 'P-Value'],
)
Variable | P-Value | |
---|---|---|
0 | danceability | 2.247256e-49 |
1 | duration | 9.562488e-142 |
2 | energy | 2.802139e-114 |
3 | instrumentalness | 2.614871e-42 |
4 | key | 9.712986e-01 |
5 | liveness | 1.640046e-15 |
6 | loudness | 1.049723e-303 |
7 | tempo | 7.171618e-01 |
8 | time_signature | 5.298900e-21 |
9 | days | 0.000000e+00 |
# Second model: the same regression without 'key', 'tempo' and 'days'.
new_features = [
    'danceability', 'duration', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'time_signature',
]
X = top_songs[new_features].values
y = top_songs['popularity'].values

# Fresh random split; `clf` is refitted in place and `model` refers to it.
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
model = clf.fit(xtrain, ytrain)
# R^2 on the training rows.
model.score(xtrain, ytrain)
0.19112449741141424
# R^2 on the held-out rows for the reduced feature set.
model.score(X=xtest, y=ytest)
0.15693599117286616
# Fitted slope for each column of the reduced feature set, in `new_features` order.
pd.DataFrame(
    {'Variable': new_features, 'Coefficient': model.coef_},
    columns=['Variable', 'Coefficient'],
)
Variable | Coefficient | |
---|---|---|
0 | danceability | 0.103995 |
1 | duration | 0.000682 |
2 | energy | -0.098796 |
3 | instrumentalness | -0.133831 |
4 | liveness | -0.069326 |
5 | loudness | 0.021359 |
6 | time_signature | 0.016167 |
# P-values for the reduced feature set.
# `f` was computed on the earlier, wider matrix, so pairing its p-values with
# `new_features` by position labels them with the wrong columns (and zip hides
# the length mismatch by truncating).  Recompute the test on the matrix that is
# in X now, so names and p-values are aligned by construction; building the
# frame from columns also raises if the lengths ever disagree.
p_values = feature_selection.f_regression(X, y)[1]
pd.DataFrame(
    {'Variable': new_features, 'P-Value': p_values},
    columns=['Variable', 'P-Value'],
)
Variable | P-Value | |
---|---|---|
0 | danceability | 2.247256e-49 |
1 | duration | 9.562488e-142 |
2 | energy | 2.802139e-114 |
3 | instrumentalness | 2.614871e-42 |
4 | liveness | 1.640046e-15 |
5 | loudness | 1.049723e-303 |
6 | time_signature | 5.298900e-21 |
# Scatter of every candidate feature against popularity, on a two-column grid.
features = [
    'danceability', 'duration', 'energy', 'instrumentalness', 'key',
    'liveness', 'loudness', 'tempo', 'time_signature', 'days',
]
plt.figure(figsize=(20, 20))
# Enough rows to hold every feature (5 rows for the 10 features used here).
n_rows = (len(features) + 1) // 2
for position, feature in enumerate(features, start=1):
    # `subplot` must be reached through pyplot: the bare name is only defined
    # when the session was started with %pylab.
    ax1 = plt.subplot(n_rows, 2, position)
    ax1.scatter(top_songs[feature], top_songs['popularity'])
    ax1.set_title(str(feature), fontsize=15)
# Third model: popularity regressed on just 'days' and 'loudness'.
newer_features = ['days', 'loudness']
X = top_songs[newer_features].values
y = top_songs['popularity'].values

# Fresh random split; `clf` is refitted in place and `model` refers to it.
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
model = clf.fit(xtrain, ytrain)
# R^2 on the training rows.
model.score(xtrain, ytrain)
0.32675146860557136
# R^2 on the held-out rows for the two-feature model.
model.score(X=xtest, y=ytest)
0.31576555743520918
# Fitted slope for each of the two inputs, in `newer_features` order.
pd.DataFrame(
    {'Variable': newer_features, 'Coefficient': model.coef_},
    columns=['Variable', 'Coefficient'],
)
Variable | Coefficient | |
---|---|---|
0 | days | -0.000019 |
1 | loudness | 0.005430 |
# P-values for the two-feature model.
# `f` was computed on an earlier, wider matrix, so pairing its p-values with
# `newer_features` by position labels them with the wrong columns (and zip
# hides the length mismatch by truncating).  Recompute the test on the matrix
# that is in X now, so names and p-values are aligned by construction; building
# the frame from columns also raises if the lengths ever disagree.
p_values = feature_selection.f_regression(X, y)[1]
pd.DataFrame(
    {'Variable': newer_features, 'P-Value': p_values},
    columns=['Variable', 'P-Value'],
)
Variable | P-Value | |
---|---|---|
0 | days | 0.000000e+00 |
1 | loudness | 1.049723e-303 |