from bs4 import BeautifulSoup
import urllib2
from datetime import datetime
import pandas as pd
import json
# Scrape the weekly Billboard Hot 100 chart pages for 1960-2014 and record,
# for each of the first 40 chart positions, the id parsed out of the page
# (presumably a Spotify track id -- confirm against the page markup).
data = pd.DataFrame(columns=('date','rank','uri'))
data2 = pd.DataFrame(columns=('title','artist','popularity','uri'))

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending to a DataFrame row by row copies the whole frame every time.
chart_rows = []
for year in range(1960, 2015):
    for month in range(1, 13):
        month_text = '%02d' % month
        for day in range(1, 32):
            day_text = '%02d' % day
            date = month_text + '-' + day_text + '-' + str(year)
            try:
                chart_day = datetime.strptime(date, '%m-%d-%Y')
            except ValueError:
                # Day number does not exist in this month (e.g. 02-30).
                continue
            # weekday() == 5 is Saturday; only those dates are requested.
            if chart_day.weekday() != 5:
                continue
            try:
                link = urllib2.urlopen(
                    'http://www.billboard.com/charts/%s-%s-%s/hot-100'
                    % (year, month_text, day_text))
                try:
                    soup = BeautifulSoup(link)
                finally:
                    # Release the HTTP connection even if parsing fails.
                    link.close()
                anchors = soup.findAll('a')
                # NOTE(review): anchor index 58 and the split positions below
                # are hard-coded to one page layout; presumably this anchor
                # carries a comma-separated id list -- verify.
                attr_parts = str(anchors[58]).split('"')
                track_ids = attr_parts[3].split(':')[2].split(',')
                for position in range(40):
                    chart_rows.append({
                        'date': date,
                        'rank': str(position + 1),
                        'uri': str(track_ids[position]),
                    })
            except (TypeError, ValueError):
                # Best effort, as before: skip a chart that cannot be parsed.
                continue
data = pd.DataFrame(chart_rows, columns=['date', 'rank', 'uri'])
# Look up title, first listed artist and popularity for every distinct id in
# data['uri'] through the Spotify lookup endpoint.
spoturl = 'http://ws.spotify.com/lookup/1/.json?uri=spotify:track:'
uri_list = data['uri'].unique()

# Collect plain dicts and build the DataFrame once; appending to a DataFrame
# inside the loop copies the whole frame on every iteration.
track_rows = []
for i in uri_list:
    url = spoturl + str(i)
    response = urllib2.urlopen(url)
    try:
        json_object = json.load(response)
    finally:
        # Do not leave one open connection behind per track.
        response.close()
    track = json_object['track']
    track_rows.append({
        'title': track['name'],
        'artist': track['artists'][0]['name'],
        'popularity': track['popularity'],
        'uri': i,
    })
data2 = pd.DataFrame(track_rows, columns=['title', 'artist', 'popularity', 'uri'])
# Join chart positions with the per-track metadata and give the numeric
# columns numeric dtypes.
project_data = pd.merge(data, data2, on='uri')
project_data['rank'] = project_data['rank'].astype('int')
project_data['popularity'] = project_data['popularity'].astype('float')
# DataFrame.sort returns a new frame, so the result has to be kept; the bare
# call used previously had no effect. 'date' holds MM-DD-YYYY strings, which
# would order by month first, so a temporary parsed column is used as the key.
project_data['_chart_day'] = pd.to_datetime(project_data['date'], format='%m-%d-%Y')
project_data = project_data.sort(['_chart_day', 'rank']).drop('_chart_day', axis=1)
project_data.head()
# Echo Nest data: fetch the audio_summary of every id in uri_list and keep the
# ten attributes named below, plus the id itself.
audio_features = ('danceability','duration','energy','instrumentalness','key','liveness',
                  'loudness','speechiness','tempo','time_signature')
# NOTE(review): 'api_key=API=json' looks like a redacted key that swallowed a
# '&format' parameter -- confirm the intended query string before running.
echourl = 'http://developer.echonest.com/api/v4/track/profile?api_key=API=json&id=spotify:track:'
echourl2 ='&bucket=audio_summary'

# Rows are gathered in a list and converted once; appending to a DataFrame in
# the loop copies the whole frame on every iteration.
feature_rows = []
for i in uri_list:
    uri = i
    url = echourl+uri+echourl2
    response = urllib2.urlopen(url)
    try:
        json_object = json.load(response)
    finally:
        # Do not leave one open connection behind per track.
        response.close()
    try:
        summary = json_object['response']['track']['audio_summary']
        dic3 = dict((name, summary[name]) for name in audio_features)
    except KeyError:
        # Best effort, as before: a reply missing any expected key is skipped.
        continue
    dic3['uri'] = i
    feature_rows.append(dic3)
data3 = pd.DataFrame(feature_rows, columns=list(audio_features) + ['uri'])
# Attach the audio features to the chart/metadata table and write the result
# to disk.
song_data = pd.merge(project_data, data3, on='uri')
# Raw string: the Windows path is full of backslashes. Without the r prefix
# the value only survives because none of the sequences is a recognised
# escape in Python 2, and '\U' is a syntax error under Python 3.
song_data.to_csv(r'C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')
import pandas as pd
import matplotlib.pyplot as plt
import numpy
import seaborn as sns
# Load the saved song table. Raw string: the Windows path is full of
# backslashes, and without the r prefix '\U' is a syntax error under Python 3
# (Python 2 merely happens to leave these sequences untouched).
songs = pd.read_csv(r'C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv')
songs.head()
date | rank | uri | title | artist | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1/2/1960 | 1 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
1 | 1/9/1960 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
2 | 1/16/1960 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
3 | 1/23/1960 | 2 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
4 | 1/30/1960 | 3 | 3hvakqVpwaz4L7zN5HfTCY | Why | Frankie Avalon | 0.27 | 0.422345 | 155.23955 | 0.409041 | 5.260000e-09 | 5 | 0.112646 | -8.54 | 0.026997 | 94.986 | 4 |
# Replace the month/day/year text in the date column with parsed datetimes,
# then show summary statistics for the table.
parsed_dates = pd.to_datetime(songs['date'], format='%m/%d/%Y')
songs['date'] = parsed_dates
songs.describe()
rank | popularity | danceability | duration | energy | instrumentalness | key | liveness | loudness | speechiness | tempo | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 114422.000000 | 114422.000000 | 114422.000000 | 114422.000000 | 114422.000000 | 1.144220e+05 | 114422.000000 | 114422.000000 | 114422.000000 | 113607.000000 | 114422.000000 | 114422.000000 |
mean | 20.497098 | 0.399589 | 0.611887 | 231.943356 | 0.630440 | 3.373952e-02 | 5.228592 | 0.190379 | -8.468298 | 0.066200 | 120.196028 | 3.951277 |
std | 11.545553 | 0.214628 | 0.150314 | 61.443767 | 0.194966 | 1.388380e-01 | 3.560654 | 0.163452 | 3.525335 | 0.070969 | 27.585767 | 0.273730 |
min | 1.000000 | 0.000000 | 0.068750 | 61.266210 | 0.020085 | 4.090000e-15 | 0.000000 | 0.013549 | -41.613000 | 0.022336 | 41.409000 | 1.000000 |
25% | 10.000000 | 0.250000 | 0.514386 | 192.746210 | 0.490307 | 9.110000e-08 | 2.000000 | 0.088227 | -10.689000 | 0.031694 | 99.948750 | 4.000000 |
50% | 20.000000 | 0.420000 | 0.623672 | 229.559550 | 0.648759 | 8.550000e-06 | 5.000000 | 0.128692 | -7.996000 | 0.040176 | 118.972500 | 4.000000 |
75% | 31.000000 | 0.560000 | 0.717292 | 263.372880 | 0.788447 | 7.380170e-04 | 8.000000 | 0.243539 | -5.803000 | 0.062878 | 134.682500 | 4.000000 |
max | 40.000000 | 1.000000 | 0.984268 | 1367.092880 | 0.995899 | 9.702722e-01 | 11.000000 | 0.990848 | -0.073000 | 0.891714 | 217.748000 | 5.000000 |
# Four views of the popularity column on a 2x2 grid: histogram, boxplot,
# violin plot and kernel density estimate.
fig = plt.figure(figsize=(30, 25))

ax1 = fig.add_subplot(2, 2, 1)
ax1.hist(songs.popularity)
ax1.set_xlabel('Popularity', fontsize=20)
ax1.set_title('Histogram of Current Popularity Ratings', fontsize=25)

ax2 = fig.add_subplot(2, 2, 2)
ax2.set_ylim(-.1, 1.1)
ax2.set_ylabel('Popularity', fontsize=20)
ax2.boxplot(songs.popularity)
ax2.set_title('Boxplot of Current Popularity Ratings', fontsize=25)

# The target axes are passed explicitly so these panels no longer depend on
# whichever axes happens to be "current" when the plotting call runs.
ax3 = fig.add_subplot(2, 2, 3)
sns.violinplot(songs.popularity, ax=ax3)
ax3.set_ylabel('Popularity', fontsize=20)
ax3.set_title('Violin Plot of Current Popularity Ratings', fontsize=25)

ax4 = fig.add_subplot(2, 2, 4)
songs.popularity.plot(kind='kde', ax=ax4)
ax4.set_title('KDE Plot of Current Popularity Ratings', fontsize=25)
# On a density plot popularity runs along x; the y-axis is the estimated
# density, so it is no longer labelled 'Popularity'.
ax4.set_xlabel('Popularity', fontsize=20)
ax4.set_ylabel('Density', fontsize=20)

fig.suptitle('Distributions of Current Popularity Ratings', fontsize=30)
<matplotlib.text.Text at 0x315cbd68>
# Scatterplot matrix over a handful of the numeric song attributes.
matrix_columns = ['popularity', 'danceability', 'duration', 'tempo', 'loudness']
pd.scatter_matrix(songs[matrix_columns], figsize=(20, 20))
plt.suptitle('Scatterplot Matrix of Song Attributes')
<matplotlib.text.Text at 0x3817c470>
# Which song is behind the extreme duration value? Select title/artist for
# rows whose duration exceeds 1000 and collapse repeated rows.
is_very_long = songs['duration'] > 1000
big = songs.loc[is_very_long, ['title', 'artist']]
big.drop_duplicates()
title | artist | |
---|---|---|
32217 | Autobahn - 2009 Remastered Version | Kraftwerk |
# Which songs currently score above .95 on popularity?
# Recent songs appear to have the highest popularity scores in the set.
is_top_rated = songs['popularity'] > .95
hits = songs.loc[is_top_rated, ['title', 'artist', 'popularity']]
hits.drop_duplicates()
title | artist | popularity | |
---|---|---|---|
113442 | All of Me | John Legend | 0.96 |
113739 | Summer | Calvin Harris | 0.97 |
113807 | A Sky Full Of Stars | Coldplay | 0.96 |
114084 | Shower | Becky G | 0.97 |
114150 | All About That Bass | Meghan Trainor | 1.00 |
114231 | Don't | Ed Sheeran | 0.96 |
# Which row(s) hold the maximum value of the loudness column?
peak_loudness = songs['loudness'].max()
ouch = songs.loc[songs['loudness'] == peak_loudness, ['title', 'artist', 'loudness']]
ouch
title | artist | loudness | |
---|---|---|---|
10670 | Run, Run, Run | The Gestures | -0.073 |
# One sub-table per chart position of interest (ranks 1, 10, 20 and 40).
songs_1, songs_10, songs_20, songs_40 = (
    songs[songs['rank'] == position] for position in (1, 10, 20, 40)
)
# Popularity against chart date for the rows that held rank 1.
top_hits_ax = songs_1.plot(y='popularity', x='date', figsize=(25, 8))
top_hits_ax.get_figure().suptitle('Timeseries of Popularity for #1 Songs')
top_hits_ax.set_ylabel('Current Popularity')
<matplotlib.text.Text at 0x434683c8>
# Popularity against chart date for the rank-1, rank-10 and rank-40 rows,
# stacked as three panels of a single figure.
fig = plt.figure(figsize=(30, 25))
fig.suptitle('Timeseries Comparison for #1s, #10s, and #40s')

# DataFrame.plot opens a new figure of its own unless it is given an Axes.
# Each call is therefore handed the subplot it belongs to; otherwise the three
# panels stay empty and the lines end up in separate figures.
ax1 = fig.add_subplot(3, 1, 1)
ax1.set_title('#1s', fontsize=20)
ax1.set_ylabel('Popularity')
ax1 = songs_1.plot(x='date', y='popularity', ax=ax1)

ax2 = fig.add_subplot(3, 1, 2)
ax2.set_title('#10s', fontsize=20)
ax2.set_ylabel('Popularity')
ax2 = songs_10.plot(x='date', y='popularity', ax=ax2)

ax3 = fig.add_subplot(3, 1, 3)
ax3.set_title('#40s', fontsize=20)
ax3.set_ylabel('Popularity')
ax3 = songs_40.plot(x='date', y='popularity', ax=ax3)