Disruptor---Linking People & Technology.
Charles---yes we can.
Send all your data over. We'll make killer plots and let you visualize all your data very efficiently.
anthony@plot.ly
-AWS
Sent from LinkedIn for iPhone http://lnkd.in/ios
Hello Anthony
I am doing research on crowdfunding websites and I have some data to analyze. Could you help me use plot.ly to extract the data I need for my thesis?
Hi Charles,
Thanks for the add! I wanted to put Plot.ly on your radar: a new data analysis & visualization platform allowing all engineers & management to collaborate & communicate efficiently. It's a complementary product to MATLAB, which I know is widely used at Bombardier.
We just closed a deal with SpaceX, where all their flight data is being displayed via Plotly, and we are really interested in growing every vertical within Aerospace/Engineering.
Is this of interest? Would love to get your eyes on it.
AWS
Hello Anthony
Here is my data,
I am trying to identify what makes the difference between a successful project on Kickstarter and the rest (or the ones that almost succeeded).
A successful project is
An unsuccessful project is
I need to see if there is a correlation and significance between the data sets and the global average.
What makes a difference?
(PS: I was not able to identify the day of the year as a number in Excel because there are many years.)
7. Does the day of the week make a difference?
8. Does the number of backers make a difference?
9. Does the number of comments make a difference?
10. Does the number of comments per backer make a difference?
11. Does the number of revisions make a difference?
12. Does the project duration make a difference?
13. Does the project location make a difference?
14. Does the presence of a photo make a difference?
15. Does the presence of a video make a difference?
16. Does the number of pledges make a difference?
17. How about the value of each pledge ?
I’ll wait to hear from you, thanks a lot! Best regards
Charles
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
import pandas as pd
import numpy as np
# Load three worksheets from the same Excel workbook into DataFrames.
# The second argument selects the worksheet; judging by the sheet names,
# df_S / df_U hold pre-filtered successful / unsuccessful projects
# (NOTE(review): the misspellings presumably mirror the workbook -- confirm).
df_all = pd.read_excel("Kickstarter Data.xls", "All projects")
df_S = pd.read_excel("Kickstarter Data.xls", "Sucessful +5K + 50% +150 back")
df_U = pd.read_excel("Kickstarter Data.xls", "Unsucessfull +5K +100$ +10 back")
# Notebook-style check: evaluates to the (rows, columns) shape of each frame.
df_all.shape, df_S.shape, df_U.shape
((10159, 178), (1252, 178), (3728, 178))
Colors and grid style
# Trace colors: col_S is used for the "successful" series and col_U for the
# "unsuccessful" series in the figures below; col_diff is defined alongside
# them but is not referenced in the visible part of the file.
col_S, col_U, col_diff = '#99FF00', '#CC0000', '#0099ff'

# Axis options shared by every axis that draws grid lines; passed as the
# first positional argument to XAxis / YAxis.
grid = {
    'showgrid': True,
    'gridcolor': '#FFFFFF',
    'gridwidth': 1.5,
}

# Figure width (in the units plotly's layout expects) and plot background.
width, plot_bgcolor = 650, '#EFECEA'
Colorbrewer color scale to plotly color scale function
import colorbrewer as cb
def convert_cb_to_scl(cb_color, N=5):
    '''
    Turn a colorbrewer palette into a plotly-style color scale.

    cb_color (positional): colorbrewer color dictionary; cb_color[N] must be
        a sequence of N color tuples
    N (keyword): number of colors in color scale

    Returns a list of [level, 'rgb(r,g,b)'] pairs, where the levels are N
    evenly spaced values running from 0 to 1.
    '''
    palette = cb_color[N]
    stops = np.linspace(0, 1, N).tolist()
    scale = []
    for stop, rgb in zip(stops, palette):
        # Each tuple becomes the 'rgb(r,g,b)' string form, without spaces.
        channels = ','.join(str(channel) for channel in rgb)
        scale.append([stop, "rgb(" + channels + ")"])
    return scale
# Length (number of characters) of each project title, as NumPy arrays:
# S from df_S, U from df_U.
S = df_S['Project Title'].apply(len).values
U = df_U['Project Title'].apply(len).values
def stats_text(X):
    """Return one identical hover label per element of X.

    The label reports the mean and the standard deviation of X (NumPy's
    default ``np.std``), each formatted with two decimals in a field of
    width five.

    X: sized sequence or array of numbers.

    Returns a list of ``len(X)`` equal strings; an empty list for empty X.
    """
    n_points = len(X)
    if n_points == 0:
        # Nothing to label; also avoids NumPy's empty-input RuntimeWarning.
        return []
    # The label does not depend on the element, so format it only once.
    label = "<b>Mean</b>: {:5.2f}<br><b>Stand. dev.:</b> {:5.2f}".format(
        np.mean(X), np.std(X))
    return [label] * n_points
# Settings shared by the two overlaid title-length histograms.
histnorm = 'percent'
opacity = 0.5
height = 500

# One histogram per group, each carrying its own mean/std hover text.
trace1 = Histogram(
    x=S, name='Succesful projects', histnorm=histnorm,
    marker=Marker(color=col_S), opacity=opacity, text=stats_text(S))
trace2 = Histogram(
    x=U, name='Unsuccesful projects', histnorm=histnorm,
    marker=Marker(color=col_U), opacity=opacity, text=stats_text(U))
data = Data([trace1, trace2])

# Only the y-axis receives the shared grid options in this figure.
layout = Layout(
    title='Does the length of the title (# of characters) make a difference?',
    barmode='overlay',
    xaxis=XAxis(title='Number of characters in title'),
    yaxis=YAxis(grid, title='Percentage of S/U projects'),
    legend=Legend(x=0, y=1, bgcolor="rgba(0,0,0,0)"),
    autosize=False, width=width, height=height,
    plot_bgcolor=plot_bgcolor)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='cbombardier-1')
Some info about regular expression.
import re
from collections import Counter
# Tokens removed from the keyword counts before plotting.
# NOTE(review): entries such as '39' and 's' are presumably residue of
# apostrophes / HTML entities in the titles -- confirm against the data.
rejects = [
    'the', 'for', 'a', 'your', 'and', 'canceled', '39', 'to', 's', 'with',
    'in', 'of', 'on', 'an', 'by', 'you', 'that', 'it', '4', '5', 'way',
    'from',
]
# Compiled once instead of on every call.  A word is one word character,
# optionally followed by word characters / hyphens and ending on a word
# character (so 'e-reader' is one word and a trailing '-' is not included).
# Raw string: '\w' in a plain literal is an invalid escape sequence.
_WORD_PATTERN = re.compile(r'\w(?:[-\w]*\w)?')


def word_re(text):
    """Split text into lowercase ASCII words.

    Characters that cannot be encoded as ASCII are replaced by '?', which
    the pattern does not match, so they act as word separators.

    text: text string to tokenize.

    Returns the list of words in order of appearance.
    """
    # Round-trip through ASCII but hand a *text* string to the regex: on
    # Python 3 the encoded bytes cannot be searched with a str pattern.
    ascii_lower = text.encode('ascii', 'replace').decode('ascii').lower()
    return _WORD_PATTERN.findall(ascii_lower)
def get_count(df):
    """Count how often each word occurs across all project titles of df.

    df: DataFrame with a 'Project Title' column of strings.

    Returns a pandas Series indexed by word whose values are occurrence
    counts, ordered from most to least common; an empty Series when the
    titles contain no words.
    """
    text = ' '.join(df['Project Title'].tolist())  # join all titles
    ranked = Counter(word_re(text)).most_common()  # [(word, count), ...]
    if not ranked:
        return pd.Series([], dtype='int64')
    # Unpack explicitly: on Python 3 zip() returns an iterator, which cannot
    # be sliced the way a Python 2 list could.
    words, counts = zip(*ranked)
    return pd.Series(list(counts), index=list(words))
cutoff = 30


def _sort_frame(frame, column, ascending):
    """Sort frame by column with whichever sort method this pandas offers."""
    # DataFrame.sort was superseded by sort_values and later removed; older
    # releases only provide .sort.
    sorter = getattr(frame, 'sort_values', None)
    if sorter is None:
        sorter = frame.sort
    return sorter(column, ascending=ascending)


def to_plot(df0, df1, cutoff=cutoff):
    """Build the keyword comparison table for two sets of projects.

    df0, df1: DataFrames accepted by get_count; their word counts end up
        in columns 0 and 1 respectively.
    cutoff: number of most frequent words (by combined count) to keep.

    Returns a DataFrame indexed by word, sorted by 'max' in ascending
    order, with the columns:
        0, 1         word counts in df0 / df1 (0 where the word is absent)
        'total'      column 0 + column 1
        'max'        the larger of column 0 and column 1
        'ratio'      (column 0 - column 1) / 'total'
        'rel. perc.' 'ratio' * 100
        'rank'       rank of 'rel. perc.', highest value ranked 1
    """
    Df = pd.concat([get_count(df0), get_count(df1)], axis=1)
    # Drop only the stop words actually present: drop() raises when asked to
    # remove a label that is not in the index.
    Df = Df.drop([word for word in rejects if word in Df.index])
    Df = Df.fillna(0)  # a word missing from one set counts as 0 there
    # Columns are selected by label: .ix silently switched between label and
    # positional lookup here and no longer exists in current pandas.
    Df['total'] = Df[[0, 1]].sum(axis=1)
    # Keep the `cutoff` most frequent words; copy so that the columns added
    # below are not written into a slice of Df.
    df = _sort_frame(Df, 'total', ascending=False)[0:cutoff].copy()
    df['max'] = df[[0, 1]].max(axis=1)
    df = _sort_frame(df, 'max', ascending=True)  # ascending order (for plot)
    df['ratio'] = (df[0] - df[1]) / df['total']  # relative difference
    df['rel. perc.'] = df['ratio'] * 100
    df['rank'] = df['rel. perc.'].rank(ascending=False)
    return df
# Keyword table comparing df_S (column 0) with df_U (column 1).
df_q2 = to_plot(df_S,df_U)
# Notebook preview of the last rows of the table.
df_q2.tail()
0 | 1 | total | max | ratio | rel. perc. | rank | |
---|---|---|---|---|---|---|---|
system | 32 | 94 | 126 | 94 | -0.492063 | -49.206349 | 17 |
stand | 24 | 95 | 119 | 95 | -0.596639 | -59.663866 | 25 |
ipad | 60 | 173 | 233 | 173 | -0.484979 | -48.497854 | 16 |
case | 35 | 185 | 220 | 185 | -0.681818 | -68.181818 | 28 |
iphone | 95 | 298 | 393 | 298 | -0.516539 | -51.653944 | 20 |
5 rows × 7 columns
height = 800
opacity = 0.5

# Horizontal grouped bars, one pair per keyword.  Column 0 of df_q2 holds the
# counts from df_S and column 1 those from df_U; they are selected by label
# rather than through .ix, which mixed label and positional lookup and has
# been removed from current pandas.
trace1 = Bar(
    x=df_q2[0].values,
    y=df_q2.index.values,
    orientation='h',
    name='Succesful projects',
    marker=Marker(color=col_S),
    opacity=opacity,
)
trace2 = Bar(
    x=df_q2[1].values,
    y=df_q2.index.values,
    orientation='h',
    name='Unsuccesful projects',
    marker=Marker(color=col_U),
    opacity=opacity,
)
data = Data([trace1, trace2])

layout = Layout(
    title='Are there some specific keywords in the title that make a difference?',
    barmode='group',
    xaxis=XAxis(
        grid,
        title='Number of occurences in S/U projects titles',
    ),
    yaxis=YAxis(),
    legend=Legend(x=1, y=0, bgcolor="rgba(0,0,0,0)"),
    autosize=False,
    width=width,
    height=height,
    plot_bgcolor=plot_bgcolor,
)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='cbombardier-2', height=height)
import copy

# Discard a figb left over from an earlier run so that a failed copy below
# cannot leave a stale figure behind; a missing name is fine.
try:
    del figb
except NameError:
    pass

# Independent copy of the current figure, extended in the following cell.
figb = copy.deepcopy(fig)
def make_text(df, cutoff=cutoff):
    """Return the hover text for one row of the keyword table.

    df: a single row (it is applied with DataFrame.apply(axis=1)); must
        provide 'total' and 'rank'.
    cutoff: value shown as the size of the ranking.
    """
    # NOTE(review): 'of out' reads like a transposed 'out of'; the runtime
    # string is left byte-identical here.
    template = ('<br><b>Total # of occurences:</b> %s'
                '<br><b>Rank:</b> %s of out %s')
    total = int(df['total'])
    rank = int(df['rank'])
    return template % (total, rank, cutoff)
def make_color(X, cutoff=cutoff, N=6):
    """Map rank values to colors of the colorbrewer PuBu palette.

    X: sequence or array of ranks.
    cutoff: rank that maps to the last color of the scale.
    N: number of colors in the scale.

    Returns a list with one 'rgb(r,g,b)' string per element of X.
    """
    # Build an (N+2)-color scale, reverse it and drop the entries that were
    # at positions 0 and 1, which leaves N colors.
    scl = convert_cb_to_scl(cb.PuBu, N + 2)[:1:-1]
    # Force float arithmetic: integer ranks would be floor-divided by an
    # integer cutoff under Python 2, collapsing almost every index to 0.
    fractions = np.asarray(X, dtype=float) / cutoff
    indices = np.floor(fractions * (N - 1)).astype(int)
    # Clamp so that a rank outside 0..cutoff cannot index past either end.
    indices = np.clip(indices, 0, len(scl) - 1)
    return [scl[index][1] for index in indices]
# Add a third bar trace with the relative S/U difference of every keyword,
# attached to a second x-axis ('x2') and colored by its rank.
figb['data'] += [Bar(
    x=df_q2['rel. perc.'].values, y=df_q2.index.values, orientation='h',
    name='Relative difference',
    text=df_q2.apply(make_text, axis=1).tolist(),
    marker=Marker(color=make_color(df_q2['rank'].values)),
    opacity=opacity, xaxis='x2', showlegend=False)]

# First x-axis gets the domain [0, 0.47] ...
figb['layout']['xaxis'].update(domain=[0, 0.47],
                               title='Number of S/U projects')

# ... and the new second x-axis the domain [0.53, 1], ticked every 20 units.
figb['layout'].update(
    xaxis2=XAxis(grid, domain=[0.53, 1], title='Relative S/U difference [%]',
                 autotick=False, dtick=20))

figb['layout']['legend'].update(x=0.45, y=0, xanchor='right')

py.iplot(figb, filename='cbombardier-2b', height=height)
# Rebind S and U to the 'Delay' column of each data set (NumPy arrays);
# the histogram titles below describe it as the delay in days between
# project creation and launch.
S = df_S['Delay'].values
U = df_U['Delay'].values
def stats_text(X):
    """Return one identical hover label per element of X.

    Re-defines stats_text with the same contract as the earlier definition
    in this file: the label reports the mean and the standard deviation of
    X (NumPy's default ``np.std``), each formatted with two decimals in a
    field of width five.

    X: sized sequence or array of numbers.

    Returns a list of ``len(X)`` equal strings; an empty list for empty X.
    """
    n_points = len(X)
    if n_points == 0:
        # Nothing to label; also avoids NumPy's empty-input RuntimeWarning.
        return []
    # The label does not depend on the element, so format it only once.
    label = "<b>Mean</b>: {:5.2f}<br><b>Stand. dev.:</b> {:5.2f}".format(
        np.mean(X), np.std(X))
    return [label] * n_points
histnorm = 'percent'
opacity = 0.5
height = 500

# Explicit binning: bins of size 7 from 0 to 365 (the delay is in days
# according to the axis title, i.e. one-week bins over one year).
bins = dict(
    start=0,
    end=365,
    size=7,
)

# The traces reuse the shared palette (col_S / col_U) instead of repeating
# the hex values, so every figure follows a change made in one place.
trace1 = Histogram(
    x=S,
    name='Succesful projects',
    histnorm=histnorm,
    marker=Marker(color=col_S),
    opacity=opacity,
    text=stats_text(S),
    autobinx=False,
    xbins=XBins(bins),
)
trace2 = Histogram(
    x=U,
    name='Unsuccesful projects',
    histnorm=histnorm,
    marker=Marker(color=col_U),
    opacity=opacity,
    text=stats_text(U),
    autobinx=False,
    xbins=XBins(bins),
)
data = Data([trace1, trace2])

layout = Layout(
    title=('Does the delay in days between the time it was created <br>'
           'and launched make a difference?'),
    barmode='overlay',
    xaxis=XAxis(
        grid,
        title='Delay in days between creation and lauch times',
    ),
    yaxis=YAxis(
        grid,
        title='Percentage of total occurrences',
        range=[0, 15.5],
    ),
    legend=Legend(x=1, y=1, bgcolor="rgba(0,0,0,0)"),
    autosize=False,
    width=width,
    height=height,
    plot_bgcolor=plot_bgcolor,  # shared background constant, same value
)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='cbombardier-5')
import copy

# Discard a figb left over from an earlier run; a missing name is fine.
try:
    del figb
except NameError:
    pass
figb = copy.deepcopy(fig)

# Second copy of the histogram traces, re-targeted at the x2/y2 axes with
# bins of size 1 from 0 to 100 and hidden from the legend.
# NOTE(review): relies on the legacy plotly Data.update applying the same
# dict to every trace in the list -- confirm against the plotly version used.
tmp = copy.deepcopy(fig['data'])
tmp.update({
    'xaxis': 'x2',
    'yaxis': 'y2',
    'xbins': XBins(start=0, end=100, size=1),
    'showlegend': False,
})
figb['data'] += tmp

# Declare the extra axis pair, anchored to each other.
figb['layout'].update(
    xaxis2=XAxis(grid, domain=[0.52, 1], range=[0, 99], anchor='y2',
                 autotick=False, dtick=20),
    yaxis2=YAxis(grid, domain=[0.26, 0.82], anchor='x2'),
)

py.iplot(figb, filename='cbombardier-5b')
def location_count(df):
    """Return the number of rows of df for each distinct 'Location' value.

    df: DataFrame with a 'Location' column.

    Returns a pandas Series indexed by location whose values are row counts.
    """
    # GroupBy.size() is the built-in per-group row count; it replaces
    # calling a Python lambda once per group.
    return df.groupby("Location").size()
cutoff = 30


def _sort_frame(frame, column, ascending):
    """Sort frame by column with whichever sort method this pandas offers."""
    # DataFrame.sort was superseded by sort_values and later removed; older
    # releases only provide .sort.
    sorter = getattr(frame, 'sort_values', None)
    if sorter is None:
        sorter = frame.sort
    return sorter(column, ascending=ascending)


def to_plot(df0, df1, cutoff=cutoff):
    """Build the per-location comparison table for two sets of projects.

    df0, df1: DataFrames accepted by location_count; their per-location
        row counts end up in columns 0 and 1 respectively.
    cutoff: number of locations (by combined count) to keep.

    Returns a DataFrame indexed by location, sorted by 'max' in ascending
    order, with the columns:
        0, 1         project counts in df0 / df1 (0 where absent)
        'total'      column 0 + column 1
        'max'        the larger of column 0 and column 1
        'ratio'      (column 0 - column 1) / 'total'
        'rel. perc.' 'ratio' * 100
        'rank'       rank of 'rel. perc.', highest value ranked 1
    """
    Df = pd.concat([location_count(df0), location_count(df1)], axis=1)
    Df = Df.fillna(0)  # a location missing from one set counts as 0 there
    # Columns are selected by label: .ix silently switched between label and
    # positional lookup here and no longer exists in current pandas.
    Df['total'] = Df[[0, 1]].sum(axis=1)
    # Keep the `cutoff` largest locations; copy so that the columns added
    # below are not written into a slice of Df.
    df = _sort_frame(Df, 'total', ascending=False)[0:cutoff].copy()
    df['max'] = df[[0, 1]].max(axis=1)
    df = _sort_frame(df, 'max', ascending=True)  # ascending order (for plot)
    df['ratio'] = (df[0] - df[1]) / df['total']  # relative difference
    df['rel. perc.'] = df['ratio'] * 100
    df['rank'] = df['rel. perc.'].rank(ascending=False)
    return df
# Location table comparing df_S (column 0) with df_U (column 1).
df_q13 = to_plot(df_S,df_U)
# Notebook preview of the last rows of the table.
df_q13.tail()
0 | 1 | total | max | ratio | rel. perc. | rank | |
---|---|---|---|---|---|---|---|
Chicago, IL | 52 | 80 | 132 | 80 | -0.212121 | -21.212121 | 4 |
London, UK | 41 | 101 | 142 | 101 | -0.422535 | -42.253521 | 13 |
San Francisco, CA | 110 | 110 | 220 | 110 | 0.000000 | 0.000000 | 2 |
New York, NY | 51 | 136 | 187 | 136 | -0.454545 | -45.454545 | 14 |
Los Angeles, CA | 55 | 175 | 230 | 175 | -0.521739 | -52.173913 | 17 |
5 rows × 7 columns
height = 800
opacity = 0.5

# Horizontal grouped bars, one pair per location.  Column 0 of df_q13 holds
# the counts from df_S and column 1 those from df_U; they are selected by
# label rather than through .ix (removed from current pandas), and the
# traces reuse the shared palette instead of repeating the hex values.
trace1 = Bar(
    x=df_q13[0].values,
    y=df_q13.index.values,
    orientation='h',
    name='Succesful projects',
    marker=Marker(color=col_S),
    opacity=opacity,
)
trace2 = Bar(
    x=df_q13[1].values,
    y=df_q13.index.values,
    orientation='h',
    name='Unsuccesful projects',
    marker=Marker(color=col_U),
    opacity=opacity,
)
data = Data([trace1, trace2])

layout = Layout(
    title='Does the project location make a difference?',
    barmode='group',
    xaxis=XAxis(
        grid,
        title='Number of S/U project in each city',
    ),
    yaxis=YAxis(),
    legend=Legend(x=1, y=0, bgcolor="rgba(0,0,0,0)"),
    autosize=False,
    width=width,
    height=height,
    plot_bgcolor=plot_bgcolor,
    # Wider left margin, presumably to fit the location labels.
    margin=Margin(l=135),
)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='cbombardier-13', height=height)
import copy

# Discard a figb left over from an earlier run so that a failed copy below
# cannot leave a stale figure behind; a missing name is fine.
try:
    del figb
except NameError:
    pass

# Independent copy of the current figure, extended in the following cell.
figb = copy.deepcopy(fig)
def make_text(df, cutoff=cutoff):
    """Return the hover text for one row of the location table.

    df: a single row (it is applied with DataFrame.apply(axis=1)); must
        provide 'total' and 'rank'.
    cutoff: value shown as the size of the ranking.
    """
    # NOTE(review): 'of out' reads like a transposed 'out of'; the runtime
    # string is left byte-identical here.
    template = ('<br><b>Total # of projects:</b> %s'
                '<br><b>Rank:</b> %s of out %s')
    total = int(df['total'])
    rank = int(df['rank'])
    return template % (total, rank, cutoff)
def make_color(X, cutoff=cutoff, N=6):
    """Map rank values to colors of the colorbrewer PuBu palette.

    X: sequence or array of ranks.
    cutoff: rank that maps to the last color of the scale.
    N: number of colors in the scale.

    Returns a list with one 'rgb(r,g,b)' string per element of X.
    """
    # Build an (N+2)-color scale, reverse it and drop the entries that were
    # at positions 0 and 1, which leaves N colors.
    scl = convert_cb_to_scl(cb.PuBu, N + 2)[:1:-1]
    # Force float arithmetic: integer ranks would be floor-divided by an
    # integer cutoff under Python 2, collapsing almost every index to 0.
    fractions = np.asarray(X, dtype=float) / cutoff
    indices = np.floor(fractions * (N - 1)).astype(int)
    # Clamp so that a rank outside 0..cutoff cannot index past either end.
    indices = np.clip(indices, 0, len(scl) - 1)
    return [scl[index][1] for index in indices]
# Add a third bar trace with the relative S/U difference of every location,
# attached to a second x-axis ('x2') and colored by its rank.
figb['data'] += [Bar(
    # Select the column by name (as the keyword figure does) instead of by
    # position 5, which breaks silently if to_plot's column order changes.
    x=df_q13['rel. perc.'].values,
    y=df_q13.index.values,
    orientation='h',
    name='Relative difference',
    text=df_q13.apply(make_text, axis=1).tolist(),
    marker=Marker(
        color=make_color(df_q13['rank'].values)
    ),
    opacity=opacity,
    xaxis='x2',
    showlegend=False,
)]

# First x-axis gets the domain [0, 0.47] ...
figb['layout']['xaxis'].update(
    domain=[0, 0.47],
    title='Number of S/U projects',
)

# ... and the new second x-axis the domain [0.53, 1], ticked every 20 units.
figb['layout'].update(
    xaxis2=XAxis(
        grid,
        domain=[0.53, 1],
        title='Relative S/U difference [%]',
        autotick=False,
        dtick=20,
    ),
)

figb['layout']['legend'].update(
    x=0.45,
    y=0,
    xanchor='right',
)

py.iplot(figb, filename='cbombardier-13b', height=height)
About Plotly
Big thanks to
from IPython.display import display, HTML
import urllib2
from contextlib import closing

# Download the user guide's stylesheet snippet and render it in the notebook.
url = 'https://raw.githubusercontent.com/plotly/python-user-guide/master/custom.css'
# NOTE(review): the downloaded text is rendered as raw HTML, so this cell
# trusts whatever the remote file contains.
# closing() guarantees the HTTP response is released even if read() fails;
# the urllib2 response object is not a context manager by itself.
with closing(urllib2.urlopen(url)) as response:
    custom_css = response.read()
display(HTML(custom_css))