The data used in this analysis were fetched from Twitter using twick, searching for "#talkpay -filter:retweets" (to exclude retweets). The data run through approximately noon Pacific Time on Saturday, May 2. A raw CSV of the tweets can be found here. See the corresponding BuzzFeed News article here.
A few important caveats about the data and analysis:
What people tweet is not necessarily what they earn. In this analysis, we don't distinguish between what people say they make and what they say, for example, their bosses make.
Not everything that looks like a salary is one. In the analysis below, we're counting things that look like a salary, e.g., "50k", "$20k", "$350,000", "$15,500". The analysis throws out "401k"s, on the assumption that they represent retirement accounts rather than salary figures. But there may be other "false positives" — and plenty of "false negatives," e.g., "I make twenty grand" — that we're missing.
The analysis of the distribution of salaries uses the maximum salary-looking figure from each tweet, based on the observation that many tweets narrated a progression of salaries over a series of jobs/positions.
The distribution analysis also ignores salaries reported in € and £.
%matplotlib inline
import pandas as pd
import re
# Salary-looking tokens. Either a currency symbol, one to three digits/dots,
# a comma or K/k, and up to three more digits (e.g. "$20k", "$350,000"),
# or a bare one-to-three digit number followed by a lowercase "k" (e.g. "50k").
salary_pat = re.compile(r"[\$£€][\d\.]{1,3}[,Kk]\d{0,3}|\d{1,3}k")
# Any single currency symbol: $, £ or €.
currency_pat = re.compile(r"[\$£€]")
# "http" followed by every character up to the next space.
# NOTE(review): only a literal space ends the match, so a link followed by a
# newline or tab also swallows the text after it -- confirm this is intended.
link_pat = re.compile(r"http[^ ]+")
def is_likely_salary(text):
    """Return True if a salary-looking match should be kept as a salary.

    A match is rejected when it contains "401k" (case-insensitive), when it
    contains no digits at all, or when its digits, read together as one
    integer, equal zero.
    """
    if "401k" in text.lower():
        return False
    digits = re.sub(r"[^\d]", "", text)
    # A match made only of dots (e.g. "$..k") has no digits; int("") would
    # raise ValueError, so treat it as not a salary instead of crashing.
    if not digits:
        return False
    return int(digits) != 0
def is_likely_usd(text):
    """Return False if text starts with "£" or "€", and True otherwise.

    Figures with no currency symbol are therefore treated as USD. An empty
    string also returns True rather than raising IndexError.
    """
    return not text.startswith(("£", "€"))
def extract_salary_text(text):
    """Return the salary-looking strings found in text, or None if none remain.

    Links are stripped first so that digits inside URLs are not picked up,
    then every salary_pat match is screened with is_likely_salary.
    """
    link_free = link_pat.sub("", text)
    kept = [
        candidate
        for candidate in salary_pat.findall(link_free)
        if is_likely_salary(candidate)
    ]
    # An empty list is reported as None.
    return kept or None
def intify_salary(string):
    """Convert a salary-looking string such as "$50k" or "$350,000" to an int.

    Currency symbols are removed and the text is lower-cased. If a "k" is
    present, the number before the first "k" is multiplied by 1,000;
    otherwise commas are removed and the remaining number is truncated to
    an int.

    Raises ValueError if the numeric part is not a valid number.
    """
    x = re.sub(currency_pat, "", string.lower())
    if "k" in x:
        # salary_pat allows up to three digits after the "k" (e.g. "$50k123"),
        # so read the number before the first "k" rather than assuming the
        # "k" is the final character.
        thousands = x.partition("k")[0]
        return int(float(thousands) * 1000)
    # Comma-form matches can carry a decimal part (e.g. "$9.5,"), which int()
    # cannot parse directly; go through float and truncate.
    return int(float(x.replace(",", "")))
# Load the collected tweets; the "created_at" column is parsed as dates.
tweets = pd.read_csv("../data/talkpay.csv", parse_dates=["created_at"])
# Report how many rows were loaded, with thousands separators.
print("Tweets collected: {0:,}".format(len(tweets)))
Tweets collected: 9,604
# Every salary-looking string in each tweet (None when a tweet has none).
tweets["all_salaries"] = tweets["text"].apply(extract_salary_text)

# Drop the figures that are marked with a pound or euro sign.
tweets["usd_salaries"] = tweets["all_salaries"].apply(
    lambda found: [s for s in found if is_likely_usd(s)] if found else None
)

# Integer value of each remaining figure (None when nothing is left).
tweets["usd_salary_ints"] = tweets["usd_salaries"].apply(
    lambda kept: [intify_salary(s) for s in kept] if kept else None
)

# Keep only the tweets in which at least one salary-looking string was found.
salary_tweets = tweets[
    tweets["all_salaries"].apply(lambda found: isinstance(found, list))
].copy()

print(
    """
Detected {0:,d} tweets containing something salary-looking, published by {1:,d} Twitter users.
""".strip().format(
        len(salary_tweets),
        salary_tweets["screen_name"].nunique(),
    )
)
Detected 1,951 tweets containing something salary-looking, published by 1,300 Twitter users.
# Largest USD figure in each tweet; tweets with no USD figure (None) are
# dropped first so max() never sees a missing value.
usd_salaries_max = salary_tweets["usd_salary_ints"].dropna().apply(max)
def make_chart(binsize, limit, ytick):
    """Plot a histogram of the per-tweet maximum USD salaries.

    Reads the module-level ``usd_salaries_max`` series. Values above
    ``limit`` are clamped to ``limit`` so they all fall into one final bin,
    whose x-axis label gets a trailing "+".

    binsize -- width of each histogram bin, in dollars
    limit   -- cap applied to every value before binning, in dollars
    ytick   -- spacing of the y-axis ticks, as a share of values per bin
               (0.05 puts a tick every 5%)

    Returns the axes object produced by ``Series.hist``.
    """
    # pandas removed its ``pd.np`` alias, so import numpy directly.
    import numpy as np

    # One extra bin past ``limit`` holds the clamped values.
    bins = range(0, limit + binsize * 2, binsize)
    thresholded = usd_salaries_max.apply(lambda x: min(limit, x))

    # ``density=True`` is the replacement for the ``normed=True`` keyword,
    # which matplotlib removed from ``hist``.
    ax = thresholded.hist(
        bins=bins,
        density=True,
        figsize=(14, 8),
        color="#60ba2b",
        edgecolor="#ffffff",
    )
    ax.set_xlim(0, limit + binsize)
    ax.xaxis.grid(False)
    ax.set_axisbelow(True)

    # Bar heights are densities; multiplying by the bin width turns them into
    # the share of values in that bin, so ticks are placed every
    # ``ytick / binsize`` and labelled as percentages.
    max_prop = max(ax.get_yticks())
    ax.set_yticks(np.arange(0, max_prop, ytick / binsize))
    ax.set_yticklabels(
        ["{0:.0f}%".format(y * binsize * 100) for y in ax.get_yticks()],
        fontsize=14,
    )
    ax.set_xticklabels(
        [
            "${0:,.0f}{1}".format(x, "+" if x == limit else "")
            for x in ax.get_xticks()
        ],
        fontsize=14,
    )

    ax.xaxis.set_tick_params(width=1, direction="out")
    ax.yaxis.set_tick_params(width=1, direction="out")
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")

    ax.set_title("#talkpay | Estimated Salary Distribution", fontsize=28, y=1.05)
    ax.figure.text(
        0.9, 0.03, 'Jeremy Singer-Vine / BuzzFeed News',
        ha='right', color="#666666", fontsize=12,
    )
    ax.figure.text(
        0.12, 0.03,
        'Note: This is a *very* rough estimate. Substantial caveats apply.',
        ha='left', color="#666666", fontsize=12,
    )
    return ax
# Draw the chart with $25,000-wide bins, values capped at $300,000, and a
# y-axis tick every 5%.
ax = make_chart(25 * 1000, 300 * 1000, 0.05)