dem·o·graph·ics
statistical data relating to the population and particular groups within it.
E.g., age, ethnicity, gender, income, ...
Marketing
Social Media as Surveys
Health
**User profiles vary from site to site.**
# Guessing gender
# Collect 1000 tweets matching query "i"
import configparser
import sys
from TwitterAPI import TwitterAPI
def get_twitter(config_file):
    """ Build a TwitterAPI client from credentials stored in a config file.
    Args:
      config_file ... A config file in ConfigParser format whose [twitter]
                      section holds consumer_key, consumer_secret,
                      access_token and access_token_secret.
    Returns:
      An instance of TwitterAPI.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    # The four credentials are read in the positional order in which
    # they are handed to the TwitterAPI constructor.
    credential_keys = ('consumer_key', 'consumer_secret',
                       'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', key) for key in credential_keys]
    return TwitterAPI(*credentials)
# Read from the streaming endpoint until n_tweets messages have been
# collected, reporting progress after every 100 messages.
twitter = get_twitter('twitter.cfg')
n_tweets = 1000
tweets = []
for r in twitter.request('statuses/filter', {'track': 'i'}):
    tweets.append(r)
    if not len(tweets) % 100:
        print('%d tweets' % len(tweets))
    if len(tweets) >= n_tweets:
        break
print('fetched %d tweets' % len(tweets))
100 tweets 200 tweets 300 tweets 400 tweets 500 tweets 600 tweets 700 tweets 800 tweets 900 tweets 1000 tweets fetched 1000 tweets
# not all tweets are returned
# https://dev.twitter.com/streaming/overview/messages-types#limit_notices
[t for t in tweets if 'user' not in t][:6]
[{'limit': {'track': 383, 'timestamp_ms': '1551714817758'}}, {'limit': {'track': 790, 'timestamp_ms': '1551714818769'}}, {'limit': {'track': 1150, 'timestamp_ms': '1551714819737'}}, {'limit': {'track': 1556, 'timestamp_ms': '1551714820748'}}, {'limit': {'track': 1998, 'timestamp_ms': '1551714821738'}}, {'limit': {'track': 2384, 'timestamp_ms': '1551714822750'}}]
# Restrict to actual tweets: stream messages without a 'user' field
# (e.g. limit notices or "deleted"-tweet notices) are dropped.
tweets = list(filter(lambda msg: 'user' in msg, tweets))
print('fetched %d tweets' % len(tweets))
fetched 981 tweets
# Print last 10 names.
# Display name of the author of every collected tweet.
names = [tweet['user']['name'] for tweet in tweets]
names[-10:]
['Funky Kong', 'April and 228 other', '☕️', 'nikoll (ia)', 'Iniciativa PV', 'josie🕷', 'ؘ', 'yaancc 🌞', 'livyyy :)', 'by gaspadin_ ll 🇹🇷🇹🇷']
# Fetch census name data from:
# http://www2.census.gov/topics/genealogy/1990surnames/
import requests
from pprint import pprint
males_url = 'http://www2.census.gov/topics/genealogy/' + \
            '1990surnames/dist.male.first'
females_url = 'http://www2.census.gov/topics/genealogy/' + \
              '1990surnames/dist.female.first'


def _fetch_lines(url):
    """ Download url and return the response text split on newlines.
    Raises requests.HTTPError on a 4xx/5xx response so that an error
    page is never parsed as name data.
    """
    # Without a timeout a stalled connection would block forever;
    # the 30 second value is an arbitrary but generous bound.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text.split('\n')


males = _fetch_lines(males_url)
females = _fetch_lines(females_url)
print('males:')
pprint(males[:10])
print('females:')
pprint(females[:10])
males: ['JAMES 3.318 3.318 1', 'JOHN 3.271 6.589 2', 'ROBERT 3.143 9.732 3', 'MICHAEL 2.629 12.361 4', 'WILLIAM 2.451 14.812 5', 'DAVID 2.363 17.176 6', 'RICHARD 1.703 18.878 7', 'CHARLES 1.523 20.401 8', 'JOSEPH 1.404 21.805 9', 'THOMAS 1.380 23.185 10'] females: ['MARY 2.629 2.629 1', 'PATRICIA 1.073 3.702 2', 'LINDA 1.035 4.736 3', 'BARBARA 0.980 5.716 4', 'ELIZABETH 0.937 6.653 5', 'JENNIFER 0.932 7.586 6', 'MARIA 0.828 8.414 7', 'SUSAN 0.794 9.209 8', 'MARGARET 0.768 9.976 9', 'DOROTHY 0.727 10.703 10']
# Get names.
# The first whitespace-separated field of every non-empty census line
# is the name; store it lowercased.
male_names = {line.split()[0].lower() for line in males if line}
female_names = {line.split()[0].lower() for line in females if line}
print('%d male and %d female names' % (len(male_names), len(female_names)))
male_sample = list(male_names)[:10]
female_sample = list(female_names)[:10]
print('males:\n' + '\n'.join(male_sample))
print('\nfemales:\n' + '\n'.join(female_sample))
1219 male and 4275 female names males: raymundo willis jose thurman galen darrick roy matthew man ned females: min brittaney tonisha bailey shani roma beth stefanie tillie jeanetta
# Initialize gender of all tweets to unknown.
# Every tweet starts out with an 'unknown' gender label.
for t in tweets:
    t.update(gender='unknown')
# label a Twitter user's gender by matching name list.
import re
def gender_by_name(tweets, male_names, female_names):
    """ Label each tweet's author 'male', 'female' or 'unknown' by first name.
    The first whitespace-delimited token of t['user']['name'] is
    lowercased and its first run of word characters is looked up in
    male_names, then in female_names.
    Args:
      tweets ......... A list of tweet dicts; each one is modified in
                       place by setting t['gender'].
      male_names ..... A collection of lowercase male first names.
      female_names ... A collection of lowercase female first names.
    Returns:
      None. A name found in both collections is labeled 'male' because
      male_names is checked first.
    """
    for t in tweets:
        # Start from 'unknown' so that names which cannot be parsed never
        # keep a stale label from an earlier call (or no label at all).
        t['gender'] = 'unknown'
        name = t['user']['name']
        # split() returns [] for empty and whitespace-only names.
        tokens = name.split() if name else []
        if not tokens:
            continue
        # remove punctuation: keep the first run of word characters.
        match = re.search(r'\w+', tokens[0].lower())
        if match is None:
            continue
        first = match.group()
        if first in male_names:
            t['gender'] = 'male'
        elif first in female_names:
            t['gender'] = 'female'
gender_by_name(tweets, male_names, female_names)
# What's wrong with this approach?
from collections import Counter
def print_genders(tweets):
    """ Print a summary of the gender labels assigned to tweets.
    Prints the fraction of tweets labeled 'male' or 'female', the
    Counter of all labels, and the label and user name of the first
    20 tweets.
    Args:
      tweets ... A list of tweet dicts that each have a 'gender' entry
                 and a t['user']['name'] entry.
    Returns:
      None
    """
    counts = Counter(t['gender'] for t in tweets)
    total = sum(counts.values())
    labeled = counts['male'] + counts['female']
    # An empty tweet list has no labeled accounts; report 0.00 rather
    # than dividing by zero.
    fraction = labeled / total if total else 0.0
    print('%.2f of accounts are labeled with gender' % fraction)
    print('gender counts:\n', counts)
    for t in tweets[:20]:
        print(t['gender'], t['user']['name'])
print_genders(tweets)
0.27 of accounts are labeled with gender gender counts: Counter({'unknown': 715, 'female': 143, 'male': 123}) unknown Arzunaz Üreyen female Ann unknown flexlex 🍒✨ male jules! female MARLA RT unknown Freya 🏳️🌈 female ًvictoria unknown Blue PandaNW unknown Pierrot Kwame unknown Öykü Su Unay unknown lots✈️bts wembley unknown 🍒 unknown 😽😸Meow Meow March😸😽 unknown fazli unknown Tyrion_Lannister unknown 💜~Nessa~OwO💜 unknown محمد المولد xboxone 🎮 🖋 unknown Cruiz_Senior male Carmen JMO💖🌟 unknown tyb
# What about ambiguous names?
def print_ambiguous_names(male_names, female_names):
    """ Print how many names occur in both collections, followed by up
    to 20 of them (in the iteration order of male_names).
    """
    shared = [candidate for candidate in male_names
              if candidate in female_names]
    header = 'found %d ambiguous names:\n' % len(shared)
    listing = '\n'.join(shared[:20])
    print(header)
    print(listing)
print_ambiguous_names(male_names, female_names)
found 331 ambiguous names: jose roy matthew man dale gail sung thomas tommie charlie claude chong stephen patrick dorian angelo jay lewis dusty son
# Keep names that are more frequent in one gender than the other.
def get_percents(name_list):
    """ Parse raw census lines into a name -> percent mapping.
    Each line is expected to look like 'JAMES 3.318 3.318 1': the first
    field is the name and the second is the percent of the population
    with that name (e.g., the percent of males named James).
    Args:
      name_list ... A list of raw text lines.
    Returns:
      A dict from lowercased name to percent (float). Lines that are
      empty, whitespace-only, or have fewer than two fields are skipped.
    """
    percents = {}
    for line in name_list:
        # Split once; blank and whitespace-only lines yield no fields.
        fields = line.split() if line else []
        if len(fields) < 2:
            continue
        percents[fields[0].lower()] = float(fields[1])
    return percents
males_pct = get_percents(males)
females_pct = get_percents(females)
# Assign a name as male if it is more common among males than females.
# NOTE: female_names is filtered against the already-filtered male_names,
# so a name with exactly equal percentages in both lists stays female.
male_names = {
    name for name in male_names
    if name not in female_names or males_pct[name] > females_pct[name]
}
female_names = {
    name for name in female_names
    if name not in male_names or females_pct[name] > males_pct[name]
}
print_ambiguous_names(male_names, female_names)
name_counts = (len(male_names), len(female_names))
print('%d male and %d female names' % name_counts)
found 0 ambiguous names: 1146 male and 4017 female names
# Relabel twitter users (compare with above)
gender_by_name(tweets, male_names, female_names)
print_genders(tweets)
0.27 of accounts are labeled with gender gender counts: Counter({'unknown': 715, 'female': 155, 'male': 111}) unknown Arzunaz Üreyen female Ann unknown flexlex 🍒✨ male jules! female MARLA RT unknown Freya 🏳️🌈 female ًvictoria unknown Blue PandaNW unknown Pierrot Kwame unknown Öykü Su Unay unknown lots✈️bts wembley unknown 🍒 unknown 😽😸Meow Meow March😸😽 unknown fazli unknown Tyrion_Lannister unknown 💜~Nessa~OwO💜 unknown محمد المولد xboxone 🎮 🖋 unknown Cruiz_Senior female Carmen JMO💖🌟 unknown tyb
# Who are the unknowns?
# "Filtered" data can have big impact on analysis.
# Count how often each display name occurs among the unlabeled tweets.
unknown_tweets = [t for t in tweets if t['gender'] == 'unknown']
unknown_names = Counter(t['user']['name'] for t in unknown_tweets)
unknown_names.most_common(20)
[('.', 4), ('M', 3), ('Fra🌸', 2), ('🌙', 2), ('💜', 2), ('𝘲𝘶𝘦𝘦𝘯𝘯𝘢𝘪𝘫𝘢2𝘹🥰', 2), ('ً', 2), ('Marius Black. 🦋', 2), ('Krggzddd', 2), ('Mehmet Sarıaslan', 2), ('Urim Ejupi', 2), ('Arzunaz Üreyen', 1), ('flexlex 🍒✨', 1), ('Freya 🏳️\u200d🌈', 1), ('Blue PandaNW', 1), ('Pierrot Kwame', 1), ('Öykü Su Unay', 1), ('lots✈️bts wembley', 1), ('🍒', 1), ('😽😸Meow Meow March😸😽', 1)]
# How do the profiles of male Twitter users differ from
# those of female users?
# Profile descriptions of the authors, grouped by assigned gender label.
male_profiles = [tweet['user']['description']
                 for tweet in tweets if tweet['gender'] == 'male']
female_profiles = [tweet['user']['description']
                   for tweet in tweets if tweet['gender'] == 'female']
# Alternative: compare the tweet text instead of the profile description.
#male_profiles = [t['text'] for t in tweets
#                 if t['gender'] == 'male']
#female_profiles = [t['text'] for t in tweets
#                   if t['gender'] == 'female']
import re
def tokenize(s):
    """ Split a string into lowercase word tokens.
    Every run of non-word characters is treated as a separator.
    Args:
      s ... The string to tokenize; may be None or empty.
    Returns:
      A list of lowercase tokens; [] when s is None or empty.
    """
    if not s:
        return []
    # Raw string: '\W' in a plain string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return re.sub(r'\W+', ' ', s).lower().split()
# Term frequencies over all profile descriptions of each gender.
male_words = Counter()
for p in male_profiles:
    male_words.update(tokenize(p))
female_words = Counter()
for p in female_profiles:
    female_words.update(tokenize(p))
top_male_terms = male_words.most_common(10)
top_female_terms = female_words.most_common(10)
print('Most Common Male Terms:')
pprint(top_male_terms)
print('\nMost Common Female Terms:')
pprint(top_female_terms)
Most Common Male Terms: [('of', 25), ('and', 24), ('i', 23), ('the', 14), ('my', 11), ('in', 10), ('me', 9), ('life', 9), ('you', 8), ('for', 8)] Most Common Female Terms: [('i', 38), ('the', 32), ('and', 28), ('a', 19), ('my', 19), ('you', 17), ('to', 16), ('of', 15), ('is', 14), ('in', 13)]
# Number of distinct terms seen for each gender.
print(len(male_words), len(female_words), sep='\n')
764 921
# Compute difference
# Female count minus male count for every term seen in either gender;
# a Counter reports 0 for a term it has never seen.
vocabulary = set(female_words.keys()) | set(male_words.keys())
diff_counts = {w: female_words[w] - male_words[w] for w in vocabulary}
sorted_diffs = sorted(diff_counts.items(), key=lambda pair: pair[1])
print('Top Male Terms (diff):')
pprint(sorted_diffs[:10])
print('\nTop Female Terms (diff):')
pprint(sorted_diffs[-10:])
Top Male Terms (diff): [('of', -10), ('father', -7), ('http', -6), ('for', -5), ('people', -5), ('university', -4), ('not', -4), ('about', -4), ('com', -4), ('play', -4)] Top Female Terms (diff): [('just', 7), ('she', 7), ('my', 8), ('s', 9), ('you', 9), ('to', 10), ('is', 11), ('a', 12), ('i', 15), ('the', 18)]
**A problem with difference of counts:**
What if we have more male than female words in total?
Instead, consider "the probability that a male user writes the word w"
**Odds Ratio (OR)**
The ratio of the probabilities for a word from each class:
# OR(w) = p(w|female) / p(w|male)
def counts_to_probs(gender_words):
    """ Compute probability of each term according to the frequency
    in a gender.
    Args:
      gender_words ... A mapping from term to its count for one gender.
    Returns:
      A dict from term to count / (sum of all counts). An empty
      mapping yields an empty dict.
    """
    total = sum(gender_words.values())
    return {word: count / total for word, count in gender_words.items()}
# Convert the raw term counts to per-gender probabilities and show the
# ten most probable terms of each gender.
male_probs = counts_to_probs(male_words)
female_probs = counts_to_probs(female_words)
print('p(w|male)')
pprint(sorted(male_probs.items(),
              key=lambda pair: pair[1], reverse=True)[:10])
print('\np(w|female)')
pprint(sorted(female_probs.items(),
              key=lambda pair: pair[1], reverse=True)[:10])
p(w|male) [('of', 0.021872265966754154), ('and', 0.02099737532808399), ('i', 0.020122484689413824), ('the', 0.012248468941382326), ('my', 0.009623797025371828), ('in', 0.008748906386701663), ('me', 0.007874015748031496), ('life', 0.007874015748031496), ('you', 0.00699912510936133), ('for', 0.00699912510936133)] p(w|female) [('i', 0.02615278733654508), ('the', 0.02202339986235375), ('and', 0.019270474879559532), ('a', 0.01307639366827254), ('my', 0.01307639366827254), ('you', 0.01169993117687543), ('to', 0.011011699931176875), ('of', 0.01032346868547832), ('is', 0.009635237439779766), ('in', 0.008947006194081212)]
def odds_ratios(male_probs, female_probs):
    """ Compute OR(w) = p(w|female) / p(w|male) for every known word.
    Args:
      male_probs ..... A dict from word to p(w|male).
      female_probs ... A dict from word to p(w|female).
    Returns:
      A dict from each word in either input to its odds ratio. A word
      missing from one dict is treated as having probability 0 there,
      so it gets 0.0 (female-only missing) or inf (male-only missing);
      0/0 gives nan. Smooth the counts first to avoid these extremes.
    """
    ratios = {}
    for w in set(male_probs) | set(female_probs):
        # .get avoids a KeyError for words seen in only one gender.
        p_male = male_probs.get(w, 0.0)
        p_female = female_probs.get(w, 0.0)
        if p_male:
            ratios[w] = p_female / p_male
        elif p_female:
            ratios[w] = float('inf')
        else:
            ratios[w] = float('nan')
    return ratios
ors = odds_ratios(male_probs, female_probs)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-32-04756cf63b36> in <module>() 4 set(male_probs) | set(female_probs)]) 5 ----> 6 ors = odds_ratios(male_probs, female_probs) <ipython-input-32-04756cf63b36> in odds_ratios(male_probs, female_probs) 2 return dict([(w, female_probs[w] / male_probs[w]) 3 for w in ----> 4 set(male_probs) | set(female_probs)]) 5 6 ors = odds_ratios(male_probs, female_probs) <ipython-input-32-04756cf63b36> in <listcomp>(.0) 1 def odds_ratios(male_probs, female_probs): 2 return dict([(w, female_probs[w] / male_probs[w]) ----> 3 for w in 4 set(male_probs) | set(female_probs)]) 5 KeyError: 'state'
# Look at the words that occur only in female profiles and pick one of
# them as a running example.
print(len(male_probs), len(female_probs), sep='\n')
female_but_not_male = set(female_probs) - set(male_probs)
n_female_only = len(female_but_not_male)
print('%d words in female_probs but not in male_probs' % n_female_only)
fem_word = list(female_but_not_male)[-10]
print(fem_word)
print(female_probs[fem_word])
#'selfcare' in male_probs
764 921 744 words in female_probs but not in male_probs arizona 0.0006882312456985547
**How to deal with 0-probabilities?**
$$p(w|male) = \frac{freq(w, male)}{\sum_i freq(w_i, male)}$$
$$freq(w, male) = 0$$
Do we really believe there is 0 probability of a male using this term?
(Recall over-fitting discussion.)
**Additive Smoothing**
Reserve small amount of counts (e.g., 1) for unseen observations.
E.g., assume we've seen each word at least once in each class.
$$p(w|male) = \frac{1 + freq(w, male)}{|W| + \sum_i freq(w_i, male)}$$
$|W|$: number of unique words.
# Additive smoothing. Add count of 1 for all words.
# NOTE: the Counters are updated in place, so running this again adds
# another pseudo-count to every word.
all_words = set(male_words) | set(female_words)
male_words.update(all_words)
female_words.update(all_words)
male_probs = counts_to_probs(male_words)
female_probs = counts_to_probs(female_words)
top_male_probs = sorted(male_probs.items(),
                        key=lambda pair: pair[1], reverse=True)[:10]
print(*top_male_probs, sep='\n')
('of', 0.00980761976612599) ('and', 0.00943040362127499) ('i', 0.009053187476423991) ('the', 0.005658242172764994) ('my', 0.004526593738211996) ('in', 0.004149377593360996) ('me', 0.003772161448509996) ('life', 0.003772161448509996) ('you', 0.0033949453036589967) ('for', 0.0033949453036589967)
# Even though word doesn't appear, has non-zero probability.
print(male_probs[fem_word])
0.0003772161448509996
# Rank all words by odds ratio: the largest ratios are the most
# female-associated terms, the smallest the most male-associated.
ors = odds_ratios(male_probs, female_probs)
sorted_ors = sorted(ors.items(), key=lambda pair: pair[1], reverse=True)
print('Top Female Terms (OR):')
pprint(sorted_ors[:20])
print('\nTop Male Terms (OR):')
pprint(sorted_ors[-20:])
Top Female Terms (OR): [('she', 7.162445119891929), ('just', 7.162445119891929), ('mom', 6.267139479905437), ('her', 5.3718338399189465), ('so', 4.476528199932455), ('fishing', 4.476528199932455), ('can', 4.476528199932455), ('women', 4.476528199932455), ('great', 4.476528199932455), ('प', 4.476528199932455), ('we', 4.476528199932455), ('21', 4.476528199932455), ('am', 4.02887537993921), ('that', 3.5812225599459646), ('ever', 3.5812225599459646), ('got', 3.5812225599459646), ('eu', 3.5812225599459646), ('public', 3.5812225599459646), ('त', 3.5812225599459646), ('sc', 3.5812225599459646)] Top Male Terms (OR): [('ly', 0.2984352133288304), ('want', 0.2984352133288304), ('state', 0.2238264099966228), ('reflect', 0.2238264099966228), ('𝗍𝗁𝖾', 0.2238264099966228), ('education', 0.2238264099966228), ('all', 0.2238264099966228), ('never', 0.2238264099966228), ('retired', 0.2238264099966228), ('owner', 0.2238264099966228), ('youtube', 0.2238264099966228), ('school', 0.2238264099966228), ('two', 0.2238264099966228), ('proud', 0.2238264099966228), ('some', 0.2238264099966228), ('university', 0.17906112799729823), ('about', 0.17906112799729823), ('play', 0.17906112799729823), ('people', 0.1492176066644152), ('father', 0.1119132049983114)]