Notebook

CS579: Lecture 12¶

** Demographic Inference I**

Dr. Aron Culotta
Illinois Institute of Technology

dem·o·graph·ics

statistical data relating to the population and particular groups within it.

E.g., age, ethnicity, gender, income, ...

Why Demographics?¶

Marketing
- Who are my customers?
- Who are my competitors' customers?
- E.g., DemographicsPro
Social Media as Surveys
- E.g., 45% of tweets express positive sentiment toward Pres. Obama
- Who wrote those tweets?
Health
- 2% of Facebook users are expressing flu-like symptoms
- Are they representative of the full population?

** User profiles vary from site to site. **

rahm

rahm-fb

rahm-li

Approaches¶

Clever use of external data
- E.g., U.S. Census name lists for gender
Look for keywords in profile
- "African American Male"
- "Happy 21st birthday to me"
Machine Learning

In [2]:

# Guessing gender
# Collect 1000 tweets matching query "i"
import configparser
import sys
from TwitterAPI import TwitterAPI

def get_twitter(config_file):
    """ Read the config_file and construct an instance of TwitterAPI.
    Args:
      config_file ... A config file in ConfigParser format with Twitter credentials
    Returns:
      An instance of TwitterAPI.
    """
    config = configparser.ConfigParser()
    config.read(config_file)
    twitter = TwitterAPI(
                   config.get('twitter', 'consumer_key'),
                   config.get('twitter', 'consumer_secret'),
                   config.get('twitter', 'access_token'),
                   config.get('twitter', 'access_token_secret'))
    return twitter

twitter = get_twitter('twitter.cfg')
tweets = []
n_tweets=1000
for r in twitter.request('statuses/filter', {'track': 'i'}):
    tweets.append(r)
    if len(tweets) % 100 == 0:
        print('%d tweets' % len(tweets))
    if len(tweets) >= n_tweets:
        break
print('fetched %d tweets' % len(tweets))

100 tweets
200 tweets
300 tweets
400 tweets
500 tweets
600 tweets
700 tweets
800 tweets
900 tweets
1000 tweets
fetched 1000 tweets

In [3]:

# not all tweets are returned
# https://dev.twitter.com/streaming/overview/messages-types#limit_notices
[t for t in tweets if 'user' not in t][:6]

Out[3]:

[{'limit': {'track': 383, 'timestamp_ms': '1551714817758'}},
 {'limit': {'track': 790, 'timestamp_ms': '1551714818769'}},
 {'limit': {'track': 1150, 'timestamp_ms': '1551714819737'}},
 {'limit': {'track': 1556, 'timestamp_ms': '1551714820748'}},
 {'limit': {'track': 1998, 'timestamp_ms': '1551714821738'}},
 {'limit': {'track': 2384, 'timestamp_ms': '1551714822750'}}]

In [4]:

# restrict to actual tweets
# (remove "deleted" tweets)
tweets = [t for t in tweets if 'user' in t]
print('fetched %d tweets' % len(tweets))

fetched 981 tweets

In [5]:

# Print last 10 names.
names = [t['user']['name'] for t in tweets]
names[-10:]

Out[5]:

['Funky Kong',
 'April and 228 other',
 '☕️',
 'nikoll (ia)',
 'Iniciativa PV',
 'josie🕷',
 'ؘ',
 'yaancc 🌞',
 'livyyy :)',
 'by gaspadin_ ll 🇹🇷🇹🇷']

In [6]:

# Fetch census name data from:
# http://www2.census.gov/topics/genealogy/1990surnames/
import requests
from pprint import pprint
males_url = 'http://www2.census.gov/topics/genealogy/' + \
            '1990surnames/dist.male.first'
females_url = 'http://www2.census.gov/topics/genealogy/' + \
              '1990surnames/dist.female.first'
males = requests.get(males_url).text.split('\n')
females = requests.get(females_url).text.split('\n')
print('males:')
pprint(males[:10])
print('females:')
pprint(females[:10])

males:
['JAMES          3.318  3.318      1',
 'JOHN           3.271  6.589      2',
 'ROBERT         3.143  9.732      3',
 'MICHAEL        2.629 12.361      4',
 'WILLIAM        2.451 14.812      5',
 'DAVID          2.363 17.176      6',
 'RICHARD        1.703 18.878      7',
 'CHARLES        1.523 20.401      8',
 'JOSEPH         1.404 21.805      9',
 'THOMAS         1.380 23.185     10']
females:
['MARY           2.629  2.629      1',
 'PATRICIA       1.073  3.702      2',
 'LINDA          1.035  4.736      3',
 'BARBARA        0.980  5.716      4',
 'ELIZABETH      0.937  6.653      5',
 'JENNIFER       0.932  7.586      6',
 'MARIA          0.828  8.414      7',
 'SUSAN          0.794  9.209      8',
 'MARGARET       0.768  9.976      9',
 'DOROTHY        0.727 10.703     10']

In [7]:

# Get names. 
male_names = set([m.split()[0].lower() for m in males if m])
female_names = set([f.split()[0].lower() for f in females if f])
print('%d male and %d female names' % (len(male_names), len(female_names)))
print('males:\n' + '\n'.join(list(male_names)[:10]))
print('\nfemales:\n' + '\n'.join(list(female_names)[:10]))

1219 male and 4275 female names
males:
raymundo
willis
jose
thurman
galen
darrick
roy
matthew
man
ned

females:
min
brittaney
tonisha
bailey
shani
roma
beth
stefanie
tillie
jeanetta

In [8]:

# Initialize gender of all tweets to unknown.
for t in tweets:
    t['gender'] = 'unknown'

In [9]:

# label a Twitter user's gender by matching name list.
import re
def gender_by_name(tweets, male_names, female_names):
    for t in tweets:
        name = t['user']['name']
        if name:
            # remove punctuation.
            name_parts = re.findall('\w+', name.split()[0].lower())
            if len(name_parts) > 0:
                first = name_parts[0].lower()
                if first in male_names:
                    t['gender'] = 'male'
                elif first in female_names:
                    t['gender'] = 'female'
                else:
                    t['gender'] = 'unknown'

gender_by_name(tweets, male_names, female_names)
# What's wrong with this approach?

In [10]:

from collections import Counter

def print_genders(tweets):
    counts = Counter([t['gender'] for t in tweets])
    print('%.2f of accounts are labeled with gender' % 
          ((counts['male'] + counts['female']) / sum(counts.values())))
    print('gender counts:\n', counts)
    for t in tweets[:20]:
        print(t['gender'], t['user']['name'])
    
print_genders(tweets)

0.27 of accounts are labeled with gender
gender counts:
 Counter({'unknown': 715, 'female': 143, 'male': 123})
unknown Arzunaz Üreyen
female Ann
unknown flexlex 🍒✨
male jules!
female MARLA RT
unknown Freya 🏳️‍🌈
female ًvictoria
unknown Blue PandaNW
unknown Pierrot Kwame
unknown Öykü Su Unay
unknown lots✈️bts wembley
unknown 🍒
unknown 😽😸Meow Meow March😸😽
unknown fazli
unknown Tyrion_Lannister
unknown 💜~Nessa~OwO💜
unknown محمد المولد xboxone 🎮 🖋
unknown Cruiz_Senior
male Carmen JMO💖🌟
unknown tyb

In [11]:

# What about ambiguous names?
def print_ambiguous_names(male_names, female_names):
    ambiguous = [n for n in male_names if n in female_names]  # names on both lists
    print('found %d ambiguous names:\n'% len(ambiguous))
    print('\n'.join(ambiguous[:20]))
    
print_ambiguous_names(male_names, female_names)

found 331 ambiguous names:

jose
roy
matthew
man
dale
gail
sung
thomas
tommie
charlie
claude
chong
stephen
patrick
dorian
angelo
jay
lewis
dusty
son

In [12]:

# Keep names that are more frequent in one gender than the other.
def get_percents(name_list):
    # parse raw data to extract, e.g., the percent of males names John.
    return dict([(n.split()[0].lower(), float(n.split()[1]))
                  for n in name_list if n])

males_pct = get_percents(males)
females_pct = get_percents(females)

# Assign a name as male if it is more common among males than femals.
male_names = set([m for m in male_names if m not in female_names or
              males_pct[m] > females_pct[m]])
female_names = set([f for f in female_names if f not in male_names or
              females_pct[f] > males_pct[f]])

print_ambiguous_names(male_names, female_names)
print('%d male and %d female names' % (len(male_names), len(female_names)))

found 0 ambiguous names:


1146 male and 4017 female names

In [13]:

# Relabel twitter users (compare with above)
gender_by_name(tweets, male_names, female_names)
print_genders(tweets)

0.27 of accounts are labeled with gender
gender counts:
 Counter({'unknown': 715, 'female': 155, 'male': 111})
unknown Arzunaz Üreyen
female Ann
unknown flexlex 🍒✨
male jules!
female MARLA RT
unknown Freya 🏳️‍🌈
female ًvictoria
unknown Blue PandaNW
unknown Pierrot Kwame
unknown Öykü Su Unay
unknown lots✈️bts wembley
unknown 🍒
unknown 😽😸Meow Meow March😸😽
unknown fazli
unknown Tyrion_Lannister
unknown 💜~Nessa~OwO💜
unknown محمد المولد xboxone 🎮 🖋
unknown Cruiz_Senior
female Carmen JMO💖🌟
unknown tyb

In [14]:

# Who are the unknowns?
# "Filtered" data can have big impact on analysis.
unknown_names = Counter(t['user']['name']
                        for t in tweets if t['gender'] == 'unknown')
unknown_names.most_common(20)

Out[14]:

[('.', 4),
 ('M', 3),
 ('Fra🌸', 2),
 ('🌙', 2),
 ('💜', 2),
 ('𝘲𝘶𝘦𝘦𝘯𝘯𝘢𝘪𝘫𝘢2𝘹🥰', 2),
 ('ً', 2),
 ('Marius Black. 🦋', 2),
 ('Krggzddd', 2),
 ('Mehmet Sarıaslan', 2),
 ('Urim Ejupi', 2),
 ('Arzunaz Üreyen', 1),
 ('flexlex 🍒✨', 1),
 ('Freya 🏳️\u200d🌈', 1),
 ('Blue PandaNW', 1),
 ('Pierrot Kwame', 1),
 ('Öykü Su Unay', 1),
 ('lots✈️bts wembley', 1),
 ('🍒', 1),
 ('😽😸Meow Meow March😸😽', 1)]

In [28]:

# How do the profiles of male Twitter users differ from
# those of female users?

male_profiles = [t['user']['description'] for t in tweets
                if t['gender'] == 'male']

female_profiles = [t['user']['description'] for t in tweets
                if t['gender'] == 'female']
#male_profiles = [t['text'] for t in tweets
#                if t['gender'] == 'male']

#female_profiles = [t['text'] for t in tweets
#                if t['gender'] == 'female']

import re
def tokenize(s):
    return re.sub('\W+', ' ', s).lower().split() if s else []

male_words = Counter()
female_words = Counter()

for p in male_profiles:
    male_words.update(Counter(tokenize(p)))
                      
for p in female_profiles:
    female_words.update(Counter(tokenize(p)))

print('Most Common Male Terms:')
pprint(male_words.most_common(10))
    
print('\nMost Common Female Terms:')
pprint(female_words.most_common(10))

Most Common Male Terms:
[('of', 25),
 ('and', 24),
 ('i', 23),
 ('the', 14),
 ('my', 11),
 ('in', 10),
 ('me', 9),
 ('life', 9),
 ('you', 8),
 ('for', 8)]

Most Common Female Terms:
[('i', 38),
 ('the', 32),
 ('and', 28),
 ('a', 19),
 ('my', 19),
 ('you', 17),
 ('to', 16),
 ('of', 15),
 ('is', 14),
 ('in', 13)]

In [29]:

print(len(male_words))
print(len(female_words))

764
921

In [30]:

# Compute difference
diff_counts = dict([(w, female_words[w] - male_words[w])
                    for w in
                    set(female_words.keys()) | set(male_words.keys())])

sorted_diffs = sorted(diff_counts.items(), key=lambda x: x[1])

print('Top Male Terms (diff):')
pprint(sorted_diffs[:10])

print('\nTop Female Terms (diff):')
pprint(sorted_diffs[-10:])

Top Male Terms (diff):
[('of', -10),
 ('father', -7),
 ('http', -6),
 ('for', -5),
 ('people', -5),
 ('university', -4),
 ('not', -4),
 ('about', -4),
 ('com', -4),
 ('play', -4)]

Top Female Terms (diff):
[('just', 7),
 ('she', 7),
 ('my', 8),
 ('s', 9),
 ('you', 9),
 ('to', 10),
 ('is', 11),
 ('a', 12),
 ('i', 15),
 ('the', 18)]

** A problem with difference of counts:**

What if we have more male than female words in total?

Instead, consider "the probability that a male user writes the word w"

$p(w|male) = \frac{freq(w, male)} {\sum_i freq(w_i, male)}$

** Odds Ratio (OR)**

The ratio of the probabilities for a word from each class:

$OR(w) = \frac{p(w|female)}{p(w|male)}$

High values --> more likely to be written by females
Low values --> more likely to be written by males

In [31]:

def counts_to_probs(gender_words):
    """ Compute probability of each term according to the frequency
    in a gender. """
    total = sum(gender_words.values())
    return dict([(word, count / total)
                 for word, count in gender_words.items()])

male_probs = counts_to_probs(male_words)
female_probs = counts_to_probs(female_words)

print('p(w|male)')
pprint(sorted(male_probs.items(), key=lambda x: -x[1])[:10])

print('\np(w|female)')
pprint(sorted(female_probs.items(), key=lambda x: -x[1])[:10])

p(w|male)
[('of', 0.021872265966754154),
 ('and', 0.02099737532808399),
 ('i', 0.020122484689413824),
 ('the', 0.012248468941382326),
 ('my', 0.009623797025371828),
 ('in', 0.008748906386701663),
 ('me', 0.007874015748031496),
 ('life', 0.007874015748031496),
 ('you', 0.00699912510936133),
 ('for', 0.00699912510936133)]

p(w|female)
[('i', 0.02615278733654508),
 ('the', 0.02202339986235375),
 ('and', 0.019270474879559532),
 ('a', 0.01307639366827254),
 ('my', 0.01307639366827254),
 ('you', 0.01169993117687543),
 ('to', 0.011011699931176875),
 ('of', 0.01032346868547832),
 ('is', 0.009635237439779766),
 ('in', 0.008947006194081212)]

In [32]:

def odds_ratios(male_probs, female_probs):
    return dict([(w, female_probs[w] / male_probs[w])
                 for w in
                 set(male_probs) | set(female_probs)])

ors = odds_ratios(male_probs, female_probs)

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-32-04756cf63b36> in <module>()
      4                  set(male_probs) | set(female_probs)])
      5 
----> 6 ors = odds_ratios(male_probs, female_probs)

<ipython-input-32-04756cf63b36> in odds_ratios(male_probs, female_probs)
      2     return dict([(w, female_probs[w] / male_probs[w])
      3                  for w in
----> 4                  set(male_probs) | set(female_probs)])
      5 
      6 ors = odds_ratios(male_probs, female_probs)

<ipython-input-32-04756cf63b36> in <listcomp>(.0)
      1 def odds_ratios(male_probs, female_probs):
      2     return dict([(w, female_probs[w] / male_probs[w])
----> 3                  for w in
      4                  set(male_probs) | set(female_probs)])
      5 

KeyError: 'state'

In [38]:

print(len(male_probs))
print(len(female_probs))
female_but_not_male = set(female_probs) - set(male_probs)
print('%d words in female_probs but not in male_probs' % len(female_but_not_male))
fem_word = list(female_but_not_male)[-10]
print(fem_word)
print(female_probs[fem_word])
#'selfcare' in male_probs

764
921
744 words in female_probs but not in male_probs
arizona
0.0006882312456985547

** How to deal with 0-probabilities? **

$p(w|male) = \frac{freq(w, male)} {\sum_i freq(w_i, male)}$

$freq(w, male) = 0$

Do we really believe there is 0 probability of a male using this term?

(Recall over-fitting discussion.)

** Additive Smoothing **

Reserve small amount of counts (e.g., 1) for unseen observations.

E.g., assume we've seen each word at least once in each class.

$p(w|male) = \frac{1 + freq(w, male)} {|W| + \sum_i freq(w_i, male)}$

$|W|$ : number of unique words.

In [39]:

# Additive smoothing. Add count of 1 for all words.
all_words = set(male_words) | set(female_words)
male_words.update(all_words)  
female_words.update(all_words)

male_probs = counts_to_probs(male_words)
female_probs = counts_to_probs(female_words)
print('\n'.join(str(x) for x in 
                sorted(male_probs.items(), key=lambda x: -x[1])[:10]))

('of', 0.00980761976612599)
('and', 0.00943040362127499)
('i', 0.009053187476423991)
('the', 0.005658242172764994)
('my', 0.004526593738211996)
('in', 0.004149377593360996)
('me', 0.003772161448509996)
('life', 0.003772161448509996)
('you', 0.0033949453036589967)
('for', 0.0033949453036589967)

In [41]:

# Even though word doesn't appear, has non-zero probability.
print(male_probs[fem_word])

0.0003772161448509996

In [42]:

ors = odds_ratios(male_probs, female_probs)

sorted_ors = sorted(ors.items(), key=lambda x: -x[1])

print('Top Female Terms (OR):')
pprint(sorted_ors[:20])

print('\nTop Male Terms (OR):')
pprint(sorted_ors[-20:])

Top Female Terms (OR):
[('she', 7.162445119891929),
 ('just', 7.162445119891929),
 ('mom', 6.267139479905437),
 ('her', 5.3718338399189465),
 ('so', 4.476528199932455),
 ('fishing', 4.476528199932455),
 ('can', 4.476528199932455),
 ('women', 4.476528199932455),
 ('great', 4.476528199932455),
 ('प', 4.476528199932455),
 ('we', 4.476528199932455),
 ('21', 4.476528199932455),
 ('am', 4.02887537993921),
 ('that', 3.5812225599459646),
 ('ever', 3.5812225599459646),
 ('got', 3.5812225599459646),
 ('eu', 3.5812225599459646),
 ('public', 3.5812225599459646),
 ('त', 3.5812225599459646),
 ('sc', 3.5812225599459646)]

Top Male Terms (OR):
[('ly', 0.2984352133288304),
 ('want', 0.2984352133288304),
 ('state', 0.2238264099966228),
 ('reflect', 0.2238264099966228),
 ('𝗍𝗁𝖾', 0.2238264099966228),
 ('education', 0.2238264099966228),
 ('all', 0.2238264099966228),
 ('never', 0.2238264099966228),
 ('retired', 0.2238264099966228),
 ('owner', 0.2238264099966228),
 ('youtube', 0.2238264099966228),
 ('school', 0.2238264099966228),
 ('two', 0.2238264099966228),
 ('proud', 0.2238264099966228),
 ('some', 0.2238264099966228),
 ('university', 0.17906112799729823),
 ('about', 0.17906112799729823),
 ('play', 0.17906112799729823),
 ('people', 0.1492176066644152),
 ('father', 0.1119132049983114)]