Goals of this talk

  • Explore NLTK
  • Have fun thinking about Lexical Graphs
  • Explore the possibility of comparing differences between languages using NLP
In [1]:
import nltk

nltk ships with a download utility for downloading grammars and corpora.

In [2]:
nltk.download()
showing info http://nltk.github.com/nltk_data/

Out[2]:
True

Density

In [3]:
from nltk import word_tokenize as tokenize
In [4]:
nltk.pos_tag(tokenize("The quick brown fox jumps over the lazy dog."))
Out[4]:
[('The', 'DT'),
 ('quick', 'NN'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'NNS'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'NN'),
 ('dog', 'NN'),
 ('.', '.')]
In [5]:
nltk.pos_tag(tokenize("If I were you I wouldn't do that with these."))
Out[5]:
[('If', 'IN'),
 ('I', 'PRP'),
 ('were', 'VBD'),
 ('you', 'PRP'),
 ('I', 'PRP'),
 ('would', 'MD'),
 ("n't", 'RB'),
 ('do', 'VB'),
 ('that', 'DT'),
 ('with', 'IN'),
 ('these', 'DT'),
 ('.', '.')]

Create a density checker

In [6]:
import re

def matches(x, re_parts):
    """True if string `x` matches at least one regex pattern in `re_parts`."""
    return any(re.search(y, x) for y in re_parts)

# Regex patterns identifying "content" (lexical) POS tags per tagset:
#   en -> Penn Treebank (nltk.pos_tag), pt -> Floresta, es -> cess_esp.
CONTENT_TAGS = {
    'en': (r"^NN", r"^JJ", r"^V"),
    'pt': (r"\+n", r"\+adj", r"\+v"),
    'es': (r"^v", r"^a", r"^n"),
}

def density(lang_code, data):
    """Fraction of (token, tag) pairs in `data` tagged as content words.

    lang_code: 'en', 'pt' or 'es'; any other code raises ValueError
        (previously an unknown code crashed with UnboundLocalError).
    data: sequence of (token, tag) pairs.
    Returns a float in [0, 1]; 0.0 for empty input (previously this
    divided by zero).
    """
    try:
        content_tags = CONTENT_TAGS[lang_code]
    except KeyError:
        raise ValueError("unsupported language code: %r" % (lang_code,))
    if not data:
        return 0.0
    # sum over a generator works on both Python 2 and 3, unlike
    # len(filter(...)) which breaks on Python 3's lazy filter.
    hits = sum(1 for pair in data if matches(pair[1], content_tags))
    return hits / float(len(data))

assert density('en', [(1, "NN"), (2, "XX")]) == .5
assert matches("H+n", ("n", "adj"))
assert density('pt', [(1, "H+n"), (2, "H+xxx")]) == .5
assert density('es', [(1, "xxxx"), (2, "vmip3s0")]) == .5

Back to our original example: one phrase is more dense than the other.

In [19]:
# The idiomatic English sentence is less lexically dense than the
# "quick brown fox" pangram, which is almost entirely content words.
assert density("en", nltk.pos_tag(tokenize("If I were you I wouldn’t do that with these."))) \
       < density("en", nltk.pos_tag(tokenize("The quick brown fox jumps over the lazy dog.")))

Tag Spanish Text

In [20]:
from nltk.corpus import cess_esp
sents = cess_esp.tagged_sents()

Split into training and test set

In [21]:
# 90/10 train/test split of the tagged sentences.
training_dx = int(len(sents)*90/100)
training = sents[:training_dx]
# Fix: was sents[training_dx+1:], which silently dropped the sentence at
# index training_dx from BOTH sets; slicing from training_dx makes the
# split exhaustive and non-overlapping.
test = sents[training_dx:]

train tagger and check accuracy (this takes 40 seconds or so) ...

In [22]:
from nltk import HiddenMarkovModelTagger
# Train an HMM POS tagger on the Spanish training sentences (takes ~40s),
# then report accuracy on the held-out test set as the cell's output.
spanish_tagger = HiddenMarkovModelTagger.train(training)
# NOTE(review): evaluate() was renamed accuracy() in newer NLTK releases.
'accuracy %.1f %%' % (spanish_tagger.evaluate(test) * 100)
Out[22]:
'accuracy 84.9 %'
In [23]:
spanish_tagger.tag(tokenize("A buen entendedor, pocas palabras bastan."))
Out[23]:
[('A', 'sps00'),
 ('buen', 'aq0ms0'),
 ('entendedor', 'np0000p'),
 (',', 'Fc'),
 ('pocas', 'di0fp0'),
 ('palabras', 'ncfp000'),
 ('bastan', 'aq0fp0'),
 ('.', 'Fp')]
In [24]:
spanish_tagger.tag(tokenize("El gato blanco se sentó en la alfombra."))
Out[24]:
[('El', 'da0ms0'),
 ('gato', 'ncms000'),
 ('blanco', 'aq0ms0'),
 ('se', 'p0300000'),
 ('sent\xc3\xb3', 'vmip3s0'),
 ('en', 'sps00'),
 ('la', 'da0fs0'),
 ('alfombra', 'ncfs000'),
 ('.', 'Fp')]

Now Portuguese

In [25]:
from nltk.corpus import floresta
sents = floresta.tagged_sents()

# 90/10 train/test split.
# Fix: test previously started at training_dx+1, silently dropping the
# sentence at index training_dx from both sets.
training_dx = int(len(sents)*90/100)
training = sents[:training_dx]
test = sents[training_dx:]

# Train an HMM tagger for Portuguese and report held-out accuracy.
port_tagger = HiddenMarkovModelTagger.train(training)
'accuracy %.1f %%' % (port_tagger.evaluate(test) * 100)
Out[25]:
'accuracy 85.5 %'

Cross Language Testing

Language test data: a source string and its translations, exported from Transifex.

In [26]:
# Test fixture: one English source string with its human translations.
# `pt` and `es` are lists so that multiple alternative translations per
# language can be compared.
strings = [dict(
           en_source="Without a secure random number generator an attacker"
                     " may be able to predict password reset tokens and take"
                     " over your account.",
           pt=[
               # portuguese translation 1
               "Sem nenhum gerador seguro de números aleatórios, uma pessoa mal"
               " intencionada pode prever a sua password, reiniciar as seguranças"
               " adicionais e tomar conta da sua conta."
               ],
           es=[
               # spanish translation 1
               "Sin un generador de números aleatorios seguro, un atacante podría"
               " predecir los tokens de restablecimiento de contraseñas y tomar"
               " el control de su cuenta.",
               # spanish translation 2
               "Sin un generador de números aleatorios seguro un atacante podría"
               " predecir los tokens de reinicio de su contraseña y tomar control"
               " de su cuenta."
               
               ])]
In [28]:
import numpy as np

# Compare the lexical density of each English source string against the
# mean density of its translations, tagged with the language-specific
# HMM taggers trained above.
for data_dict in strings:
    en_density = density("en", nltk.pos_tag(tokenize(data_dict['en_source'])))
    print "english", en_density
    # Average over all alternative translations for each target language.
    es_densities = np.mean([density("es", spanish_tagger.tag(tokenize(x))) for x in data_dict['es']])
    print "spanish", es_densities
    pt_densities = np.mean([density("pt", port_tagger.tag(tokenize(x))) for x in data_dict['pt']])
    print "portuguese", pt_densities
english 0.590909090909
spanish 0.431538461538
portuguese 0.379310344828

In [29]:
.59 - .43
Out[29]:
0.15999999999999998
In [30]:
.59 - .37
Out[30]:
0.21999999999999997
In [31]:
.43 - .37
Out[31]:
0.06

Other fun stuff

http://nltk.org/book/

 Natural Language Processing with Python

--- Analyzing Text with the Natural Language Toolkit

Steven Bird, Ewan Klein, and Edward Loper

In [8]:
from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908

In [10]:
nltk.book.text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
In [12]:
def tabulate(cfdist, words, categories):
    """Print a table of cfdist[category][word] counts.

    Rows are `categories`, columns are `words`; the first column is the
    category label.  Python 2: the trailing commas on the print
    statements suppress newlines so each row stays on one line.
    """
    print '%-16s' % 'Category',
    for word in words:                                  # column headings
        print '%6s' % word,
    print
    for category in categories:
        print '%-16s' % category,                       # row heading
        for word in words:                              # for each word
            print '%6d' % cfdist[category][word],       # print table cell
        print                                           # end the row
In [13]:
from nltk.corpus import brown

# Conditional frequency distribution: word counts conditioned on Brown
# corpus genre (builds the full distribution; takes a few seconds).
cfd = nltk.ConditionalFreqDist(
          (genre, word)
          for genre in brown.categories()
          for word in brown.words(categories=genre))
# Tabulate modal-verb usage across a selection of genres.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
Category            can  could    may  might   must   will
news                 93     86     66     38     50    389
religion             82     59     78     12     54     71
hobbies             268     58    131     22     83    264
science_fiction      16     49      4     12      8     16
romance              74    193     11     51     45     43
humor                16     30      8      8      9     13

In [14]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
Building word-context index...
man day time year car moment world family house boy child country job
state girl place war way case question

In [15]:
text.similar('bought')
made done put said found had seen given left heard been brought got
set was called felt in that told

grammar and logic

In [16]:
# First-order model checking: build a tiny model (three individuals,
# unary and binary relations) and evaluate a parsed sentence against it.
v = """
bertie => b
olive => o
cyril => c
boy => {b}
girl => {o}
dog => {c}
walk => {o, c}
see => {(b, o), (c, b), (o, c)}
"""
val = nltk.parse_valuation(v)  # NOTE(review): renamed Valuation.fromstring in newer NLTK
g = nltk.Assignment(val.domain)  # empty variable assignment over the domain
m = nltk.Model(val.domain, val)
sent = 'Cyril sees every boy'
# Feature grammar with semantic annotations, shipped with nltk_data.
grammar_file = 'grammars/book_grammars/simple-sem.fcfg'
# Parse the sentence, derive its logical form, and evaluate it in the model.
# NOTE(review): batch_evaluate was renamed evaluate_sents in newer NLTK.
results = nltk.batch_evaluate([sent], grammar_file, m, g)[0]
for (syntree, semrep, value) in results:
    print semrep
    print value
all z1.(boy(z1) -> see(cyril,z1))
True

In [17]:
# Same model and grammar, but with the existential determiner "a":
# the derived logical form uses an existential quantifier instead.
sent = 'Cyril sees a boy'
results = nltk.batch_evaluate([sent], grammar_file, m, g)[0]
for (syntree, semrep, value) in results:
    print semrep
    print value
exists z2.(boy(z2) & see(cyril,z2))
True

In [*]:
nltk.download()
In [ ]: