#!/usr/bin/env python
# coding: utf-8

# # Tokenización con NLTK

# Cargamos los archivos de texto:

# In[50]:


from nltk.corpus import PlaintextCorpusReader


# In[51]:


# Build a reader over the single file '1.txt' in the current directory.
# No tokenizer is passed, so the reader's defaults are used.
corpus = PlaintextCorpusReader(root='.', fileids='1.txt')


# With this setup the abbreviation 'Sr.', for example, is tokenized incorrectly:

# In[52]:


corpus.sents()[0]


# The help for PlaintextCorpusReader shows that we can supply our own tokenizer:

# In[ ]:


help(PlaintextCorpusReader)


# Un tokenizador posible es el RegexpTokenizer:

# In[54]:


from nltk.tokenize import RegexpTokenizer


# In[ ]:


# Show the built-in documentation of RegexpTokenizer before using it.
help(RegexpTokenizer)


# Más documentación en:
# - http://www.nltk.org/book/ch03.html#regular-expressions-for-tokenizing-text
# - http://www.nltk.org/api/nltk.tokenize.html
# 
# Por ejemplo, en la documentación podemos encontrar el siguiente patrón de tokenización:

# In[55]:


# Tokenization pattern taken from the NLTK documentation referenced above.
# Alternatives are tried left to right at each position, so the more specific
# ones (abbreviations) are listed before the generic ones (words, punctuation).
#
# NOTE: in the final character class the hyphen must come last. Written as
# `:-_` it is parsed as the range ':'..'_', which never matches a literal '-'
# and unintentionally matches characters such as '<', '=', '>', '@' and '^'.
pattern = r'''(?x)    # set flag to allow verbose regexps
     (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*        # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():_`-]  # these are separate tokens; includes ], [ and a literal -
'''


# Instantiate the tokenizer with this pattern, rebuild the corpus reader with that tokenizer, and see what we get:

# In[56]:


# Wrap the regular expression in a tokenizer object.
tokenizer = RegexpTokenizer(pattern=pattern)


# In[57]:


# Read the same file again, this time splitting words with the custom tokenizer.
corpus = PlaintextCorpusReader(
    root='.',
    fileids='1.txt',
    word_tokenizer=tokenizer,
)


# In[58]:


corpus.sents()


# Sigue mal, pero podemos arreglar el patrón de tokenización para que tome la abreviación 'Sr.' (y 'sra.' ya que estamos).
# Además, le indicamos que ignore mayúsculas agregando el flag 'i' al principio:

# In[59]:


# Same tokenization pattern, extended so that the honorifics 'Sr.' and 'Sra.'
# are kept as single tokens. The `i` flag makes the whole pattern
# case-insensitive, so 'sr.', 'Sr.' and 'SRA.' are all recognized.
#
# NOTE: in the final character class the hyphen must come last. Written as
# `:-_` it is parsed as the range ':'..'_', which never matches a literal '-'
# and unintentionally matches characters such as '<', '=', '>', '@' and '^'.
pattern = r'''(?ix)    # flags: i = ignore case, x = verbose regexp
      (?:sra?\.)          # honorifics: sr. / sra.
    | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \.\.\.            # ellipsis
    | [][.,;"'?():_`-]  # these are separate tokens; includes ], [ and a literal -
'''


# In[60]:


# Rebuild the tokenizer from the updated pattern.
tokenizer = RegexpTokenizer(pattern=pattern)


# In[61]:


# Reload the corpus so that it uses the new tokenizer.
corpus = PlaintextCorpusReader(
    root='.',
    fileids='1.txt',
    word_tokenizer=tokenizer,
)


# In[62]:


# Inspect the first sentence again to check how it is tokenized now.
corpus.sents()[0]


# ¡Listo!