# Show the default string encoding used by the interpreter's Unicode
# implementation.
import sys
print sys.getdefaultencoding()

s='€';print sSyntaxError: Non-ASCII character '\xe2' in file encoding_test.py on line 2, but no encoding declared; see http://www.python.org/peps/pep-0263.html for details

# Show the text encoding preferred by the current locale settings.
import locale
print locale.getpreferredencoding()

# IPython shell escapes (not plain Python): dump each sample file with `cat`.
# NOTE(review): the file names suggest each file is stored in a different
# encoding, so some may display garbled in the terminal -- confirm against
# the actual data files.
!cat ../data/ASCII_file.txt

!cat ../data/latin_1_file.txt

!cat ../data/mac_os_roman_file.txt

!cat ../data/utf-8_file.txt

# coding:utf-8
# The declaration above only takes effect on the first or second line of its
# source file (PEP 263); it allows the byte-string literal below to contain
# non-ASCII characters.
tokyo='東京'
print tokyo


# Build the euro sign from its code point (8364 == 0x20AC) and print the
# character, its ordinal, its Unicode general category and its official name.
import unicodedata
u=unichr(8364)
print u,ord(u),unicodedata.category(u),unicodedata.name(u)

# A unicode literal can name a character through its \uXXXX escape ...
u=u'This string includes an \u20AC sign'
print u

# coding:utf-8
# ... or contain the character itself; the source file then needs an encoding
# declaration like the one above on its first or second line (PEP 263).
u=u'This string includes an € sign'
print u

# unicode() with no encoding argument decodes with the interpreter's default
# codec, which is enough for pure-ASCII input.
u=unicode('This strings is ascii')
print u, type(u)

# The escaped bytes \xE2\x82\xAC are decoded explicitly as UTF-8.
u=unicode('This string contains \xE2\x82\xAC', encoding='utf-8')
print u,type(u)

# Same bytes with no encoding given. NOTE(review): expected to raise
# UnicodeDecodeError when the default codec is ascii.
u=unicode('This string contains \xE2\x82\xAC')
print u,type(u)


# Same call with the character typed directly in the byte-string literal.
# NOTE(review): expected to fail the same way as the previous call.
u=unicode('This string contains €')
print u,type(u)


ascii_file =open('../data/ASCII_file.txt')
for line in ascii_file:
        print line

utf_8_file =open('../data/utf-8_file.txt')
for line in utf_8_file:
        print line, type(line)

latin_1_file =open('../data/latin_1_file.txt')
for line in latin_1_file:
        print line

latin_1_file =open('../data/latin_1_file.txt')
for line in latin_1_file:
        print line.decode('latin-1')


import codecs
utf_8_file =codecs.open('../data/UTF-8_file.txt', encoding='utf-8')
for line in utf_8_file:
        print type(line),line


# A unicode string whose three accented characters are written as \u escapes.
s=u'Alegr\u00EDa c\u00E1mara ilusi\u00F3n'

print s, type(s)

# encode() turns the unicode string into a byte string in the given codec.
s1=s.encode('latin-1')
print s1,type(s1)

# With no argument encode() uses the default codec. NOTE(review): expected to
# raise UnicodeEncodeError when that default is ascii.
print s.encode()

# coding:utf-8
# The two Japanese characters have no latin-1 representation, so encoding with
# the default 'strict' error handling is expected to raise UnicodeEncodeError.
tokyo=u'This is 東京, boy!'
print tokyo.encode('latin-1')


# 'ignore' drops the unencodable characters; 'replace' substitutes '?'.
print tokyo.encode('latin-1','ignore')
print tokyo.encode('latin-1','replace')


# 'backslashreplace' writes them as backslashed escape sequences instead.
print tokyo.encode('latin-1','backslashreplace')


# Requirements: NLTK and TreeTagger must be installed, and the TREETAGGER_HOME
# environment variable must be set for this to work, e.g.:
# export TREETAGGER_HOME='/path/to/your/TreeTagger/'
#
from treetagger import TreeTagger


# Tag a byte string containing a euro sign with a tagger declared as latin-1.
tt=TreeTagger(language='english',encoding='latin-1')
tagged_sent=tt.tag('What is the airspeed of an unladen swallow? And what about the € sign?')
print tagged_sent
# Each tagged item unpacks as (word, pos, lemma); show it as word/pos pairs.
print '\nReadable version:'+' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])


# len() of a byte string counts bytes, not characters. NOTE(review): if this
# source is saved as UTF-8 the euro sign occupies three bytes (\xE2\x82\xAC).
print len('€')

# Same sentence with '€' versus '$'; any difference in length comes from the
# number of bytes used to store '€'.
print len('What is the airspeed of an unladen swallow? And what about the € sign?')
print len('What is the airspeed of an unladen swallow? And what about the $ sign?')

# Tag the same sentence again, this time passed as a unicode literal instead
# of a byte string. Relies on the tagger `tt` created earlier in the script.
tagged_sent=tt.tag(u'What is the airspeed of an unladen swallow? And what about the € sign?')
print tagged_sent
# Each tagged item unpacks as (word, pos, lemma); show it as word/pos pairs.
print '\nReadable version:'+' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])


# Spanish tagger declared as utf8, fed a unicode literal ...
tt=TreeTagger(language='spanish',encoding='utf8')
tagged_sent=tt.tag(u'¿Podremos taggear esto? ¿Y qué pasa con el signo de €? ')
print '\nReadable version:'+' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])


# ... and the same sentence passed as a plain byte string.
tt=TreeTagger(language='spanish',encoding='utf8')
tagged_sent=tt.tag('¿Podremos taggear esto? ¿Y qué pasa con el signo de €? ')
print '\nReadable version:'+' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])


# There is a bug in the Python treetagger module: when a language is available
# in only one encoding, the module assumes that encoding is latin-1.
# Bulgarian, for example, is only available in utf-8,
# so here we tell the module that the encoding is latin-1 anyway.
tt=TreeTagger(language='bulgarian',encoding='latin-1')
tagged_sent=tt.tag('Това е моят дом')
print '\nReadable version:'+' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])


import codecs
tt=TreeTagger(language='spanish',encoding='utf8')
f = codecs.open('../data/latin_1_file.txt', encoding='latin-1')
sents=f.readlines()
spanish_sent=sents[2]
tagged_sent=tt.tag(spanish_sent)
print type(spanish_sent),spanish_sent, ' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])

f = codecs.open('../data/utf-8_file.txt', encoding='utf-8')
sents=f.readlines()
for sent in sents:
    tagged_sent=tt.tag(sent)
    print '\n',type(sent),sent, ' '.join([word+'/'+pos for (word,pos,lemma) in tagged_sent])