# coding: utf-8
"""Python 2 walkthrough of text encodings: ``str`` vs ``unicode``, files, TreeTagger.

This script targets Python 2: it relies on ``unicode``, ``unichr`` and
``str.decode``.  The source-encoding declaration must stay on the first
line (PEP 263) because the file contains non-ASCII string literals.

The stdlib sections read sample files from ``../data/``.  The last section
additionally needs NLTK, the ``treetagger`` module and a TreeTagger install
pointed to by the ``TREETAGGER_HOME`` environment variable.
"""
from __future__ import print_function

import codecs
import locale
import sys
import unicodedata

# Sample files, each holding text stored in the encoding its name states.
DATA_DIR = '../data/'
ASCII_PATH = DATA_DIR + 'ASCII_file.txt'
LATIN_1_PATH = DATA_DIR + 'latin_1_file.txt'
MAC_OS_ROMAN_PATH = DATA_DIR + 'mac_os_roman_file.txt'
UTF_8_PATH = DATA_DIR + 'utf-8_file.txt'


def _cat(path):
    """Write the raw, undecoded bytes of *path* to stdout, like shell ``cat``."""
    with open(path, 'rb') as raw:
        sys.stdout.write(raw.read())


def _readable(tagged_sent):
    """Join ``(word, pos, lemma)`` triples as space-separated ``word/pos``."""
    return ' '.join(word + '/' + pos for word, pos, _lemma in tagged_sent)


# ---------------------------------------------------------------------------
# Default encodings
# ---------------------------------------------------------------------------
print(sys.getdefaultencoding())

# Saved as encoding_test.py *without* a coding declaration, the two
# statements below are rejected by Python 2 with:
#   SyntaxError: Non-ASCII character '\xe2' in file encoding_test.py on
#   line 2, but no encoding declared; see
#   http://www.python.org/peps/pep-0263.html for details
s = '€'
print(s)

print(locale.getpreferredencoding())

# Dump every sample file byte-for-byte; how each one renders depends on the
# encoding the terminal expects.
for sample_path in (ASCII_PATH, LATIN_1_PATH, MAC_OS_ROMAN_PATH, UTF_8_PATH):
    _cat(sample_path)

# ---------------------------------------------------------------------------
# Byte strings and unicode objects
# ---------------------------------------------------------------------------
# A plain literal holds the bytes exactly as they are stored in this file.
tokyo = '東京'
print(tokyo)

u = unichr(8364)
print(u, ord(u), unicodedata.category(u), unicodedata.name(u))

u = u'This string includes an \u20AC sign'
print(u)

u = u'This string includes an € sign'
print(u)

u = unicode('This strings is ascii')
print(u, type(u))

u = unicode('This string contains \xE2\x82\xAC', encoding='utf-8')
print(u, type(u))

# No encoding given: unicode() falls back to the default codec printed at
# the top of the script.  When that is 'ascii' the non-ASCII bytes cannot be
# decoded, so report the error instead of aborting the walkthrough.
try:
    u = unicode('This string contains \xE2\x82\xAC')
    print(u, type(u))
except UnicodeDecodeError as err:
    print('UnicodeDecodeError:', err)

# Same failure, with the non-ASCII bytes coming from the source literal.
try:
    u = unicode('This string contains €')
    print(u, type(u))
except UnicodeDecodeError as err:
    print('UnicodeDecodeError:', err)

# ---------------------------------------------------------------------------
# Reading files
# ---------------------------------------------------------------------------
with open(ASCII_PATH) as ascii_file:
    for line in ascii_file:
        print(line)

with open(UTF_8_PATH) as utf_8_file:
    for line in utf_8_file:
        print(line, type(line))

# Lines are printed undecoded here ...
with open(LATIN_1_PATH) as latin_1_file:
    for line in latin_1_file:
        print(line)

# ... and decoded from latin-1 to unicode before printing here.
with open(LATIN_1_PATH) as latin_1_file:
    for line in latin_1_file:
        print(line.decode('latin-1'))

# codecs.open decodes while reading, so every line is already unicode.
with codecs.open(UTF_8_PATH, encoding='utf-8') as utf_8_file:
    for line in utf_8_file:
        print(type(line), line)

# ---------------------------------------------------------------------------
# Encoding unicode back to bytes
# ---------------------------------------------------------------------------
s = u'Alegr\u00EDa c\u00E1mara ilusi\u00F3n'
print(s, type(s))
s1 = s.encode('latin-1')
print(s1, type(s1))

# encode() without arguments uses the default codec; with 'ascii' the
# accented characters cannot be represented.
try:
    print(s.encode())
except UnicodeEncodeError as err:
    print('UnicodeEncodeError:', err)

tokyo = u'This is 東京, boy!'
# latin-1 has no representation for the Japanese characters: strict mode
# raises, the three error handlers below show the alternatives.
try:
    print(tokyo.encode('latin-1'))
except UnicodeEncodeError as err:
    print('UnicodeEncodeError:', err)
print(tokyo.encode('latin-1', 'ignore'))
print(tokyo.encode('latin-1', 'replace'))
print(tokyo.encode('latin-1', 'backslashreplace'))

# ---------------------------------------------------------------------------
# TreeTagger
# ---------------------------------------------------------------------------
# You must have NLTK installed, treetagger installed and the TREETAGGER_HOME
# env variable set for this to work:
#   export TREETAGGER_HOME='/path/to/your/TreeTagger/'
# Imported here rather than at the top so the stdlib sections above still run
# when the tagger is not installed.
from treetagger import TreeTagger

tt = TreeTagger(language='english', encoding='latin-1')
tagged_sent = tt.tag('What is the airspeed of an unladen swallow? \n'
                     'And what about the € sign?')
print(tagged_sent)
print('\nReadable version:' + _readable(tagged_sent))

# len() of a byte string counts bytes, not characters.
print(len('€'))
print(len('What is the airspeed of an unladen swallow? And what about the € sign?'))
print(len('What is the airspeed of an unladen swallow? And what about the $ sign?'))

tagged_sent = tt.tag(u'What is the airspeed of an unladen swallow? And what about the € sign?')
print(tagged_sent)
print('\nReadable version:' + _readable(tagged_sent))

tt = TreeTagger(language='spanish', encoding='utf8')
tagged_sent = tt.tag(u'¿Podremos taggear esto? ¿Y qué pasa con el signo de €? ')
print('\nReadable version:' + _readable(tagged_sent))

tt = TreeTagger(language='spanish', encoding='utf8')
tagged_sent = tt.tag('¿Podremos taggear esto? ¿Y qué pasa con el signo de €? ')
print('\nReadable version:' + _readable(tagged_sent))

# There is a bug in the Python treetagger module: when a language has only
# one encoding, it assumes that encoding is latin-1.  Bulgarian, for example,
# only allows utf-8, so we tell the module that we are encoding latin-1.
tt = TreeTagger(language='bulgarian', encoding='latin-1')
tagged_sent = tt.tag('Това е моят дом')
print('\nReadable version:' + _readable(tagged_sent))

tt = TreeTagger(language='spanish', encoding='utf8')
with codecs.open(LATIN_1_PATH, encoding='latin-1') as f:
    sents = f.readlines()
spanish_sent = sents[2]
tagged_sent = tt.tag(spanish_sent)
print(type(spanish_sent), spanish_sent, _readable(tagged_sent))

with codecs.open(UTF_8_PATH, encoding='utf-8') as f:
    sents = f.readlines()
for sent in sents:
    tagged_sent = tt.tag(sent)
    print('\n', type(sent), sent, _readable(tagged_sent))