import urllib2
# Fetch the sample text. Python 2's urllib2 response object is not a context
# manager, so try/finally guarantees the connection is closed even if
# reading fails partway through.
f = urllib2.urlopen('http://www.stanford.edu/~rjweiss/public_html/IRiSS2013/text1/extra/esperanto.txt')
try:
    foo = f.readlines()  # list of lines, each a byte string (Python 2 `str`)
finally:
    f.close()
# Echo the first line; the console shows its repr, with non-ASCII bytes escaped.
foo[0]
"Nul d\xc3\xa0 for\xc3\xa9n fo\xc3\xbbnd. Far eksa du\xc5\x93nd\xc3\xaff\xc3\xafna mi, miloj \xc3\xb4kcid\xc3\xa8nte n\xc3\xaa far, g\xc3\xabtto \xc3\xaflion men oj. Oz p\xc3\xa9r olog kvin \xc3\xa8st\xc3\xaf\xc3\xa9l, \xc3\xaesm ja c\xc3\xa9nt t\xc3\xa0g\xc5\x93. Su\xc5\x93m\xc3\xafo r\xc3\xa8spond\xc3\xab ba ena, \xc3\xa0j jen am\xc3\xa9n nett\xc3\xaa, sor d\xc3\xaavus multe duont\xc3\xb4n\xc5\x93 aj. Ki\xc3\xa0n f\xc3\xbbnd\xc3\xa2m\xc3\xa8nto bv p\xc3\xaar, plej\xc3\xa2 log'\xc3\xb4 \xc3\xaeomete la ojd, o\xc3\xaed in \xc3\xa9kkri\xc3\xb4 \xc3\xaenf\xc3\xaenit\xc3\xafv\xc5\x93. Ing pli\xc3\xa9 franjo rilativ\xc3\xb4 nv.\n"
# All of that mumbo jumbo is the console showing the string's repr: every byte outside the ASCII range is escaped as \x##, so we're looking at encoded bytes rather than glyphs.
# We didn't get an error, so nothing is WRONG here; it's just a rendering issue.
# Let's try printing it: `print` writes the string itself rather than its repr.
print foo[0]
Nul dà forén foûnd. Far eksa duœndïfïna mi, miloj ôkcidènte nê far, gëtto ïlion men oj. Oz pér olog kvin èstïél, îsm ja cént tàgœ. Suœmïo rèspondë ba ena, àj jen amén nettê, sor dêvus multe duontônœ aj. Kiàn fûndâmènto bv pêr, plejâ log'ô îomete la ojd, oîd in ékkriô înfînitïvœ. Ing plié franjo rilativô nv.
# Right on, look at all those weirdo characters.
# I happen to have made this file, so I know that it's in UTF-8 encoding.
# Grab the first line and check its type: in Python 2 a `str` is a sequence of bytes, not decoded text.
foo_prime = foo[0]
print type(foo_prime)
<type 'str'>
# Try to decode the UTF-8 bytes as if they were ASCII.
ascii_foo = foo_prime.decode('ascii')
# what the heck is happening?!?!?1
--------------------------------------------------------------------------- UnicodeDecodeError Traceback (most recent call last) <ipython-input-4-27a81d0ae3d5> in <module>() ----> 1 ascii_foo = foo_prime.decode('ascii') 2 # what the heck is happening?!?!?1 UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 5: ordinal not in range(128)
# OK: we can't decode a UTF-8 byte string into unicode by claiming it's ASCII. Byte 0xc3 is outside ASCII's 0-127 range, so the ASCII codec rejects it.
# Decode with the correct codec instead. foo_prime and foo[0] are the same
# byte string, so utf8_foo and unicode_foo hold the same unicode value.
utf8_foo = foo_prime.decode('utf-8')
unicode_foo = foo[0].decode('utf-8')
print type(unicode_foo)
unicode_foo #ahhh! now every character is a unicode code point. The repr escapes code points up to 0xFF as \x## and larger ones as \u####.
<type 'unicode'>
u"Nul d\xe0 for\xe9n fo\xfbnd. Far eksa du\u0153nd\xeff\xefna mi, miloj \xf4kcid\xe8nte n\xea far, g\xebtto \xeflion men oj. Oz p\xe9r olog kvin \xe8st\xef\xe9l, \xeesm ja c\xe9nt t\xe0g\u0153. Su\u0153m\xefo r\xe8spond\xeb ba ena, \xe0j jen am\xe9n nett\xea, sor d\xeavus multe duont\xf4n\u0153 aj. Ki\xe0n f\xfbnd\xe2m\xe8nto bv p\xear, plej\xe2 log'\xf4 \xeeomete la ojd, o\xeed in \xe9kkri\xf4 \xeenf\xeenit\xefv\u0153. Ing pli\xe9 franjo rilativ\xf4 nv.\n"
#ok, wonderful. why do i care about this? well, let's see what happens if i try to force this unicode string into ASCII
# (this call is expected to fail with UnicodeEncodeError)
unicode_foo.encode('ascii')
--------------------------------------------------------------------------- UnicodeEncodeError Traceback (most recent call last) <ipython-input-7-645942daeb8e> in <module>() 1 #ok, wonderful. why do i care about this? well, let's see what happens if i try to force this unicode string into ASCII ----> 2 unicode_foo.encode('ascii') UnicodeEncodeError: 'ascii' codec can't encode character u'\xe0' in position 5: ordinal not in range(128)
# Burn! Exercise: explain why this is happening.