This notebook was put together by Jake Vanderplas for UW's Astro 599 course. Source and license info is on GitHub.
The string module
We can get a preview of what's available by examining the built-in string module:
import string
dir(string)
['Formatter', 'Template', '_TemplateMetaclass', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '_float', '_idmap', '_idmapL', '_int', '_long', '_multimap', '_re', 'ascii_letters', 'ascii_lowercase', 'ascii_uppercase', 'atof', 'atof_error', 'atoi', 'atoi_error', 'atol', 'atol_error', 'capitalize', 'capwords', 'center', 'count', 'digits', 'expandtabs', 'find', 'hexdigits', 'index', 'index_error', 'join', 'joinfields', 'letters', 'ljust', 'lower', 'lowercase', 'lstrip', 'maketrans', 'octdigits', 'printable', 'punctuation', 'replace', 'rfind', 'rindex', 'rjust', 'rsplit', 'rstrip', 'split', 'splitfields', 'strip', 'swapcase', 'translate', 'upper', 'uppercase', 'whitespace', 'zfill']
s = "HeLLo tHEre MY FriEND"
s.upper()
'HELLO THERE MY FRIEND'
s.lower()
'hello there my friend'
s.title()
'Hello There My Friend'
s.capitalize()
'Hello there my friend'
s.swapcase()
'hEllO TheRE my fRIend'
s.split()
['HeLLo', 'tHEre', 'MY', 'FriEND']
# Normalize the capitalization first, then break the sentence into words.
L = s.capitalize().split()
# Parenthesized single-argument print gives the same output on Python 2 and 3.
print(L)
['Hello', 'there', 'my', 'friend']
# Glue the words back together, using an underscore as the separator.
s = '_'.join(L)
# Parenthesized single-argument print gives the same output on Python 2 and 3.
print(s)
Hello_there_my_friend
s.split('_')
['Hello', 'there', 'my', 'friend']
''.join(s.split('_'))
'Hellotheremyfriend'
s = " Too many spaces! "
s.strip()
'Too many spaces!'
s = "*~*~*~*Super!!**~*~**~*~**~"
s.strip('*~')
'Super!!'
s.rstrip('*~')
'*~*~*~*Super!!'
s.lstrip('*~')
'Super!!**~*~**~*~**~'
s.replace('*', '')
'~~~Super!!~~~~~'
s.replace('*', '').replace('~', '')
'Super!!'
s = "The quick brown fox jumped"
s.find("fox")
16
s[16:]
'fox jumped'
s.find('booyah')
-1
s.startswith('The')
True
s.endswith('jumped')
True
s.endswith('fox')
False
'1234'.isdigit()
True
'123.45'.isdigit()
False
'ABC'.isalpha()
True
'ABC123'.isalpha()
False
"ABC123".isalnum()
True
'ABC easy as 123'.isalnum()
False
'hello'.islower()
True
'HELLO'.isupper()
True
'Hello'.istitle()
True
' '.isspace()
True
from math import pi
"my favorite integer is %d, but my favorite float is %f." % (42, pi)
'my favorite integer is 42, but my favorite float is 3.141593.'
"in exponential notation it's %e" % pi
"in exponential notation it's 3.141593e+00"
"to choose smartly if exponential is needed: %g" % pi
'to choose smartly if exponential is needed: 3.14159'
"or with a bigger number: %g" % 123456787654321.0
'or with a bigger number: 1.23457e+14'
"rounded to three decimal places it's %.3f" % pi
"rounded to three decimal places it's 3.142"
"an integer padded with spaces: %10d" % 42
'an integer padded with spaces:         42'
"an integer padded on the right: %-10d" % 42
'an integer padded on the right: 42        '
"an integer padded with zeros: %010d" % 42
'an integer padded with zeros: 0000000042'
"we can also name our arguments: %(value)d" % dict(value=3)
'we can also name our arguments: 3'
"Escape the percent sign with an extra symbol: the %d%%" % 99
'Escape the percent sign with an extra symbol: the 99%'
Read more about formats in the Python docs
New-style string formatting uses curly braces {}
to contain the formats, which can be referenced by argument number and name:
"{0} {name}".format(first, name=second)
"{}{}".format("ABC", 123)
'ABC123'
"{0}{1}".format("ABC", 123)
'ABC123'
"{0}{0}".format("ABC", 123)
'ABCABC'
"{1}{0}".format("ABC", 123)
'123ABC'
The format specification goes after the colon (:) inside the braces:
("%.2f" % 3.14159) == "{:.2f}".format(3.14159)
True
"{0:d} is an integer; {1:.3f} is a float".format(42, pi)
'42 is an integer; 3.142 is a float'
"{the_answer:010d} is an integer; {pi:.5g} is a float".format(the_answer=42,
pi=pi)
'0000000042 is an integer; 3.1416 is a float'
'{desire} to {place}'.format(desire='Fly me',
place='The Moon')
'Fly me to The Moon'
# using a pre-defined dictionary
f = {"desire": "Won't you take me",
"place": "funky town?"}
'{desire} to {place}'.format(**f)
"Won't you take me to funky town?"
# format also supports binary numbers
"int: {0:d}; hex: {0:x}; oct: {0:o}; bin: {0:b}".format(42)
'int: 42; hex: 2a; oct: 52; bin: 101010'
Let's create a file for us to read:
%%file inout.dat
Here is a nice file
with a couple lines of text
it is a haiku
Overwriting inout.dat
# read() returns the entire file as a single string.
# The with-block closes the file even if read() raises.
with open('inout.dat') as f:
    print(f.read())
Here is a nice file
with a couple lines of text
it is a haiku
# readlines() returns a list with one string per line, newlines included.
# The with-block closes the file even if readlines() raises.
with open('inout.dat') as f:
    print(f.readlines())
['Here is a nice file\n', 'with a couple lines of text\n', 'it is a haiku']
# A file object can be iterated line by line; split each line into words.
# Opening it in a with-block makes sure the handle is closed afterwards.
with open('inout.dat') as f:
    for line in f:
        print(line.split())
['Here', 'is', 'a', 'nice', 'file']
['with', 'a', 'couple', 'lines', 'of', 'text']
['it', 'is', 'a', 'haiku']
# write() is the opposite of read()
# Both files are opened in with-blocks so they are closed (and the output
# flushed to disk) even if reading or writing fails.
with open('inout.dat') as src:
    contents = src.read()
with open('my_output.dat', 'w') as out:
    # Swap every space for an underscore; newlines are left untouched.
    out.write(contents.replace(' ', '_'))
!cat my_output.dat
Here_is_a_nice_file
with_a_couple_lines_of_text
it_is_a_haiku
# writelines() is the opposite of readlines()
# Both files are opened in with-blocks so they are closed (and the output
# flushed to disk) even if reading or writing fails.
with open('inout.dat') as src:
    lines = src.readlines()
with open('my_output.dat', 'w') as out:
    # The lines still carry their own newlines, so they are written as-is.
    out.writelines(lines)
!cat my_output.dat
Here is a nice file
with a couple lines of text
it is a haiku
Here is some code that creates a comma-delimited file of numbers with random precision, leading spaces, and formatting:
# Don't modify this: it simply writes the example file
import random

# Writes 100 rows of 5 comma-separated numbers to 'messy_data.dat'.
# The with-block guarantees the file is closed even if a write fails.
with open('messy_data.dat', 'w') as f:
    for _ in range(100):
        for j in range(5):
            # A random run of 0-6 leading spaces before each number.
            f.write(' ' * random.randint(0, 6))
            # '%0*.*g' reads the zero-padded field width and the precision
            # from the argument tuple, so both vary from number to number.
            f.write('%0*.*g' % (random.randint(8, 12),
                                random.randint(5, 10),
                                100 * random.random()))
            # Commas go between values only, not after the last one.
            if j != 4:
                f.write(',')
        f.write('\n')
# Look at the first four lines of the file:
!head -4 messy_data.dat
00000095.945, 0000096.1158, 014.15002, 0050.46316, 000014.6082
0000070.778,00073.821, 57.85960388, 0008.85737, 00000092.04
077.012237,0038.6466, 34.87242, 0000003.3876, 25.07738969
00068.3471, 00009.9584, 020.02878, 65.9716241, 00063.43892
Your task: Write a program that reads in the contents of "messy_data.dat"
and extracts the numbers from each line, using the string manipulations we used above (remember that float()
will convert a suitable string to a floating-point number).
Next, write out a new file named "clean_data.dat". The new file should contain the same data as the old file, but with uniform formatting and aligned columns.
# your solution here
What you did above with text wrangling, numpy
can do much more easily:
import numpy as np

# Let numpy parse the comma-separated values into an array, then write the
# array back out with one fixed-width format for every value.
data = np.loadtxt("messy_data.dat", delimiter=',')
np.savetxt("clean_data.dat", data, fmt="%8.4f", delimiter=',')
!head -5 clean_data.dat
95.9450, 96.1158, 14.1500, 50.4632, 14.6082
70.7780, 73.8210, 57.8596, 8.8574, 92.0400
77.0122, 38.6466, 34.8724, 3.3876, 25.0774
68.3471, 9.9584, 20.0288, 65.9716, 63.4389
87.9833, 7.8228, 60.3212, 82.9680, 22.4530
Still, text manipulation is a very good skill to have under your belt!