#!/usr/bin/env python # coding: utf-8 # # More Python Goodness (1) # *** # ## Table of contents # # 1. [Working with scripts](#scripts) # 2. [The standard library](#stdlib) # 3. [String methods](#stringmethods) # 4. [Comments and docstrings](#docstrings) # 5. [Detour: PEP8 and other PEPs](#peps) # 6. [Errors and exceptions](#exceptions) # 7. Working with modules # 8. Examples from the standard library # 9. Reading and writing files # 10. Assignment: Finding the most common 7-mer in a FASTA file # 11. Further reading # # ## Working with scripts # # Interpreters are great for *prototyping*, but not really suitable if you want to **share** or **release** code. To do so, we write our Python commands in scripts (and later, modules). # # A **script** is a simple text file containing Python instructions to execute. # ### Executing scripts # # There are two common ways to execute a script: # # 1. As an argument of the Python interpreter command. # 2. As a standalone executable (with the appropriate shebang line & file mode). # # IPython gives you a third option: # #

# 3. As an argument of the `%run` magic. #

# ### Writing your script # # Let's start with a simple GC calculator. Open your text editor, and write the following Python statements (remember your indentation): # ```python # def calc_gc_percent(seq): # at_count, gc_count = 0, 0 # for char in seq: # if char in ('A', 'T'): # at_count += 1 # elif char in ('G', 'C'): # gc_count += 1 # # return gc_count * 100.0 / (gc_count + at_count) # # print "The sequence 'CAGG' has a %GC of {:.2f}".format( # calc_gc_percent("CAGG")) # ``` # Save the file as `seq_toolbox.py` (you can use any other name if you like) and go to your shell. # ### Running the script # # Let's try the first method: using your script as an argument: # # $ python seq_toolbox.py # # Is the output as you expect? # # For the second method, we need to do two more things: # # 1. Open the script in your editor and add the following line to the very top: # # #!/usr/bin/env python # # 2. Save the file, go back to the shell, and allow the file to be executed: # # $ chmod +x seq_toolbox.py # # You can now execute the file directly: # # $ ./seq_toolbox.py # # Is the output the same as with the previous method? # # Finally, try out the third method. Open an IPython interpreter session and do: # # %run seq_toolbox.py # # ## The standard library # # Our script is nice and dandy, but we don't want to edit the source file every time we calculate a sequence's GC. # # The **standard library** is a collection of Python modules (or functions, for now) that comes packaged with a default Python installation. They're not part of the language per se, more like a *batteries included* thing. # ### Our first standard library module: `sys` # # We'll start by using the simple `sys` module to make our script more flexible. # # Standard library modules (and other modules, as we'll see later) can be used via the `import` statement, for example: # In[1]: import sys # Like other objects so far, we can peek into the documentation of these modules using `help`, or the IPython `?` shortcut. 
For example: # In[2]: get_ipython().run_line_magic('pinfo', 'sys') # ### The `sys.argv` list # # The `sys` module provides a way to capture command line arguments with its `argv` object. This is a list of arguments supplied when invoking the current Python session. Not really useful for an interpreter session, but very handy for scripts. # In[3]: sys.argv # In[4]: sys.argv[:3] # ### Improving our script with `sys.argv` # # To use `sys.argv` in our script, open a text editor and edit the script by adding an import statement, capturing the `sys.argv` value, and editing our last `print` line: # # ```python # #!/usr/bin/env python # import sys # # def calc_gc_percent(seq): # at_count, gc_count = 0, 0 # for char in seq: # if char in ('A', 'T'): # at_count += 1 # elif char in ('G', 'C'): # gc_count += 1 # # return gc_count * 100.0 / (gc_count + at_count) # # input_seq = sys.argv[1] # print "The sequence '{}' has a %GC of {:.2f}".format( # input_seq, calc_gc_percent(input_seq)) # ``` # To test it, you can run the following command in your shell: # # $ python seq_toolbox.py CAGG # # Try it with `./seq_toolbox.py` instead. What happens? # # ## String methods # # Try running the script with `'cagg'` as the input sequence. What happens? # # As we saw earlier, many objects, like those of type `list`, `dict`, or `str`, have useful methods defined on them. One way to squash this potential bug is by using Python's string method `upper`. Let's first check out some commonly used string functions. # # In[5]: my_str = 'Hello again, ipython!' # In[6]: my_str.upper() # In[7]: my_str.lower() # In[8]: my_str.title() # In[9]: my_str.startswith('H') # In[10]: my_str.startswith('h') # In[11]: my_str.split(',') # In[12]: my_str.replace('ipython', 'lumc') # In[13]: my_str.count('n') # ### Improving our script with `upper()` # # Let's use `upper()` to fortify our function. 
It should now look something like this: # ```python # def calc_gc_percent(seq): # at_count, gc_count = 0, 0 # for char in seq.upper(): # if char in ('A', 'T'): # at_count += 1 # elif char in ('G', 'C'): # gc_count += 1 # # return gc_count * 100.0 / (gc_count + at_count) # ``` # And run it (in whichever way you prefer). Do you get the expected output? # # ## Comments and docstrings # # There's a golden rule in programming: write code for humans (this includes you in 6 months). Python provides two ways to accomplish this: comments and docstrings. # # ### Comments # # Any line prefixed with `#` is a **comment** and is ignored by the interpreter. Comments can be freeform text; anything that helps in understanding the code. # # ### Docstrings # # **Docstrings** are Python's way of attaching proper documentation to objects. Officially, the first string literal that occurs in a module, function, class, or method definition is used as that object's docstring. # # In practice, *triple-quoted strings* are used, to handle newlines more easily. # # Remember how we used the `help` function (or IPython's `?` shortcut) to get information about an object, function, or module? This actually prints that object's docstring. # ### Improving our script with comments and docstrings # # Open your script again in a text editor, and add the following comments and docstrings: # ```python # #!/usr/bin/env python # import sys # # def calc_gc_percent(seq): # """ # Calculates the GC percentage of the given sequence. # # Arguments: # - seq - the input sequence (string). # # Returns: # - GC percentage (float). # # The returned value is always <= 100.0 # """ # at_count, gc_count = 0, 0 # # Change input to all caps to allow for non-capital # # input sequence. 
# for char in seq.upper(): # if char in ('A', 'T'): # at_count += 1 # elif char in ('G', 'C'): # gc_count += 1 # # return gc_count * 100.0 / (gc_count + at_count) # # input_seq = sys.argv[1] # print "The sequence '{}' has a %GC of {:.2f}".format( # input_seq, calc_gc_percent(input_seq)) # ``` # # ## Detour: PEP8 and other PEPs # # Since comments and docstrings are basically free-form text, whether they are useful or not depends heavily on the developer. To mitigate this, the Python community has come up with practical conventions. They are documented in a document called **PEP8**. # # Complementary to PEP8, there is **PEP257**, which is for docstrings specifically. Following these conventions is not mandatory, but it is *very* much encouraged. # # Python Enhancement Proposals, or **PEP**s, are how Python grows. There are hundreds of them now, and all of them have to be approved by our BDFL. # # > [PEP8: Style Guide for Python Code](http://www.python.org/dev/peps/pep-0008/) # # > [PEP257: Docstring Conventions](http://www.python.org/dev/peps/pep-0257/) # # ## Errors and exceptions # # Try running the script with `ACTG123` as the argument. What happens? Is this acceptable behavior? # # Sometimes we want to put safeguards to handle invalid inputs. In this case we only accept `ACTG`; all other characters are invalid. # # Python provides a way to break out of the normal execution flow, by raising what is called an **exception**. We can raise exceptions ourselves as well, by using the `raise` statement. # ### The `ValueError` built-in exception # # One of the most often used exceptions is the builtin exception `ValueError`. It is used on occasions where inappropriate argument values are used, for example when trying to convert the string `A` to an integer: # In[14]: int('A') # `ValueError` is the appropriate exception to raise when your function is called with argument values it cannot handle. 
# ### Improving our script by handling invalid inputs # # Open your script, and edit the `if` clause to add our exception: # ```python # def calc_gc_percent(seq): # """ # Calculates the GC percentage of the given sequence. # # Arguments: # - seq - the input sequence (string). # # Returns: # - GC percentage (float). # # The returned value is always <= 100.0 # """ # at_count, gc_count = 0, 0 # # Change input to all caps to allow for non-capital # # input sequence. # for char in seq.upper(): # if char in ('A', 'T'): # at_count += 1 # elif char in ('G', 'C'): # gc_count += 1 # else: # raise ValueError( # "Unexpected character found: {}. Only " # "ACTGs are allowed.".format(char)) # # return gc_count * 100.0 / (gc_count + at_count) # ``` # Try running the script again with `ACTG123` as the argument. What happens now? # ### Handling corner cases # # Try running the script with `''` (two quote signs) as the argument. What happens? Why? Is this a valid input? # # We don't always want to let exceptions stop program flow; sometimes we want to provide an alternative flow. The `try ... except` block allows you to do this. # # The syntax is: # ```python # try: # # Statements that may raise exceptions. # # [...] # except {exception type}: # # What to do when the exception is raised. # # [...] # ``` # ### Improving our script by handling corner cases # # Let's change our script by adding a `try ... except` block: # # ```python # def calc_gc_percent(seq): # """ # Calculates the GC percentage of the given sequence. # # Arguments: # - seq - the input sequence (string). # # Returns: # - GC percentage (float). # # The returned value is always <= 100.0 # """ # at_count, gc_count = 0, 0 # # Change input to all caps to allow for non-capital # # input sequence. # for char in seq.upper(): # if char in ('A', 'T'): # at_count += 1 # elif char in ('G', 'C'): # gc_count += 1 # else: # raise ValueError( # "Unexpected character found: {}. 
Only " # "ACTGs are allowed.".format(char)) # # # Corner case handling: empty input sequence. # try: # return gc_count * 100.0 / (gc_count + at_count) # except ZeroDivisionError: # return 0.0 # ``` # ### Detour: Exception handling best practices # # #### Aim for a minimal `try` block # # We want to be able to pinpoint the statements that may raise the exceptions so we can tailor our handling. # # Example of code that violates this principle: # ```python # try: # my_function() # my_other_function() # except ValueError: # my_fallback_function() # ``` # A better way would be: # ```python # try: # my_function() # except ValueError: # my_fallback_function() # my_other_function() # ``` # #### Be specific when handling exceptions # # The following code is syntactically valid, but *never* use it in your real scripts / programs: # ```python # try: # my_function() # except: # my_fallback_function() # ``` # *Always* use the full exception name when handling exceptions, to make for much cleaner code: # ```python # try: # my_function() # except ValueError: # my_fallback_function() # except TypeError: # my_other_fallback_function() # except IndexError: # my_final_function() # ``` # #### Look Before You Leap (LBYL) vs Easier to Ask for Forgiveness than Permission (EAFP) # # We could have written our last exception block like so: # ```python # if gc_count + at_count == 0: # return 0.0 # return gc_count * 100.0 / (gc_count + at_count) # ``` # Both approaches are correct and have their own pluses and minuses in general. However, in this case, I would argue that EAFP is better since it makes the code more readable. # ### Improving our script by handling more corner cases # # Now try running your script without any arguments at all. What happens? # # Armed with what you now know, how would you handle this situation? 
# In[15]:

from IPython.core.display import HTML


def custom_style():
    """
    Reads the notebook stylesheet and returns an IPython HTML object.

    Returns:
        - IPython `HTML` display object.

    The file 'styles/notebook.css' is opened relative to the current
    working directory; opening it fails with an exception when the file
    is missing.
    """
    # Use a context manager so the file handle is closed even when
    # reading fails (the handle used to be left open).
    with open('styles/notebook.css', 'r') as handle:
        style = handle.read()
    # NOTE(review): `style` is read but not used and the markup below is
    # empty. Presumably the original wrapped the stylesheet in a <style>
    # tag that was lost when this file was exported -- TODO confirm.
    return HTML('')


def custom_script():
    """
    Reads the notebook JavaScript and returns an IPython HTML object.

    Returns:
        - IPython `HTML` display object.

    The file 'styles/notebook.js' is opened relative to the current
    working directory; opening it fails with an exception when the file
    is missing.
    """
    # Use a context manager so the file handle is closed even when
    # reading fails (the handle used to be left open).
    with open('styles/notebook.js', 'r') as handle:
        script = handle.read()
    # NOTE(review): `script` is read but not used and the markup below is
    # empty. Presumably the original wrapped the code in a <script> tag
    # that was lost when this file was exported -- TODO confirm.
    return HTML('')


# In[16]:

custom_style()


# In[17]:

custom_script()


# Acknowledgements
# ========
#
# [Wibowo Arindrarto](mailto:w.arindrarto@lumc.nl)
#
# Martijn Vermaat
#
# [Jeroen Laros](mailto:j.f.j.laros@lumc.nl)
#
# Based on
# ---------
# [Python Scientific Lecture Notes](http://scipy-lectures.github.io/)
#
# License
# --------
# [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)