#!/usr/bin/env python
# coding: utf-8

# [![Py4Life](https://raw.githubusercontent.com/Py4Life/TAU2015/gh-pages/img/Py4Life-logo-small.png)](http://py4life.github.io/TAU2015/)

# ## Lecture 10 - 27.5.2015
# ### Last update: 10.5.2015
# ### Tel-Aviv University / 0411-3122 / Spring 2015

# In[8]:

get_ipython().run_line_magic('matplotlib', 'inline')
from IPython.display import HTML, YouTubeVideo
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import urllib.request
import os.path
import zipfile
from scipy.integrate import odeint
from scipy.optimize import curve_fit
from scipy.stats import linregress
import seaborn as sns
sns.set_style('ticks')
sns.set_context('talk')
sns.set_palette("Set1", 8, .75)


# # Population growth dynamics
#
# Our focus today will be on modeling population growth.
# We will start with human populations, using data from the [World Bank](http://www.worldbank.org/) on the [population size of countries](http://data.worldbank.org/indicator/SP.POP.TOTL) around the world from 1960 until 2013.
#
# ## Data preparation

# In[9]:

HTML('''<blockquote class="twitter-tweet">
<p>In Data Science, 80% of time spent prepare data, 20% of time spent complain about need for prepare data.</p>
&mdash; Big Data Borat (@BigDataBorat) February 27, 2013
</blockquote>''')
# So let's prepare the data from *scratch*.
#
# Start by retrieving the data as a zip file (if not already downloaded). Follow the link to the World Bank page on [Population, total](http://data.worldbank.org/indicator/SP.POP.TOTL), click `DOWNLOAD DATA`, choose `CSV`, and a zip file will be downloaded.
#
# Even better, right-click the `CSV` button and choose `Copy link address` from the context menu. This link can be used to get the file directly from Python using `urllib`:

# In[10]:

fname = 'world_growth.zip'
if not os.path.exists(fname):
    urllib.request.urlretrieve('http://api.worldbank.org/v2/en/indicator/sp.pop.totl?downloadformat=csv', fname)


# Open the file using the `zipfile` module and read it using _pandas_, skipping the first two rows and dropping the last two columns (`2014` and `Unnamed: 59`), which hold only `NaN` values:

# In[11]:

with zipfile.ZipFile('world_growth.zip') as z:
    f = z.open('sp.pop.totl_Indicator_en_csv_v2.csv')
    df = pd.read_csv(f, skiprows=2)
df.drop('2014', axis=1, inplace=True)
df.drop('Unnamed: 59', axis=1, inplace=True)
df.head()


# In[12]:

df.tail()


# We now want to **tidy our data** - transform the dataset from a wide matrix format into a long data frame format:
#
# > Tidy datasets are easy to manipulate, model and visualize, and have a specific structure: each variable is a column, each observation is a row, and each type of observational unit is a table - [Hadley Wickham (2014), _Tidy Data_, J. Stat. Soft.](http://www.jstatsoft.org/v59/i10)
#
# This means that every row in the table holds exactly one population measurement, together with all the relevant variables: country name and year. Read more on [tidy data](http://www.jstatsoft.org/v59/i10).
#
# We do this using the _pandas_ `melt` function:

# In[13]:

print(df.shape)
df = pd.melt(df, id_vars=('Country Name','Country Code','Indicator Name', 'Indicator Code'), var_name='Year', value_name='Population')
print(df.shape)
df.head()


# In[14]:

df.tail()


# We get rid of rows that have `NA` values:

# In[15]:

df.dropna(axis=0, inplace=True)
df.head()


# Note that the `Year` variable has dtype `O` (object - the years are strings, because they came from the column headers) instead of `int`, so we change that:

# In[16]:

print(df.Year.dtype, df.Population.dtype)
df.Year = df.Year.astype(int)
print(df.Year.dtype, df.Population.dtype)


# ## Data exploration with plots
#
# Let's start with the three biggest populations in the dataset - the entire world, China and India.
# We will plot their population sizes over time:

# In[17]:

world = df[df['Country Name'] == 'World']
china = df[df['Country Name'] == 'China']
india = df[df['Country Name'] == 'India']
plt.plot(world.Year, world.Population, label='World')
plt.plot(china.Year, china.Population, label='China')
plt.plot(india.Year, india.Population, label='India')
plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('Population');


# Alternatively, we can plot using the `DataFrame.plot` method:

# In[18]:

ax = world.plot('Year', 'Population')
china.plot('Year', 'Population', ax=ax)
india.plot('Year', 'Population', ax=ax)
ax.legend(['World', 'China', 'India'], loc='upper left')
ax.set_ylabel('Population');


# ## Exponential growth model
#
# According to [Malthus](http://en.wikipedia.org/wiki/Thomas_Robert_Malthus) (but that was a long time ago, so don't count on it!), populations grow exponentially ([how to solve this](https://www.youtube.com/watch?v=mhO9eL9Nuz0)):
#
# $$
# \frac{dN(t)}{dt} = r N(t) \Rightarrow \\
# N(t) = N(0) e^{rt} \Rightarrow \\
# \log{N(t)} = \log{N(0)} + rt
# $$
#
# where $N(t)$ is the population size at time $t$ and $r$ is the growth rate.
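# A quick numeric sanity check of that last equation - a minimal sketch with made-up values for $N_0$ and $r$ (not estimated from any data): under exponential growth, equal time steps add equal increments to $\log{N(t)}$.

# In[ ]:

N0_demo, r_demo = 100.0, 0.02  # hypothetical initial size and growth rate
t_demo = np.linspace(0, 50, 6)  # equally spaced time points
N_demo = N0_demo * np.exp(r_demo * t_demo)
# successive differences of log(N) are all equal to r * dt = 0.02 * 10 = 0.2
print(np.diff(np.log(N_demo)))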
# In other words, the logarithm of the population should be a linear function of time.

# In[19]:

print("Exponential growth")
YouTubeVideo("c6pcRR5Uy6w")


# Let's check if the world population grows according to the exponential model.
# First define a new column for the log of the population size:

# In[20]:

df['LogPopulation'] = np.log(df.Population)


# ![G8 flags](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?url=http://environmentalgovernance.org/wp-content/uploads/2012/06/g8logo.jpg&container=focus&resize_w=150&refresh=2592000)
#
# Let's concentrate on the [G8 countries](http://en.wikipedia.org/wiki/G8) (well, not exactly, but good enough for the purpose of this presentation):

# In[21]:

G8_countries = ('France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States', 'Canada', 'European Union')
# 'Russian Federation' was suspended in 2014
G8 = df[df['Country Name'].isin(G8_countries)]
G8.head()


# Here we use a little _hack_ to sort the country names by their largest population size, so that the legend of the next plot will be ordered by country size.
#
# We start by grouping the data frame by country name and extracting the maximum population size of each country, giving a `pandas.Series` with the country as the index and the maximum population size as the value. We then sort by the value (the maximum population size) and extract the index (the country names). We will use this sorted list to order the colors in the next plot.

# In[22]:

G8_countries = G8.groupby('Country Name').Population.max()
G8_countries = G8_countries.sort_values(ascending=False)
G8_countries = G8_countries.index
print(G8_countries.tolist())


# We now plot the log of the population sizes. Note that only the first 3 arguments are necessary; the rest are just used to make a more aesthetic and professional-looking plot.

# In[23]:

sns.lmplot(x='Year', y='LogPopulation', data=G8, hue='Country Name', size=7, aspect=1.5, line_kws={'alpha':0.5}, hue_order=G8_countries);


# This is interesting. While some countries fit the Malthusian model very well (United States, France, United Kingdom), some countries don't fit as well: Japan seems to have stopped growing during the 1980's, and the European Union, Italy and Canada also seem to be decelerating. But this could be because these countries are "developed". What about some "developing" countries?
#
# ## Predictions using linear regression
#
# Let's focus on the four largest countries. Only one of them (the United States) is "developed", but as we saw above it fits the Malthusian model.
# We will use _SciPy_'s [`linregress`](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.linregress.html) function to perform linear regression. This will allow us to **extrapolate** - to predict what the population of these countries will be if the rate at which they grow remains constant under the Malthusian model.
#
# Note that the linear regression of $y$ as a function of $x$ is $y = a x + b$, where $a$ is the slope of the regression and $b$ is the intercept.
# But then we have to exponentiate both sides of the equation, as we want the actual population size and not the log population - so we get $e^y = e^{ax+b}$.
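# A side note on `linregress`: it also returns the correlation coefficient $r$ and the standard error of the slope, so the coefficient of determination $R^2$ and a rough confidence interval for the slope are each one line away. A minimal sketch on synthetic data (the arrays here are made up for illustration):

# In[ ]:

x_demo = np.arange(10)
y_demo = 3.0 * x_demo + 1.0 + np.random.normal(scale=0.5, size=x_demo.shape)
slope_d, intercept_d, r_value_d, p_value_d, std_err_d = linregress(x_demo, y_demo)
print('R^2 = %.4f' % r_value_d**2)  # coefficient of determination
print('slope = %.3f +/- %.3f (rough 95%% interval)' % (slope_d, 1.96 * std_err_d))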
# Let's extrapolate to the end of the century:

# In[37]:

years = np.linspace(df.Year.min(), 2100)
for country_name in ('India','China','United States', 'Brazil'):
    country_df = df[df['Country Name'] == country_name]
    slope, intercept, r_value, p_value, std_err = linregress(country_df.Year, country_df.LogPopulation)
    print("Coefficient of correlation for %s: %.4f" % (country_name, r_value), end=", ")
    print("P-value for %s: %.2g" % (country_name, p_value))
    population = np.exp(intercept + slope * years)
    plt.plot(years, population, label=country_name)  # prediction
    plt.scatter(country_df.Year, country_df.Population, marker='.')  # data
plt.xlabel('Year')
plt.ylabel('Population')
plt.yscale('log')
plt.ylim(5e7,1e10)
plt.legend(loc='upper left')
plt.axvline(x=2032, color='gray', linestyle='--')
plt.axvline(x=2056, color='gray', linestyle='--');


# From our prediction, India will pass China around 2032 and Brazil will pass the United States around 2056.
#
# From the results of `linregress` we can estimate confidence intervals on the intercept and slope, and we can [calculate the coefficient of determination](http://stackoverflow.com/a/895063/1063612) ($R^2$) - as in the sketch above.

# ## Logistic growth model
#
# The Malthusian/exponential growth model can't always be correct: growth often decelerates and effectively stops when the population reaches a certain size _K_ (carrying capacity, maximum yield, maximum population density, etc.).
# This is true for fish body size (Schnute 1981), microbial population size in a constant volume (Zwietering 1990), and natural animal populations.
# In these cases it is common to use the [**logistic growth model**](http://en.wikipedia.org/wiki/Logistic_function#In_ecology:_modeling_population_growth), in which the size of the population inhibits growth, leading to a maximum population size after which growth stops:
#
# $$
# \frac{dN(t)}{dt} = r N(t) \Big( 1 - \frac{N(t)}{K} \Big)
# $$

# In[38]:

print("Logistic growth")
YouTubeVideo("rXlyYFXyfIM")


# Let's start by plotting the solution to this _ordinary differential equation_ (ODE).
# We need to define the ODE as a function of _N_, _t_ and whatever other parameters are needed. It is important that _N_ comes before _t_ (this is the argument order `odeint` expects):

# In[18]:

def logistic_ode(N, t, r, K):
    return r * N * (1 - N / K)


# Integrating ODEs is really easy with Python. After defining the ODE function, we can define a new function that, given $t$, $r$, $K$ and $N_0$ (the initial population size), calculates $N(t)$ by integration. For this we use _SciPy_'s `odeint` (ODE integration):

# In[20]:

def integrate_logistic(t, N0, r, K):
    N = odeint(logistic_ode, N0, t, args=(r, K))
    return N

t = np.linspace(0,10)
N = integrate_logistic(t, N0=1, r=1, K=10)
plt.plot(t, N)
plt.xlabel('Time')
plt.ylabel('Population')
sns.despine();


# Incredibly, the [logistic equation](http://en.wikipedia.org/wiki/Logistic_function) has a closed-form solution that doesn't require numerical integration:
#
# $$
# \frac{dN(t)}{dt} = r N(t) \Big( 1 - \frac{N(t)}{K} \Big) \Rightarrow \\
# N(t) = \frac{K N_0 e^{rt}}{K + N_0(e^{rt}-1)}
# $$

# In[28]:

def logistic(t, N0, r, K):
    return K * N0 * np.exp(r*t) / ( K + N0 * (np.exp(r*t) - 1))


# In[29]:

t = np.linspace(0,10)
N_integration = integrate_logistic(t, 1, 1, 10)
N_solution = logistic(t, 1, 1, 10)
plt.plot(t, N_integration, label='ODE')
plt.plot(t, N_solution, 'o', label='Solution')
plt.legend(loc='upper left');


# ## Fitting a logistic model
#
# So we have a standard growth model, but how do we find the right model (_i.e._ the parameters `N0`, `r`, `K`) for our data?
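# To make the question concrete, here is a minimal sketch (on synthetic, noise-free data with made-up parameter values) comparing two candidate parameter sets by their sum of squared residuals (SSR) - the quantity that the least-squares fitting described next tries to minimize:

# In[ ]:

t_demo = np.linspace(0, 10, 20)
N_obs = logistic(t_demo, N0=1.5, r=0.75, K=8.5)  # pretend these are observations

for guess in [(1.0, 1.0, 10.0), (1.5, 0.75, 8.5)]:
    ssr = ((logistic(t_demo, *guess) - N_obs) ** 2).sum()  # sum of squared residuals
    print('N0=%.2f, r=%.2f, K=%.2f -> SSR = %.4f' % (guess + (ssr,)))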
# To do this we need to **fit the logistic model** to our data.
#
# We will use a _SciPy_ function called [`curve_fit`](http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.optimize.curve_fit.html) that accepts a model (a function of `t` and some other arguments/parameters that returns `N`), data (`t` and `N`), initial guesses for the model parameters, and some other optional arguments. It evaluates the model with the initial guesses and calculates the _fit_ of the result - in standard fitting this is measured by the sum of the squared differences between the model and the data (the SSR from the sketch above). It then makes another guess of the model parameters, evaluates the model and calculates the distance again, and again and again. It stops when it reaches a minimal distance, which is why this procedure is called **least squares**. The procedure isn't perfect: even with just 3 or 4 model parameters there might be more than one minimum, or the function might not find a minimum in a reasonable time.
#
# ## Simulated data
#
# Before we try to work with real data we want to [try it with simulated data](http://stackoverflow.com/a/16241965/1063612) and check that we know what we are doing.
# To simulate data we generate values from the logistic function and add some normally distributed noise:

# In[37]:

simulated = logistic(t, N0=1.5, r=0.75, K=8.5) + np.random.normal(loc=0, scale=0.7, size=t.shape)
plt.scatter(t, simulated)
sns.despine()


# Now we can call `curve_fit`, give it the simulated data, and get back an estimate of the model parameters. As you can see, although there is a lot of noise, we get a pretty good estimate of the parameters.

# In[38]:

params, cov = curve_fit(f=logistic, xdata=t, ydata=simulated, p0=(1,1,10))  # p0 in the model's argument order: (N0, r, K)
N0,r,K = params
plt.scatter(t, simulated)
plt.plot(t, logistic(t, N0, r, K), '-')
sns.despine()
print('N0=%.3f, r=%.3f, K=%.3f' % (N0,r,K))


# We can also get confidence intervals for the parameter values from the `cov` return value - a minimal sketch follows below. But if you need to do serious work with confidence intervals you might prefer the package [lmfit](http://lmfit.github.io/lmfit-py/), which is easier to use when you need more than the basics.
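# The diagonal of `cov` holds the estimated parameter variances, so approximate standard errors are one line away. A minimal sketch, reusing `params` and `cov` from the fit above (a proper confidence interval would also account for the degrees of freedom):

# In[ ]:

perr = np.sqrt(np.diag(cov))  # approximate 1-sigma standard errors
for name, value, err in zip(('N0', 'r', 'K'), params, perr):
    print('%s = %.3f +/- %.3f' % (name, value, err))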
# ## Real data - bacterial growth curves
#
# We will use growth curves of _E. coli_ bacteria. [This data](https://github.com/Py4Life/TAU2015/blob/master/Yoav_311214c_only_OD.csv) was generated by growing bacteria in a 96-well microtiter plate.
#
# ![96-well plate](http://www.ddw-online.com/library/sid32/microplates.jpg)

# In[44]:

if not os.path.exists('Yoav_311214c_only_OD.csv'):
    urllib.request.urlretrieve('https://raw.githubusercontent.com/Py4Life/TAU2015/master/Yoav_311214c_only_OD.csv', 'Yoav_311214c_only_OD.csv')
df = pd.read_csv('Yoav_311214c_only_OD.csv')
df.head()


# We don't need the `Time [s]` column (we have `Time` in hours) and we don't need `Temp. [°C]`, so we drop these columns.
#
# Then we [melt](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) the data: we transform it from a matrix format (one row per timepoint, one column per well, plus columns for other variables such as time) into a long data frame format (one row per measurement, with one column per measurement variable).

# In[45]:

df = df.drop(axis=1, labels=['Time [s]', 'Temp. [°C]'])
df = pd.melt(df, id_vars=('Time','Cycle Nr.'), var_name='Well', value_name='OD')
df.head()


# Now let's add a column for the plate row (A - H) and a column for the plate column (1 - 12).
# We will use a [list comprehension](http://python-3-patterns-idioms-test.readthedocs.org/en/latest/Comprehensions.html) for that:

# In[48]:

def well2row(well):
    return well[0]

def well2col(well):
    return int(well[1:])

df['Row'] = [well2row(x) for x in df.Well]
df['Col'] = [well2col(x) for x in df.Well]
df.head()


# Finally the data is **prepared** and we are ready to explore it.
# Let's start by plotting the OD over time curve of every well - just to make sure we know what we have in our hands.
#
# We use `seaborn.FacetGrid`, which makes it (relatively) easy to create faceted plots. We need to specify the DataFrame columns on which we want to facet (these are the `Col` and `Row` columns we just created), and we can specify a bunch of styling options too. `FacetGrid` will break the data into smaller data sets according to our instructions. We can also specify which column controls the `hue` (color) of the plots.
#
# Then we can `map` a function to each of the facets. We give the function and then the columns that should be used as the function arguments.

# In[51]:

g = sns.FacetGrid(df, col='Col', row='Row', hue='Well', sharex=True, sharey=True, size=0.75, aspect=12./8, despine=True, margin_titles=True)
g.map(plt.plot, 'Time', 'OD')
plt.locator_params(nbins=4)  # 4 ticks is enough
g.set_axis_labels('','')  # remove facet axis labels
g.fig.text(0.5, 0, 'Time', size='x-large')  # xlabel
g.fig.text(-0.01, 0.5, 'OD', size='x-large', rotation='vertical')  # ylabel
g.fig.tight_layout()


# You might notice that the curves in G12 and H12 are constant - these are the blank/control wells. Let's get rid of them:

# In[ ]:

df = df[(df.Well != 'G12') & (df.Well != 'H12')]


# In order to fit a logistic curve to the data we need to calculate the average curve. This is easy (once you learn how to do it) in _pandas_. The idea: we _split_ the data by one or more columns, so that rows with the same values in these columns are _grouped_ together. We then _apply_ an aggregation to these groups, making a single row out of each group. Finally, we _combine_ the rows back together into a single data frame.
#
# This strategy is known as _split-apply-combine_, and it is a very powerful way to work with data frames. One of its strengths is that it extends easily to parallel computing, because the _apply_ part can be done in parallel over the different groups.
#
# Using [_pandas_](http://pandas.pydata.org/pandas-docs/stable/groupby.html) we split with the `groupby` method; we aggregate with `mean` (for general aggregations use `aggregate` - see the sketch below); we then reset the indices (which were lost during the aggregation) with `reset_index`.
#
# We want the mean OD value for each time point, so we group by `Time` and aggregate with `mean`.

# In[54]:

grouped = df.groupby('Time')
aggregated = grouped.OD.mean().reset_index()
aggregated.head()


# Let's plot:

# In[60]:

t = aggregated.Time
N = aggregated.OD
plt.scatter(t, N)
plt.xlabel('Time')
plt.ylabel('Mean OD')
plt.axvline(x=10, color='gray')
sns.despine()


# You might notice that after about 10 hours the population size starts to decrease. This is an artifact of the experiment, caused by the wells starting to dry out.
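# (An aside, as promised: `mean` above is a shortcut; the general `aggregate` method - also spelled `agg` - can apply several functions at once, giving one output column per function. A minimal sketch on the same data:)

# In[ ]:

# mean and standard deviation of OD per timepoint, in one pass
df.groupby('Time').OD.aggregate(['mean', 'std']).reset_index().head()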
# Let's drop the data after the 10-hour mark:

# In[61]:

N = N[t<10]
t = t[t<10]
plt.scatter(t, N)
plt.xlabel('Time')
plt.ylabel('Mean OD')
sns.despine()


# Now, finally, to the fitting (80% is data preparation!):

# In[62]:

# initial guesses, in the model's argument order: N0 ~ first OD, r ~ 1, K ~ maximum OD
popt, cov = curve_fit(logistic, t, N, (N.min(), 1., N.max()))
N0, r, K = popt
print('N0=%.3f, r=%.6f, K=%.3f' % (N0, r, K))
plt.scatter(t, N)
plt.plot(t, logistic(t, N0, r, K))
plt.xlabel('Time')
plt.ylabel('OD')
sns.despine();


# You can see that the fit is so-so. It's not that bad, but it's not perfect, especially at the beginning. This is because the logistic model probably isn't the best choice (although it's a common, simple choice).

# ## Other growth models
#
# The [Gompertz model](http://en.wikipedia.org/wiki/Gompertz_function) is a different growth model. Let's try it:

# In[63]:

def gompertz(t, N0, r, K):
    return K * np.exp( np.log(N0/K) * np.exp(-r*t) )


# In[64]:

# initial guesses: N0 ~ first OD, r from the logistic fit, K ~ maximum OD
popt, cov = curve_fit(gompertz, t, N, (N.min(), r, N.max()))
N0gomp, rgomp, Kgomp = popt
print('N0=%.3f, r=%.6f, K=%.3f' % (N0gomp, rgomp, Kgomp))
plt.scatter(t, N, label='data')
plt.plot(t, logistic(t, N0, r, K), '-', label='logistic')
plt.plot(t, gompertz(t, N0gomp, rgomp, Kgomp), '--', label='gompertz')
plt.legend(loc='upper left');


# Not much better...
#
# Let's try the [Richards model](http://en.wikipedia.org/wiki/Generalised_logistic_function), a generalized logistic model:

# In[65]:

def richards(t, N0, r, K, nu):
    Q = -1 + (K/N0)**nu
    return K / ( 1 + Q * np.exp(-r * nu * t) )**(1/nu)


# The difference between the logistic and the Richards models is in the deceleration-of-growth term: $1-(N/K)^{\nu}$. When $\nu=1$ the Richards model is just the logistic model; when $\nu>1$ growth decelerates later; when $\nu<1$ it decelerates earlier.

# In[70]:

plt.plot(t, logistic(t, N0, r, K), '--', lw=5, label='logistic')
plt.plot(t, richards(t, N0, r, K, 0.5), label='richards, nu=0.5')
plt.plot(t, richards(t, N0, r, K, 1), label='richards, nu=1')
plt.plot(t, richards(t, N0, r, K, 2), label='richards, nu=2')
plt.legend(loc='upper left')
sns.despine()


# Let's try to fit the data with the Richards model:

# In[74]:

# initial guesses: N0 ~ first OD, r from the logistic fit, K ~ maximum OD, nu = 1 (i.e. logistic)
popt, cov = curve_fit(richards, t, N, (N.min(), r, N.max(), 1))
N0rich, rrich, Krich, nurich = popt
print('N0=%.3f, r=%.6f, K=%.3f, nu=%.3f' % (N0rich, rrich, Krich, nurich))
plt.scatter(t, N, label='data')
plt.plot(t, logistic(t, N0, r, K), '--', label='logistic')
plt.plot(t, richards(t, N0rich, rrich, Krich, nurich), '--', label='richards')
plt.legend(loc='upper left')
sns.despine()


# The Richards model indeed provides a better fit to the data. We can measure this using [AIC](http://en.wikipedia.org/wiki/Akaike_information_criterion) (the lower the better; for least squares, $AIC = n \log(RSS/n) + 2k$, where $k$ is the number of parameters):

# In[81]:

n = len(t)
print("Logistic AIC:", n * np.log(((logistic(t, N0, r, K)-N)**2).sum()/n) + 2*3)
print("Richards AIC:", n * np.log(((richards(t, N0rich, rrich, Krich, nurich)-N)**2).sum()/n) + 2*4)


# # References
#
# - Schnute, J., 1981. A Versatile Growth Model with Statistically Stable Parameters. Can. J. Fish. Aquat. Sci. 38, 1128–1140.
# - Zwietering, M.H., Jongenburger, I., Rombouts, F.M., van ’t Riet, K., 1990. Modeling of the bacterial growth curve. Appl. Environ. Microbiol. 56, 1875–81.

# ## Fin
#
# This notebook is part of the _Python Programming for Life Sciences Graduate Students_ course given at Tel-Aviv University, Spring 2015.
#
# The notebook was written using [Python](http://python.org/) 3.4.1 and [IPython](http://ipython.org/) 3.1.0 (download from [PyZo](http://www.pyzo.org/downloads.html)).
#
# The code is available at https://github.com/Py4Life/TAU2015/blob/master/lecture10.ipynb.
#
# The notebook can be viewed online at http://nbviewer.ipython.org/github/Py4Life/TAU2015/blob/master/lecture10.ipynb.
#
# This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
#
# ![Python logo](https://www.python.org/static/community_logos/python-logo.png)