#!/usr/bin/env python
# coding: utf-8

# ## Example notebook for the %%stata cell magic by the IPyStata package.

# **Author:** Ties de Kok
# **Homepage:** https://github.com/TiesdeKok/ipystata
# **PyPi:** https://pypi.python.org/pypi/ipystata

# ## Note: this example notebook uses the `Stata Batch Mode` method.
# See Github for an example notebook using the Windows-only `Stata automation` method.

# NOTE(review): `get_ipython()` is neither defined nor imported in this file;
# the cells below presumably rely on an IPython/Jupyter session providing it --
# confirm before running this as a plain Python script.

# ## Import packages

# In[1]:

import pandas as pd

# In[2]:

# Imported before any `stata` cell magic is used below.
import ipystata

# ## Configure ipystata

# In[1]:

from ipystata.config import config_stata

config_stata('/home/user/stata15/stata-se')
# Windows example (raw string, so the backslashes are not read as escapes):
# config_stata(r"D:\Software\stata15\StataSE-64.exe", force_batch=True)

# **Note:** for this change to take effect you need to `Kernel` --> `Restart` the notebook.

# ## Check whether IPyStata is working

# In[4]:

get_ipython().run_cell_magic('stata', '', '\ndisplay "Hello, I am printed by Stata."\n')

# # Some examples based on the Stata 13 manual

# ## Load the dataset "auto.dta" in Stata and return it to Python as a Pandas dataframe

# The code cell below runs the Stata command **`sysuse auto.dta`** to load the dataset
# and returns it back to Python via the **`-o car_df`** argument.

# In[5]:

get_ipython().run_cell_magic('stata', '-o car_df', 'sysuse auto.dta\n')

# **`car_df`** is a regular Pandas dataframe on which Python / Pandas actions can be performed.

# In[6]:

car_df.head()

# ## Basic descriptive statistics

# The argument **`-d or --data`** is used to define which dataframe should be set as dataset in Stata.
# In the example below the Stata function **`tabulate`** is used to generate some
# descriptive statistics for the dataframe **`car_df`**.

# In[7]:

get_ipython().run_cell_magic('stata', '-d car_df', 'tabulate foreign headroom\n')

# These descriptive statistics can be replicated in Pandas using the **`crosstab`** function,
# see the code below.

# In[8]:

pd.crosstab(car_df['foreign'], car_df['headroom'], margins=True)

# ## Stata graphs

# **Note:** due to a limitation of Stata it currently returns the graph as a PDF.
# This is a temporary workaround that I hope to find a more suitable fix for in the future.

# In[9]:

get_ipython().run_cell_magic('stata', '-gr', 'use https://stats.idre.ucla.edu/stat/data/hsb2.dta, clear\ngraph twoway scatter read math\n')

# ## Use Python lists as Stata macros

# In many situations it is convenient to define values or variable names in a Python list
# or equivalently in a Stata macro.
# The **`-i or --input`** argument makes a Python list available for use in Stata as a local macro.
# For example, **`-i main_var`** converts the Python list **`['mpg', 'rep78']`** into the
# following Stata macro: **``main_var'`**.

# In[10]:

main_var = ['mpg', 'rep78']
control_var = ['gear_ratio', 'trunk', 'weight', 'displacement']

# In[11]:

get_ipython().run_cell_magic('stata', '-d car_df -i main_var -i control_var', '\ndisplay "`main_var\'"\ndisplay "`control_var\'"\n\nregress price `main_var\' `control_var\', vce(robust)\n')

# ## Modify dataset in Stata and return it to Python

# It is possible to create new variables or modify the existing dataset in Stata and have it
# returned as a Pandas dataframe.
# In the example below the output **`-o car_df`** will overwrite the data **`-d car_df`**,
# effectively modifying the dataframe in place.
# Note, the argument **`-np or --noprint`** can be used to suppress any output below the code cell.

# In[12]:

get_ipython().run_cell_magic('stata', '-d car_df -o car_df -np', '\ngenerate weight_squared = weight^2\ngenerate log_weight = log(weight)\n')

# In[13]:

car_df.head(3)

# ## Set a custom working directory for this Stata code cell

# ### Using a directory defined in a variable (this is useful if you need it for many cells)

# In[14]:

directory = '~/sandbox'

# In[15]:

get_ipython().run_cell_magic('stata', '-cwd directory -np', 'display "`c(pwd)\'"\n')

# ### It is also possible to provide the directory as an argument

# In[16]:

get_ipython().run_cell_magic('stata', "-cwd '~/sandbox' -np", 'display "`c(pwd)\'"\n')

# ## An example case

# Create the variable **`large`** in Python and use it as the dependent variable for a
# binary choice estimation by Stata.

# In[17]:

# A car is flagged as large (1) when headroom > 3 and length > 200, otherwise 0.
car_df['large'] = [
    1 if x > 3 and y > 200 else 0
    for x, y in zip(car_df['headroom'], car_df['length'])
]

# In[18]:

car_df[['headroom', 'length', 'large']].head(7)

# In[19]:

get_ipython().run_cell_magic('stata', '-d car_df -i main_var -i control_var', "\nlogit large `main_var' `control_var', vce(cluster make)\n")