Representing Data in R -- Python equivalent

In [1]:

import pandas as pd
import numpy as np

In [2]:

# R's 'character' type corresponds to a Python string (str).
firstName = 'jeff'
# Formatting into one string and passing it to print(...) gives the same text
# under the Python 2 print statement and the Python 3 print function.
print('%s %s' % (type(firstName), firstName))

<type 'str'> jeff

In [3]:

# R's 'numeric' type corresponds to a Python float.
heightCM = 188.2
# Formatting into one string and passing it to print(...) gives the same text
# under the Python 2 print statement and the Python 3 print function.
print('%s %s' % (type(heightCM), heightCM))

<type 'float'> 188.2

In [4]:

# R's 'integer' type corresponds to a Python int.
numberSons = 1
# Formatting into one string and passing it to print(...) gives the same text
# under the Python 2 print statement and the Python 3 print function.
print('%s %s' % (type(numberSons), numberSons))

<type 'int'> 1

In [5]:

# R's 'logical' type corresponds to a Python bool.
teachingCoursera = True
# Formatting into one string and passing it to print(...) gives the same text
# under the Python 2 print statement and the Python 3 print function.
print('%s %s' % (type(teachingCoursera), teachingCoursera))

<type 'bool'> True

In [6]:

# R's 'vectors' correspond to NumPy arrays or Python lists
# (arrays are used everywhere in this notebook for consistency).
heights = np.array([188.2, 181.3, 193.4])
# A single parenthesised argument prints identically whether print is the
# Python 2 statement or the Python 3 function.
print(heights)

firstNames = np.array(['jeff', 'roger', 'andrew', 'brian'])
print(firstNames)

[ 188.2  181.3  193.4]
['jeff' 'roger' 'andrew' 'brian']

In [7]:

# R's 'list' corresponds to a Python dictionary. Unlike a data frame, the
# values do not need matching lengths (3 heights vs. 4 names here).
vector1 = np.array([188.2, 181.3, 193.4])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])
myList = dict(heights = vector1, firstNames = vector2)
# A single parenthesised argument prints identically whether print is the
# Python 2 statement or the Python 3 function.
print(myList)

# Elements are looked up by key, like myList$heights in R.
print(myList['heights'])
print(myList['firstNames'])

{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], 
      dtype='|S6'), 'heights': array([ 188.2,  181.3,  193.4])}
[ 188.2  181.3  193.4]
['jeff' 'roger' 'andrew' 'brian']

In [8]:

# R's 'matrix' corresponds to a two-dimensional NumPy array
# (built here from a list of rows).
myMatrix = np.array([[1, 2], [3, 4]])
# A single parenthesised argument prints identically whether print is the
# Python 2 statement or the Python 3 function.
print(myMatrix)

[[1 2]
 [3 4]]

In [10]:

# R's data frame corresponds to a pandas DataFrame.
# This example fails on purpose: vector1 has 3 entries but vector2 has 4.
vector1 = np.array([188.2, 181.3, 193.4])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])

# ValueError: arrays must all be same length
# 
myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-58e1535d1fac> in <module>()
      6 # ValueError: arrays must all be same length
      7 #
----> 8 myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))

/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    383             mgr = self._init_mgr(data, index, columns, dtype=dtype, copy=copy)
    384         elif isinstance(data, dict):
--> 385             mgr = self._init_dict(data, index, columns, dtype=dtype)
    386         elif isinstance(data, ma.MaskedArray):
    387             mask = ma.getmaskarray(data)

/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    515 
    516         return _arrays_to_mgr(arrays, data_names, index, columns,
--> 517                               dtype=dtype)
    518 
    519     def _init_ndarray(self, values, index, columns, dtype=None,

/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5343     # figure out the index, if necessary
   5344     if index is None:
-> 5345         index = extract_index(arrays)
   5346     else:
   5347         index = _ensure_index(index)

/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in extract_index(data)
   5395             lengths = list(set(raw_lengths))
   5396             if len(lengths) > 1:
-> 5397                 raise ValueError('arrays must all be same length')
   5398 
   5399             if have_dicts:

ValueError: arrays must all be same length

In [11]:

# Data frame, fixed: both columns now hold four entries, so the
# DataFrame can be constructed.
vector1 = np.array([188.2, 181.3, 193.4, 192.3])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])

# One equal-length array per column, keyed by column name.
myDataFrame = pd.DataFrame({'heights': vector1, 'firstNames': vector2})
myDataFrame

Out[11]:

	firstNames	heights
0	jeff	188.2
1	roger	181.3
2	andrew	193.4
3	brian	192.3

In [12]:

# R's factor corresponds to a pandas Categorical.
smoker = np.array(['yes', 'no', 'yes', 'yes'])
# Categorical.from_array() was deprecated and later removed from pandas;
# the constructor infers the categories from the distinct values itself.
smokerFactor = pd.Categorical(smoker)
smokerFactor

Out[12]:

Categorical: 
array(['yes', 'no', 'yes', 'yes'], dtype=object)
Levels (2): Index(['no', 'yes'], dtype=object)

In [13]:

# R's NA (missing value) corresponds to NaN in NumPy.
# nan / isnan are taken from the np namespace: the bare names NaN and isnan
# are not defined by this notebook's imports (presumably they came from a
# pylab-style star import) and raise NameError on a fresh kernel.
vector1 = np.array([188.2, 181.3, 193.4, np.nan])
print(vector1)
# Element-wise missing-value test, like is.na() in R.
print(np.isnan(vector1))

[ 188.2  181.3  193.4    nan]
[False False False  True]

In [14]:

# Subsetting: single element, several positions, row/column slice,
# single column, and boolean-mask row selection.
vector1 = np.array([188.2, 181.3, 193.4, 192.3])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])

myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))

# Visual divider between the individual results.
separator = '------------------'

print(separator)
# First element (Python indexing is 0-based, R's is 1-based).
print(vector1[0])
print(separator)
# Several elements at once, selected by a list of positions.
print(vector1[[0, 1, 3]])
print(separator)
# First row, first two columns. The .ix indexer was deprecated and removed
# from pandas; .iloc is the purely positional equivalent for this selection.
# The result appears transposed as compared to R.
print(myDataFrame.iloc[0, 0:2])
print(separator)
# A single column by name; there's no 'Levels' as in R.
print(myDataFrame['firstNames'])
print(separator)
# Rows where a column equals a value.
print(myDataFrame[myDataFrame['firstNames'] == 'jeff'])
print(separator)
# Rows where a numeric column satisfies a condition.
print(myDataFrame[myDataFrame['heights'] < 190])

------------------
188.2
------------------
[ 188.2  181.3  192.3]
------------------
firstNames     jeff
heights       188.2
Name: 0
------------------
0      jeff
1     roger
2    andrew
3     brian
Name: firstNames
------------------
  firstNames  heights
0       jeff    188.2
------------------
  firstNames  heights
0       jeff    188.2
1      roger    181.3

In [ ]: