import pandas as pd
import numpy as np
# R's 'character' type corresponds to Python's str.
firstName = 'jeff'
# One pre-formatted argument in parentheses: valid on Python 2 and 3, and
# prints the same space-separated text as `print a, b` did on Python 2.
print('%s %s' % (type(firstName), firstName))
<type 'str'> jeff
# R's 'numeric' type corresponds to Python's float.
heightCM = 188.2
# Formatted into a single string so the call parses on Python 2 and 3 alike.
print('%s %s' % (type(heightCM), heightCM))
<type 'float'> 188.2
# R's 'integer' type corresponds to Python's int.
numberSons = 1
# Formatted into a single string so the call parses on Python 2 and 3 alike.
print('%s %s' % (type(numberSons), numberSons))
<type 'int'> 1
# R's 'logical' type corresponds to Python's bool.
teachingCoursera = True
# Formatted into a single string so the call parses on Python 2 and 3 alike.
print('%s %s' % (type(teachingCoursera), teachingCoursera))
<type 'bool'> True
# R 'vectors' correspond to NumPy arrays or Python lists; arrays are used
# everywhere here for consistency.
heights = np.array([188.2, 181.3, 193.4])
# print(x) with a single argument behaves the same on Python 2 and 3.
print(heights)
firstNames = np.array(['jeff', 'roger', 'andrew', 'brian'])
print(firstNames)
[ 188.2 181.3 193.4] ['jeff' 'roger' 'andrew' 'brian']
# An R 'list' (named, mixed-length components) corresponds to a Python dict.
vector1 = np.array([188.2, 181.3, 193.4])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])
# The components may differ in length (3 heights, 4 names); a dict does not
# care.
myList = dict(heights=vector1, firstNames=vector2)
# print(x) with a single argument behaves the same on Python 2 and 3.
print(myList)
# Components are looked up by key.
print(myList['heights'])
print(myList['firstNames'])
{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], dtype='|S6'), 'heights': array([ 188.2, 181.3, 193.4])} [ 188.2 181.3 193.4] ['jeff' 'roger' 'andrew' 'brian']
# R 'matrices' correspond to two-dimensional NumPy arrays.
# Each inner list is one row of the matrix.
myMatrix = np.array([[1, 2], [3, 4]])
# print(x) with a single argument behaves the same on Python 2 and 3.
print(myMatrix)
[[1 2] [3 4]]
# An R data frame corresponds to a pandas DataFrame.
# This construction fails on purpose: the columns have different lengths
# (3 heights versus 4 names), and pandas rejects that with a ValueError.
vector1 = np.array([188.2, 181.3, 193.4])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])
try:
    myDataFrame = pd.DataFrame(dict(heights=vector1, firstNames=vector2))
except ValueError as err:
    # Report the expected failure instead of letting it propagate, so the
    # statements that follow still run when this file is executed as a script.
    print('ValueError: %s' % err)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-10-58e1535d1fac> in <module>() 6 # ValueError: arrays must all be same length 7 # ----> 8 myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2)) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy) 383 mgr = self._init_mgr(data, index, columns, dtype=dtype, copy=copy) 384 elif isinstance(data, dict): --> 385 mgr = self._init_dict(data, index, columns, dtype=dtype) 386 elif isinstance(data, ma.MaskedArray): 387 mask = ma.getmaskarray(data) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype) 515 516 return _arrays_to_mgr(arrays, data_names, index, columns, --> 517 dtype=dtype) 518 519 def _init_ndarray(self, values, index, columns, dtype=None, /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype) 5343 # figure out the index, if necessary 5344 if index is None: -> 5345 index = extract_index(arrays) 5346 else: 5347 index = _ensure_index(index) /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in extract_index(data) 5395 lengths = list(set(raw_lengths)) 5396 if len(lengths) > 1: -> 5397 raise ValueError('arrays must all be same length') 5398 5399 if have_dicts: ValueError: arrays must all be same length
# Data frame, corrected: both columns now hold four values, so the
# DataFrame can be constructed.
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])
vector1 = np.array([188.2, 181.3, 193.4, 192.3])
myDataFrame = pd.DataFrame({'heights': vector1, 'firstNames': vector2})
# Bare expression: an interactive session displays the frame as the cell's
# result; it has no effect when the file is run as a plain script.
myDataFrame
|   | firstNames | heights |
|---|---|---|
| 0 | jeff | 188.2 |
| 1 | roger | 181.3 |
| 2 | andrew | 193.4 |
| 3 | brian | 192.3 |
# R 'factors' correspond to pandas Categorical values.
smoker = np.array(['yes', 'no', 'yes', 'yes'])
# Use the Categorical constructor: Categorical.from_array was deprecated and
# later removed from pandas. The constructor infers the levels (categories)
# from the distinct values in the data.
smokerFactor = pd.Categorical(smoker)
# Bare expression: an interactive session displays the value as the cell's
# result; it has no effect when the file is run as a plain script.
smokerFactor
Categorical: array(['yes', 'no', 'yes', 'yes'], dtype=object) Levels (2): Index(['no', 'yes'], dtype=object)
# R's NA (missing value) corresponds to NaN in a floating-point NumPy array.
# NaN and isnan are referenced through the np namespace: only
# `import numpy as np` is in scope, so the bare names would raise NameError
# outside a pylab-style session.
vector1 = np.array([188.2, 181.3, 193.4, np.nan])
# print(x) with a single argument behaves the same on Python 2 and 3.
print(vector1)
# Element-wise test: True exactly where the value is missing.
print(np.isnan(vector1))
[ 188.2 181.3 193.4 nan] [False False False True]
# Subsetting: selecting elements of an array and rows/columns of a DataFrame.
vector1 = np.array([188.2, 181.3, 193.4, 192.3])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])
myDataFrame = pd.DataFrame(dict(heights=vector1, firstNames=vector2))
# Every print below takes one argument, so it behaves the same on Python 2
# and 3.
print('------------------')
# Single element by position (indices start at 0, unlike R).
print(vector1[0])
print('------------------')
# Several elements at once, selected with a list of positions.
print(vector1[[0, 1, 3]])
print('------------------')
# First row, first two columns, selected by position. .iloc replaces .ix,
# which has been removed from pandas; the frame uses the default integer
# index, so row position 0 is also row label 0.
print(myDataFrame.iloc[0, 0:2])  # appears transposed as compared to R
print('------------------')
# One column selected by name.
print(myDataFrame['firstNames'])  # there's no 'Levels' as in R
print('------------------')
# Boolean mask: rows whose firstNames value equals 'jeff'.
print(myDataFrame[myDataFrame['firstNames'] == 'jeff'])
print('------------------')
# Boolean mask: rows whose heights value is below 190.
print(myDataFrame[myDataFrame['heights'] < 190])
------------------ 188.2 ------------------ [ 188.2 181.3 192.3] ------------------ firstNames jeff heights 188.2 Name: 0 ------------------ 0 jeff 1 roger 2 andrew 3 brian Name: firstNames ------------------ firstNames heights 0 jeff 188.2 ------------------ firstNames heights 0 jeff 188.2 1 roger 181.3