#!/usr/bin/env python
# coding: utf-8

# Overview of indexing semantics when using `[]` (`__getitem__`).
#
# This does not yet deal with all the extra special cases (duplicate labels,
# non-monotonic indexes, contiguous or not, etc.).
#
# NOTE: this file is a notebook export. Cells are kept in their original
# order; a bare expression is a value the notebook displayed, and some cells
# are expected to raise as part of the demonstration.
# NOTE: `print(...)` is always called with a single argument so that it parses
# and behaves the same on Python 2 and Python 3.
# NOTE: `.ix` is kept on purpose as the reference `[]` is compared against.
# It was deprecated in pandas 0.20 and removed in pandas 1.0, so the `.ix`
# cells need an older pandas to run.

# In[1]:

import pandas as pd
import numpy as np


# In[2]:

# Two spaces after the colon reproduce the output of the original
# `print "pandas: ", pd.__version__` statement.
print("pandas:  {}".format(pd.__version__))


# In[3]:

# s_int: integer labels equal to the positions.
# s_int2: integer labels that differ from the positions (gap after 1).
s_int = pd.Series(range(5), index=[0, 1, 2, 3, 4])
s_int2 = pd.Series(range(5), index=[0, 1, 4, 5, 6])


# In[4]:

s_float = pd.Series(range(5), index=[0.0, 0.1, 0.2, 0.3, 0.4])


# In[5]:

s_date = pd.Series(range(5), index=pd.date_range('2012-01-01', periods=5))


# In[6]:

s_string = pd.Series(range(5), index=list('abcde'))


# ## Slicing

# Slicing is **integer location based** for an integer axis:

# In[7]:

print(s_int[0:3])
print(s_int.ix[0:3])
print(s_int.loc[0:3])
print(s_int.iloc[0:3])


# In[8]:

s_int2[0:3]


# But for an axis with a float type ... only **label based**:

# In[9]:

print(s_float[0:3])
print(s_float[0:0.3])


# In[10]:

print(s_float.ix[0:3])


# For other types, logically it is **integer location based** when having
# integer slice labels:

# In[11]:

s_date[0:3]


# In[12]:

s_string[0:3]


# and **label based** when having slice labels of the correct type:

# In[13]:

s_date["2012-01-01":"2012-01-03"]


# In[14]:

s_string["a":"c"]


# **Summary** for **slicing**:
#
# * Slicing with integer labels is:
#     * always *integer location based*
#     * except for a *float indexer* where it is label based
# * Slicing with other types of labels is always label based if it is of
#   appropriate type for the indexer.
#
# So, you can say that the behaviour is equivalent to `.ix`, except that the
# behaviour for integer labels is different for integer indexers (swapped).
# (For `.ix`, when having an integer axis, it is always label based and there
# is no fallback to integer location based).
# ## Single label

# The series used below (s_int, s_int2, s_float, s_date, s_string) are built
# in the setup cells earlier in this file.
# `print(...)` is always called with a single argument so that it parses and
# behaves the same on Python 2 and Python 3.

# In[15]:

print(s_int[4])
print(s_int2[4])


# In[16]:

print(s_int2[3])


# In[17]:

print(s_float[2])


# In[18]:

print(s_float[0.2])


# In[19]:

s_date["2012-01-03"]


# In[20]:

s_date[2]


# In[21]:

s_string["c"]


# In[22]:

s_string[2]


# **Summary** for **single label**:
#
# * Indexing with a single label is **always label based**
# * But, there is fallback to integer location based, except for integer and
#   float indexers

# ## List of labels

# In[23]:

s_int[[3, 4]]


# In[24]:

print(s_int2[[3, 4]])
print(s_int2.loc[[3, 4]])


# In[25]:

print(s_int2[[2, 3]])
print(s_int2.loc[[2, 3]])


# So with `[]` using a list is a pure reindex, also if no label of the list
# is found, you just get an all NaN series (which contrasts with `loc`, where
# at least one label should be found)

# In[26]:

print(s_int2[[8, 9]])


# In[27]:

s_float[[2, 3]]


# In[38]:

s_float.ix[[2, 3]]


# In[28]:

s_float[[0.2, 0.3]]


# So also for a float indexer, it is purely reindex, label based

# For a datetime index, it has also integer location fallback:

# In[29]:

s_date[[2, 3]]


# But now, the index values cannot be out of bound (which follows `iloc`):

# In[30]:

s_date[[3, 9]]


# And apparently indexing with a string does not work, when using lists:

# In[31]:

s_date


# In[32]:

s_date[['2012-01-03']]


# In[33]:

s_date['2012-01-03']


# In[34]:

s_date[['2012-01-03', '2012-01-04']]


# In[35]:

# `_` is IPython's "previous output" variable, i.e. the result displayed by
# the cell above; it is not defined when this file runs as a plain script.
_.index


# In[36]:

s_string[[2, 3]]


# In[37]:

s_string[["c", "f"]]


# **Summary** for indexing with **list of labels**:
#
# * It is primarily *label based*, but:
#     * There is fallback to integer location based, apart from an int/float
#       axis
#     * It is a pure reindex, also if no label of the list is found, you just
#       get an all NaN series (which contrasts with loc, where at least one
#       label should be found)
#     * String parsing for a datetime index does not seem to work
#
# This mainly follows `ix`, apart from points 2 and 3

# ## Boolean indexing

# In[39]:

s_int[[True, False, True, False, True]]


# It does not need to be of the correct length (as is the same
# with `ix/loc/iloc`):

# In[40]:

s_int[[True, False, True, False, True, False]]


# In[41]:

s_float[[True, False, True, False, True]]


# In[42]:

s_date[[True, False, True, False, True]]


# **Summary** for **boolean indexing**:
#
# * This is simple, it just works as expected
#

# ## Specialties for DataFrames

# In[43]:

# Four 5x5 frames holding the same values 0..24; they differ only in labels:
#   df   - default row and column labels
#   df2  - string column labels 'a'..'e'
#   df3  - float column labels
#   df3b - float column labels and float row labels
df = pd.DataFrame(np.arange(25).reshape(5,5))
df2 = pd.DataFrame(np.arange(25).reshape(5,5), columns=list('abcde'))
df3 = pd.DataFrame(np.arange(25).reshape(5,5), columns=[0.0,0.1,0.2,0.3,0.4])
df3b = pd.DataFrame(np.arange(25).reshape(5,5), columns=[0.0,0.1,0.2,0.3,0.4], index=[0.0,0.1,0.2,0.3,0.4])


# In[44]:

df


# Single label: 'information' axis (axis=1):

# In[45]:

df[0]


# In[46]:

df[5]


# But no fallback to integer location based when having a non-numeric index:

# In[47]:

df2[2]


# In[56]:

df2.ix[:,2]


# Slicing: rows (axis=0):

# In[48]:

df[0:2]


# In[49]:

df3[0:2]


# In[50]:

df3b[0:2]


# And this seems to follow the same peculiarities as series[]

# List of indexers is again axis=1:

# In[51]:

df[[1,2]]


# But now all labels must be present (no pure reindex as with series):

# In[52]:

df[[1,6]]


# In[53]:

df.loc[:,[1,6]]


# And also fallback to integer location:

# In[54]:

df2[[1,2]]


# Boolean indexing is again row (axis = 0) oriented:

# In[55]:

df[[True, False, False, True, False]]


# In[57]:

df2[[True, False, False, True, False]]


# **Summary for DataFrames**:
#
# * It uses the 'information' axis (axis 1) for:
#     * single labels
#     * list of labels
# * It uses the rows (axis 0) for:
#     * slicing
#     * boolean indexing
#
# This is as documented (only the boolean case is not explicitly documented
# I think).
#
# For the rest (on the chosen axis), it follows the same semantics as `[]` on
# a series, **but**:
#
# * for a list of labels, now all labels must be present (no pure reindex as
#   with series)
# * for single labels: no fallback to integer location based for non-numeric
#   index (but this *does* fallback for a list of labels ...)