#!/usr/bin/env python
# coding: utf-8

# Overview of indexing semantics when using `[]` (`__getitem__`).
# 
# This does not yet deal with all the extra special cases (such as duplicate labels, non-monotonic indexes, contiguous or not, etc.).

# In[1]:


import pandas as pd
import numpy as np


# In[2]:


# Report the pandas version the examples below are run against.
# A single pre-built string is passed to print(...) so the statement is valid
# and prints the same text under both Python 2 and Python 3 (the original
# `print` statement is a SyntaxError on Python 3).  The two spaces after the
# colon reproduce the output of the original statement exactly.
print("pandas:  " + pd.__version__)


# In[3]:


# Two integer-labelled series holding the values 0..4: one with contiguous
# labels 0..4, one whose labels skip 2 and 3 so that label and position differ.
s_int = pd.Series(data=range(5), index=[0, 1, 2, 3, 4])
s_int2 = pd.Series(data=range(5), index=[0, 1, 4, 5, 6])


# In[4]:


# Same values, labelled with floats.
s_float = pd.Series(data=range(5), index=[0.0, 0.1, 0.2, 0.3, 0.4])


# In[5]:


# Same values, labelled with five consecutive dates starting 2012-01-01.
s_date = pd.Series(
    data=range(5),
    index=pd.date_range(start='2012-01-01', periods=5),
)


# In[6]:


# Same values, labelled with the single-character strings 'a'..'e'.
s_string = pd.Series(data=range(5), index=list('abcde'))


# ## Slicing

# Slicing is **integer location based** for an integer axis:

# In[7]:


# Apply the same integer slice to the contiguous integer index through plain
# `[]` and through each explicit indexer, so the results can be compared.
# Each print(...) receives a single argument, which keeps the output identical
# under Python 2 and Python 3 (the original `print` statements are a
# SyntaxError on Python 3).
print(s_int[0:3])
print(s_int.ix[0:3])
print(s_int.loc[0:3])
print(s_int.iloc[0:3])


# In[8]:


# Same slice on the integer index with a gap (labels 0, 1, 4, 5, 6), where
# 3 is a valid position but not an existing label.
s_int2[0:3]


# But for an axis with a float type ... only **label based**:

# In[9]:


# Slice the float-labelled series (labels 0.0 .. 0.4) first with integer
# bounds, then with an integer/float pair of bounds.
# Single-argument print(...) calls behave the same under Python 2 and
# Python 3 (the original `print` statements are a SyntaxError on Python 3).
print(s_float[0:3])
print(s_float[0:0.3])


# In[10]:


# The integer-bound slice again, this time through `.ix`.
print(s_float.ix[0:3])


# For other types, logically it is **integer location based** when having integer slice labels:

# In[11]:


# Integer slice bounds on the datetime-indexed series.
s_date[0:3]


# In[12]:


# Integer slice bounds on the string-indexed series.
s_string[0:3]


# and **label based** when having slice labels of the correct type:

# In[13]:


# Date strings as slice bounds on the datetime-indexed series.
s_date["2012-01-01":"2012-01-03"]


# In[14]:


# String labels as slice bounds on the string-indexed series.
s_string["a":"c"]


# **Summary** for **slicing**:
#     
# * Slicing with integer labels is:
#     * always *integer location based*
#     * except for a *float indexer* where it is label based
# * Slicing with other types of labels is always label based if it is of appropriate type for the indexer.
# 
# So, you can say that the behaviour is equivalent to `.ix`, except that the behaviour for integer labels is different for integer indexers (swapped). (For `.ix`, when having an integer axis, slicing is always label based, with no fallback to integer location based.)

# ## Single label

# In[15]:


# Key 4 on both integer-labelled series.  In s_int label 4 sits at position 4;
# in s_int2 (labels 0, 1, 4, 5, 6) label 4 sits at position 2 while position 4
# carries label 6, so the result shows which interpretation is used.
# Single-argument print(...) calls behave the same under Python 2 and
# Python 3 (the original `print` statements are a SyntaxError on Python 3).
print(s_int[4])
print(s_int2[4])


# In[16]:


# 3 is a valid position in s_int2 but not one of its labels.
print(s_int2[3])


# In[17]:


# 2 is a valid position in s_float but not one of its labels.
print(s_float[2])


# In[18]:


# 0.2 is an existing label of s_float.
print(s_float[0.2])


# In[19]:


# A date string used as a single key on the datetime-indexed series.
s_date["2012-01-03"]


# In[20]:


# An integer key on the datetime-indexed series; 2 is not one of its labels.
s_date[2]


# In[21]:


# An existing string label on the string-indexed series.
s_string["c"]


# In[22]:


# An integer key on the string-indexed series; 2 is not one of its labels.
s_string[2]


# **Summary** for **single label**:
#     
# * Indexing with a single label is **always label based**
# * But, there is fallback to integer location based, except for integer and float indexers

# ## List of labels

# In[23]:


# A list of keys that are all existing labels of the contiguous integer index.
s_int[[3,4]]


# In[24]:


# s_int2 has labels 0, 1, 4, 5, 6: here 4 is a label but 3 is not.
# Single-argument print(...) calls behave the same under Python 2 and
# Python 3 (the original `print` statements are a SyntaxError on Python 3).
print(s_int2[[3,4]])
print(s_int2.loc[[3,4]])


# In[25]:


# Neither 2 nor 3 is a label of s_int2.
print(s_int2[[2, 3]])
print(s_int2.loc[[2,3]])


# So with `[]`, using a list is a pure reindex: even if no label of the list is found, you just get an all-NaN series (which contrasts with `loc`, where at least one label should be found)

# In[26]:


# 8 and 9 are neither labels nor valid positions of s_int2.
print(s_int2[[8,9]])


# In[27]:


# Integer keys on the float-labelled series; 2 and 3 are not among its labels.
s_float[[2,3]]


# In[38]:


# The same integer keys through `.ix`.
s_float.ix[[2,3]]


# In[28]:


# Keys that are existing float labels.
s_float[[0.2,0.3]]


# So also for a float indexer, it is purely reindex, label based

# For a datetime index, it has also integer location fallback:

# In[29]:


# Integer keys on the datetime-indexed series; both are valid positions.
s_date[[2,3]]


# But now, the index values cannot be out of bound (which follows `iloc`):

# In[30]:


# 9 is past the end of the five-element series.
s_date[[3,9]]


# And apparently indexing with a string does not work, when using lists:

# In[31]:


# Show the series itself for reference.
s_date


# In[32]:


# A one-element list holding a date string.
s_date[['2012-01-03']]


# In[33]:


# The same date string as a single (non-list) key, for comparison.
s_date['2012-01-03']


# In[34]:


# A list of two date strings as keys on the datetime-indexed series.
s_date[['2012-01-03', '2012-01-04']]


# In[35]:


# Inspect the index of that same lookup.  The original cell used IPython's
# `_` (the last displayed result), which is not defined when this file is run
# as a plain Python script; spelling the expression out works in both
# environments and refers to the same result.
s_date[['2012-01-03', '2012-01-04']].index


# In[36]:


# Integer keys on the string-indexed series; they are valid positions but not labels.
s_string[[2,3]]


# In[37]:


# "c" is a label of s_string (labels 'a'..'e'), "f" is not.
s_string[["c", "f"]]


# **Summary** for indexing with **list of labels**:
# 
# * It is primarily *label based*, but:
#     * There is fallback to integer location based, except for an integer or float axis
#     * It is a pure reindex, also if no label of the list is found, you just get an all NaN series (which contrasts with loc, where at least one label should be found)
#     * String parsing for a datetime index does not seem to work
# 
# This mainly follows `ix`, apart from points 2 and 3

# ## Boolean indexing

# In[39]:


# Boolean mask with one entry per element of the five-element series.
s_int[[True, False, True, False, True]]


# It does not need to be of the correct length (as is the same with `ix/loc/iloc`):

# In[40]:


# Mask with six entries, one more than the series has elements.
s_int[[True, False, True, False, True, False]]


# In[41]:


# Five-entry mask on the float-labelled series.
s_float[[True, False, True, False, True]]


# In[42]:


# Five-entry mask on the datetime-indexed series.
s_date[[True, False, True, False, True]]


# **Summary** for **boolean indexing**:
# 
# * This is simple, it just works as expected
# 

# ## Specialties for DataFrames

# In[43]:


# Four 5x5 frames over the values 0..24.  Each frame is built from its own
# freshly created array; they differ only in their axis labels.

# No explicit labels on either axis.
df = pd.DataFrame(data=np.arange(25).reshape((5, 5)))

# String column labels 'a'..'e'.
df2 = pd.DataFrame(
    data=np.arange(25).reshape((5, 5)),
    columns=list('abcde'),
)

# Float column labels.
df3 = pd.DataFrame(
    data=np.arange(25).reshape((5, 5)),
    columns=[0.0, 0.1, 0.2, 0.3, 0.4],
)

# Float labels on both the columns and the rows.
df3b = pd.DataFrame(
    data=np.arange(25).reshape((5, 5)),
    index=[0.0, 0.1, 0.2, 0.3, 0.4],
    columns=[0.0, 0.1, 0.2, 0.3, 0.4],
)


# In[44]:


# Show the frame that was built without explicit row or column labels.
df


# Single label: 'information' axis (axis=1):

# In[45]:


# Integer key on the frame without explicit labels.
df[0]


# In[46]:


# df was built from a 5x5 array, so it has only five columns.
df[5]


# But no fallback to integer location based when having a non-numeric index:

# In[47]:


# df2's columns are the strings 'a'..'e', so 2 is not a column label.
df2[2]


# In[56]:


# The same integer column key through `.ix`, for comparison.
df2.ix[:,2]


# Slicing: rows (axis=0):

# In[48]:


# Integer slice on the frame without explicit labels.
df[0:2]


# In[49]:


# Same slice on the frame with float column labels (no explicit row labels).
df3[0:2]


# In[50]:


# Same slice on the frame with float labels on both rows and columns.
df3b[0:2]


# And this seems to follow the same peculiarities as series[]

# List of indexers is again axis=1:

# In[51]:


# List of integer keys on the frame without explicit labels.
df[[1,2]]


# But now all labels must be present (no pure reindex as with series):

# In[52]:


# df has only five columns, so 6 does not refer to any of them.
df[[1,6]]


# In[53]:


# The same keys on the column axis through `.loc`, for comparison.
df.loc[:,[1,6]]


# And also fallback to integer location:

# In[54]:


# Integer keys on df2, whose columns are the strings 'a'..'e'.
df2[[1,2]]


# Boolean indexing is again row (axis = 0) oriented:

# In[55]:


# Five-entry boolean mask on the frame without explicit labels.
df[[True, False, False, True, False]]


# In[57]:


# The same mask on the frame with string column labels.
df2[[True, False, False, True, False]]


# **Summary for DataFrames**:
# 
# * It uses the 'information' axis (axis 1) for:
#     * single labels
#     * list of labels
# * It uses the rows (axis 0) for:
#     * slicing
#     * boolean indexing
# 
# This is as documented (only the boolean case is not explicitly documented I think).
# 
# For the rest (on the chosen axis), it follows the same semantics as `[]` on a series, **but**:
# 
# * for a list of labels, now all labels must be present (no pure reindex as with series)
# * for single labels: no fallback to integer location based for non-numeric index (but this *does* fallback for a list of labels ...)