Pandas set up

In [1]:
# Import libraries
from pandas import DataFrame
import pandas as pd
import datetime
%matplotlib inline

print 'Pandas version: ' + pd.__version__
Pandas version: 0.12.0
In [2]:
# Create dummy data for examples
df = DataFrame(data = {'Column1':[1,2,3],
                       'Column2':['a','b','c'],
                       'Column3':[pd.datetime(2013,1,1), pd.datetime(2013,1,1), pd.datetime(2013,1,1)],
                       'Column4':[1,2,3]})

df
Out[2]:
Column1 Column2 Column3 Column4
0 1 a 2013-01-01 00:00:00 1
1 2 b 2013-01-01 00:00:00 2
2 3 c 2013-01-01 00:00:00 3
In [3]:
# How to declare variables
# How to update variables
# How to update a table
# How to get current date, yesterday, last year
# How to get first of month or last day of month
# How to insert into a table from another table
# How to join two tables
# How to select n number of rows
# How to select rows in ascending/descending order
# How to select unique vales (no dups)
# How to write a case statement within an update
# How to check for NULL values
# How to use the Keyword "IN"
# How to count all of the rows in a table
# How to delete contents of a table
# How to select the smallest/largest value in a column
# How to string match

How to declare variables

In [4]:
## SQL

"""
Declare @Var1 datetime
Declare @Var2 char(20)
Declare @Var3 nvarchar(20)
Declare @Var4 int
Declare @Var5 decimal (18,2)
Declare @Var6 bit
"""

## Pandas

Var1 = pd.to_datetime(datetime.date.today())
Var2 = 'hello'
Var3 = 1
Var4 = 1.5

How to update variables

In [5]:
## SQL

"""
Declare @Var2 char(20)
	Set @Var2 = 'hello'
"""

## Pandas

Var2 = 'hi' #declare
Var2 = 'hello' # update

How to update a table

In [6]:
## SQL

"""
--ver1:
UPDATE tblName
	SET Column1 = 1,
	Column2 = 'NERevenueCode',
	Column3 = getdate()
WHERE Column4 = 'NEAccSummaryLoadMain'

--ver2:
UPDATE cc
	SET cc.Column1 = 1,
	cc.Column2 = 'NERevenueCode',
	cc.Column3 = getdate()
FROM tblName cc
WHERE cc.Column4 = 'NEAccSummaryLoadMain'
"""

## Pandas

# Create where statement
# mask = where Column4 is greater than 1
mask = df['Column4'] > 1

# Update Column where Column4 is greater than 1
df['Column1'][mask] = 1
df['Column2'][mask] = 'abc'
df['Column3'][mask] = pd.to_datetime(datetime.date.today())
df
Out[6]:
Column1 Column2 Column3 Column4
0 1 a 2013-01-01 00:00:00 1
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3

How to get current date, yesterday, last year

In [7]:
## SQL

"""
-- Date functions
SELECT getdate() -- Todays date with time stamp
SELECT dateadd (dd, datediff (dd, 0, GetDate()), 0) --Todays date with no time stamp 
SELECT dateadd (dd, datediff (dd, 0, GetDate()), -1) --Yesterdays date with no time stamp 
SELECT DateAdd (yy, -1, DateDiff(dd, 0, GetDate())) --One year ago no timestamp
"""

## Pandas

TodayTimeStamp = pd.to_datetime(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
Today = pd.to_datetime(datetime.datetime.now().strftime("%Y-%m-%d"))
YesterdayTimeStamp = pd.to_datetime(datetime.datetime.today()-datetime.timedelta(days=1))
Yesterday = pd.to_datetime(datetime.date.today()-datetime.timedelta(days=1))
OneYearTimeStamp = pd.to_datetime(datetime.datetime.today().replace(year=datetime.datetime.today().year - 1))
OneYear = pd.to_datetime(datetime.date.today().replace(year=datetime.date.today().year - 1))

print 'TodayTimeStamp ' + str(TodayTimeStamp)
print 'Today ' + str(Today)
print 'YesterdayTimeStamp ' + str(YesterdayTimeStamp)
print 'Yesterday ' + str(Yesterday)
print 'OneYearTimeStamp ' + str(OneYearTimeStamp)
print 'OneYear ' + str(OneYear)
TodayTimeStamp 2013-10-02 10:34:34
Today 2013-10-02 00:00:00
YesterdayTimeStamp 2013-10-01 10:34:34.188000
Yesterday 2013-10-01 00:00:00
OneYearTimeStamp 2012-10-02 10:34:34.188000
OneYear 2012-10-02 00:00:00

How to get first of month or last day of month

In [8]:
## SQL

"""
--Declare variables
Declare @BeginDate datetime
Declare @EndDate datetime

-- set @BeginDate = first day of last month
Set @BeginDate = DateAdd(mm, DateDiff(mm, 0, GetDate()) - 1, 0)

-- set @EndDate = last day of last month
Set @EndDate = DateAdd(dd, -1, (DateAdd(mm, 1, @BeginDate)))
"""

## Pandas

FirstDay = pd.to_datetime(datetime.date.today().replace(month=datetime.date.today().month, day=1))
LastDay = FirstDay.replace(month=FirstDay.month+1)-datetime.timedelta(days=1)

print 'FirstDay ' + str(FirstDay)
print 'LastDay ' + str(LastDay)
FirstDay 2013-10-01 00:00:00
LastDay 2013-10-31 00:00:00

How to insert into a table from another table

In [9]:
## SQL

"""
IF OBJECT_ID('tempdb..#tblName','u') IS NOT NULL
BEGIN
DROP TABLE #tblName
END
CREATE TABLE #tblName 
(
	Var1 varchar (25) PRIMARY KEY,
	Var2 datetime
)

IF OBJECT_ID('tempdb..#tblName2','u') IS NOT NULL
BEGIN
DROP TABLE #tblName2
END
CREATE TABLE #tblName2
(
	Var1 varchar (25) PRIMARY KEY,
	Var2 datetime
)

INSERT #tblName2 (Var1, Var2)
SELECT 'tom',
		getdate() 

INSERT #tblName (Var1, Var2)		
SELECT Var1,
		Var2
FROM #tblName2	

SELECT * FROM #tblName
"""

## Pandas

# Insert into new data frame using the data from df
df2 = df
df
Out[9]:
Column1 Column2 Column3 Column4
0 1 a 2013-01-01 00:00:00 1
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3

How to join two tables

In [10]:
## SQL

"""
-- Only returns if col1 matches col2
SELECT a.*
FROM tbl1 a
JOIN tbl2 b on (a.Col1 = b.Col2)

-- Returns all rows from tbl1 even if col1 does not match col2
SELECT a.*
FROM tbl1 a
LEFT JOIN tbl2 b on (a.Col1 = b.Col2)
"""

## Pandas

# inner join
df.merge(df2,left_on='Column1',right_on='Column1')

# left join
df.merge(df2,left_on='Column1',right_on='Column1',how='left')
Out[10]:
Column1 Column2_x Column3_x Column4_x Column2_y Column3_y Column4_y
0 1 a 2013-01-01 00:00:00 1 a 2013-01-01 00:00:00 1
1 1 a 2013-01-01 00:00:00 1 abc 2013-10-02 00:00:00 2
2 1 a 2013-01-01 00:00:00 1 abc 2013-10-02 00:00:00 3
3 1 abc 2013-10-02 00:00:00 2 a 2013-01-01 00:00:00 1
4 1 abc 2013-10-02 00:00:00 2 abc 2013-10-02 00:00:00 2
5 1 abc 2013-10-02 00:00:00 2 abc 2013-10-02 00:00:00 3
6 1 abc 2013-10-02 00:00:00 3 a 2013-01-01 00:00:00 1
7 1 abc 2013-10-02 00:00:00 3 abc 2013-10-02 00:00:00 2
8 1 abc 2013-10-02 00:00:00 3 abc 2013-10-02 00:00:00 3

How to select n number of rows

In [11]:
## SQL

"""
-- selects only n number of rows
-- does not mean they will be in asc or desc order
SELECT top 500 Col1
FROM tbl1
"""

## Pandas

df.head(500)
df.tail(500)
Out[11]:
Column1 Column2 Column3 Column4
0 1 a 2013-01-01 00:00:00 1
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3

How to select rows in ascending/descending order

In [12]:
## SQL

"""
-- Select rows in ascending order
SELECT *
FROM tbl1
ORDER BY Col1 ASC

-- Select rows in descending order
SELECT *
FROM tbl1
ORDER BY Col1 DESC
"""

## Pandas

# order by Column4 ascending
df.sort(ascending=True,columns=['Column4'])

# order by Column4 descending
df.sort(ascending=False,columns=['Column4'])
Out[12]:
Column1 Column2 Column3 Column4
2 1 abc 2013-10-02 00:00:00 3
1 1 abc 2013-10-02 00:00:00 2
0 1 a 2013-01-01 00:00:00 1

How to select unique vales (no dups)

In [13]:
# SQL

"""
SELECT DISTINCT Col1
FROM tbl1
"""

# Pandas

df['Column1'].unique()
Out[13]:
array([1], dtype=int64)

How to write a case statement within an update

In [14]:
## SQL

"""
UPDATE dr
      SET dr.Col1 =
			CASE
				WHEN (SELECT @Var1) > 0 THEN 'Residential'
				ELSE 'Commercial'
			END
      FROM tblName dr
"""

## Pandas

# Set Column1 = 10 if value is equal to 2
# Set Column1 = 20 if value is equal to 3
# else Set Column1 = its current value
df['Column1'].apply(lambda x: 10 if x==2 else (20 if x==3 else x))
Out[14]:
0    1
1    1
2    1
Name: Column1, dtype: int64

How to check for NULL values

In [15]:
## SQL

"""
--function ISNULL (Var1, 0)	
-- if Var1 is null then assign it a zero, or whatever you want
SELECt ISNULL (Col1, 0) FROM tblName

-- you can also just check the column without modifying it
SELECT * 
FROM tblName
WHERE Col1 IS NOT NULL -- select everything where Col1 is not null

SELECT * 
FROM tblName
WHERE Col1 IS NULL -- select everything where Col1 is null
"""

## Pandas

# Only return non null values
df.dropna()

# Check if column has any null values
df['Column1'].isnull()

# Only return rows that are not null
df[~df['Column1'].isnull()]

# Fill null values with a zero
df.fillna(0)
Out[15]:
Column1 Column2 Column3 Column4
0 1 a 2013-01-01 00:00:00 1
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3

How to use the Keyword "IN"

In [16]:
## SQL

"""
SELECT * FROM tbl WHERE Col1 IN (1,2,3)
"""

## Pandas

mask = df['Column1'].isin([1,2,3])
df[mask]
Out[16]:
Column1 Column2 Column3 Column4
0 1 a 2013-01-01 00:00:00 1
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3

How to count all of the rows in a table

In [17]:
## SQL

"""
SELECT COUNT(*) FROM tblName
"""

## Pandas

df.count()
Out[17]:
Column1    3
Column2    3
Column3    3
Column4    3
dtype: int64

How to delete contents of a table

In [18]:
## SQL

"""
DELETE tblName -- deletes all contents of table

DELETE tblName -- deletes only rows where col1 = 0
WHERE col1 = 0
"""

## Pandas

# delete any rows where column Var2 is less than 1/2/2013
mask = df['Column3'] > pd.to_datetime('2013-1-1')
df[mask]
Out[18]:
Column1 Column2 Column3 Column4
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3

How to select the smallest/largest value in a column

In [19]:
## SQL

"""
SELECT MIN (col1) FROM tblName
SELECT MAX (col1) FROM tblName
"""

## Pandas

df.max()
df.min()
Out[19]:
Column1                      1
Column2                      a
Column3    2013-01-01 00:00:00
Column4                      1
dtype: object

How to string match

In [20]:
## SQL

"""
SELECT * 
FROM tblName
WHERE Col1 LIKE '%StringYouAreSearchingFor%' 

SELECT * 
FROM tblName
WHERE Col1 NOT LIKE '%StringYouAreSearchingFor%'
"""

## Pandas

mask1 = df['Column2'].str.startswith('b')
mask2 = df['Column2'].str.endswith('a')
mask3 = df['Column2'].str.contains('c')

df[mask3]
Out[20]:
Column1 Column2 Column3 Column4
1 1 abc 2013-10-02 00:00:00 2
2 1 abc 2013-10-02 00:00:00 3