# import some modules using standard naming conventions
from pylab import *
import pandas as pd
from IPython.core.display import HTML
from matplotlib.dates import MonthLocator,DateFormatter
# full path name of spreadsheet file from Coalition for Buzzards Bay
#xls_path='/usgs/data2/notebook/Falmouth_data.xlsx'
#xls_path='D:\crs\docs\personal\science_project\Falmouth data for Sarah and Lily_mod.xlsx'
xls_path='/usgs/data2/notebook/Falmouth_data_for_Sarah_and_Lily_mod.xlsx'
# use the panda ExcelFile method to make an object out of the xls file
xls_file = pd.ExcelFile(xls_path)
# this version makes a time series and removes some weird values
asn = xls_file.parse('all stations, raw nutrients',index_col='Date',parse_dates=True,
na_values=['?',None,'NS','no sample left','Sample Destroyed','machine error','#VALUE!'])
asn.index=pd.to_datetime(asn.index)
# this version just reads it in
#asn = xls_file.parse('all stations, raw nutrients') # simpler version
# this version makes the index 'Date', but does not make the DataFrame a time series
#asn = xls_file.parse('all stations, raw nutrients',
# na_values=['?',None,'NS','no sample left','Sample Destroyed','machine error','#VALUE!'])
#asn.set_index('Date',drop='false')
'all stations, raw nutrients'!$1:$1 True 'CBB WF2 nutrient data'!$4:$6 True
# define a function that looks in column var of a and replaces lim with val
def convert_detection_limit(a,var,lim,val):
try:
a[var][where(a[var]==lim)[0]] = val
except:
print 'No detection values found for %s' % var
return a
# for example, fix detction limits for 4 nutrients
asn=convert_detection_limit(asn,'PO4 uM','<0.1',0.05)
asn=convert_detection_limit(asn,'NH4 uM','<0.1',0.05)
asn=convert_detection_limit(asn,'NO3 uM','<0.05',0.025)
asn=convert_detection_limit(asn,'Pheo ug/L','<0.05',0.025)
# try to use same function to remove S from sample depths
asn=convert_detection_limit(asn,'Depth (m)','S',0.0)
asn=convert_detection_limit(asn,'Depth (m)','1M',1.0)
asn.ix[0:4,8:10].head()
Secchi m | Total depth m | |
---|---|---|
1993-07-15 | NaN | 3.7 |
1993-07-15 | NaN | 3.7 |
1993-08-01 | NaN | 3.5 |
1993-08-01 | NaN | 3.5 |
asn.tail()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 5 entries, 0001-255-255 00:00:00 to 0001-255-255 00:00:00 Columns: 242 entries, Embayment to None.210 dtypes: float64(214), object(28)
# find the rows for QH2
#find all the data at station
qh2=asn[asn['Station']=='QH2'] #quissett harbor, qh2
mg4=asn[asn['Station']=='MG4'] #megansett harbor, mg4
# subtract the mean Nitrate conc and plot
qh2m = qh2['NO3 uM'] - qh2['NO3 uM'].mean()
qh2m.plot(linestyle='none',marker='o')
# print just a few columns and render as HTML
new=mg4.reindex(columns=['Embayment','Station','Depth (m)','Temp C','Salt ppt','PO4 uM','POM uM','TDN uM'])
HTML(new[1:5].to_html())
#TODO - Also Remove 999 and <xx from columns
Embayment | Station | Depth (m) | Temp C | Salt ppt | PO4 uM | POM uM | TDN uM | |
---|---|---|---|---|---|---|---|---|
1992-09-10 | MEGANSETT HARBOR | MG4 | 0.15 | NaN | 30.7 | 0.4962779 | 3.48496 | 11.7205 |
1992-09-16 | MEGANSETT HARBOR | MG4 | 0.15 | NaN | 29.7 | 0.5571862 | 3.910275 | 13.0195 |
1992-09-16 | MEGANSETT HARBOR | MG4 | 999 | NaN | 29.7 | 0.5571862 | 3.965878 | 15.9115 |
1993-07-14 | MEGANSETT HARBOR | MG4 | 999 | NaN | 31.7 | 0.4228856 | 5.97483 | 10.4845 |
figure # if you don't use this, the xlabel and ylabel commands won't work
plot(qh2['POM uM'],qh2['NO3 uM'],linestyle='none',marker='o')
xlabel('POM uM')
ylabel('NO3 uM')
title('Scatter plot')
<matplotlib.text.Text at 0x5bf5a50>
# plot the first 31 rows of column labelled 'Salt ppt'
asn['Salt ppt'][0:30].plot()
<matplotlib.axes.AxesSubplot at 0x3f47490>
# These are the values used to calculate the Buzz. Bay Env. Index
#Oxygen saturation (mn of lowest 20%) 40 % 90 %
#Transparency 0.6 m 3 m
#Chlorophyll 10 ug/l 3 ug/l
#DIN 10 uM 1 uM
#Total Organic N 0.6 ppm 0.28 ppm
# Define a function to calculate index for transparency
def bbei(val):
val_0 = 0.6
val_100 = 3.0
# calculate according to formula
score = (log(val)-log(val_0))/(log(val_100)-log(val_0))
# limit score to be between 0 and 1
score = min(score,1.0)
score = max(score,0.0)
return score
print(bbei(2))
0.748070363587
# More general function for several parameters
def bbei(val,param):
# First test the first character of param to see
# what kind of measurement, the assing values
if(param[0].lower()=='o'):
# limits for Oxygen saturation
val_0 = 40.0
val_100 = 90.0
elif(param[0].lower()=='t'):
# limits for Transparency (secchi depth)
val_0 = 0.6
val_100 = 3.0
elif(param[0].lower()=='c'):
# limits for Chlorophyll
val_0 = 10.0
val_100 = 3.0
elif(param[0].lower()=='d'):
# limits for DIN
val_0 = 10.0
val_100 = 1.0
elif(param[0].lower()=='t'):
# limits for Total organic N
val_0 = 0.6
val_100 = 0.28
# calculate index, limit to between 0 and 1, and return
return max(min( (log(val)-log(val_0))/(log(val_100)-log(val_0)) ,1),0)
print(bbei(2,param='DIN'))
0.698970004336
# multiple scatter plots on the same page
figure
subplot(121)
plot( qh2['DIN uM'],qh2['TON uM'],linestyle='none',marker='o')
ylabel('TON uM')
xlabel('DIN uM')
subplot(122)
plot( qh2['TDN uM'],qh2['TON uM'],linestyle='none',marker='o')
xlabel('TDN uM')
<matplotlib.text.Text at 0x603dd50>
# include three Stations
wf=asn[asn['Station'].isin(['WF5','WF5pw','WF5pw/CBBWF2'])]
figure(figsize=(12,5))
wf['POM uM'].plot(linestyle='none',marker='o')
<matplotlib.axes.AxesSubplot at 0x5be1ed0>
type(wf['POM uM'])
pandas.core.series.TimeSeries
wf.values
array([[West Falmouth Hbr, WF5pw, Snug, ..., nan, nan, nan], [West Falmouth Hbr, WF5pw, Snug, ..., nan, nan, nan], [West Falmouth Hbr, WF5pw, Snug, ..., nan, nan, nan], ..., [West Falmouth, WF5pw/CBBWF2, Snug, ..., nan, nan, nan], [West Falmouth, WF5pw/CBBWF2, Snug, ..., nan, nan, nan], [West Falmouth, WF5pw/CBBWF2, Snug, ..., nan, nan, nan]], dtype=object)
wfd=wf[wf['Depth (m)'] >= 0.5]
wfd['Depth (m)'].values
array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.52, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.2, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=object)
# print just a few columns and render as HTML
new=wfd.reindex(columns=['Embayment','Station','Depth (m)','Temp C','Salt ppt','NO3 uM','PO4 uM','POM uM','TDN uM'])
HTML(new[100: ].to_html())
<class 'pandas.tseries.index.DatetimeIndex'> Length: 0, Freq: None, Timezone: None | Empty DataFrame |
model = pd.ols(x=wf['Year'],y=wf['NH4 uM'])
model
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 295 Number of Degrees of Freedom: 2 R-squared: 0.0071 Adj R-squared: 0.0037 Rmse: 1.1210 F-stat (1, 293): 2.0941, p-value: 0.1489 Degrees of Freedom: model 1, resid 293 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 0.0162 0.0112 1.45 0.1489 -0.0057 0.0381 intercept -31.1477 22.3587 -1.39 0.1646 -74.9708 12.6753 ---------------------------------End of Summary---------------------------------
figure(figsize=(12,5))
model.y_fitted.plot(marker='x')
wf['NH4 uM'].plot(linestyle='none',marker='o',title='NH4 (uM) in West Falmouth Harbor')
<matplotlib.axes.AxesSubplot at 0x5fc6e50>
wf_month=wf.groupby(lambda x: x.month).mean()
wf_month['DO mg/L'].head()
7 5.905087 8 5.617639 Name: DO mg/L, dtype: float64
asn['Station'].unique()
array([QH2, QH1, QH3, LSM1, GPP1, FLP1, MAC1, WF1pw, WF1, WF1pw/CBBWF2, WF5pw, WF5, WF5pw/CBBWF2, WF2pw, WF2, WF2pw/CBBWF4, WF3pw, WF3, WF3pw/CBBWF5, WF4pw, WF4, WF4pw/CBBWF1, WF6pw, WF6, WF6pw/CBBWF3, WF7pw, WF7, WF7pw/CBBWF9, WF8pw, WF8, nan, HB2, WH1, WH2, WH3, FC1, RH1, SQ2, SQ1, MG1, MG2, MG3, MG4, CBB1, MB1], dtype=object)
asn['Embayment'].unique()
array([QUISSETT HARBOR, Little Sipp Marsh, GUNNING POINT POND, Flume Pond, Mashapaquit Creek, West Falmouth Hbr, West Falmouth, HERRING BROOK, Wild Harbor, nan, Wild Harbor River, Fiddler Cove, Rands Harbor, SQUETEAGUE HARBOR, MEGANSETT HARBOR, CBB BUOY, Manomet Bay], dtype=object)
pd.__version__
'0.11.0'