#!/usr/bin/env python # coding: utf-8 # # Reading netCDF data # - requires [numpy](http://numpy.scipy.org) and netCDF/HDF5 C libraries. # - Github site: https://github.com/Unidata/netcdf4-python # - Online docs: http://unidata.github.io/netcdf4-python/ # - Based on Konrad Hinsen's old [Scientific.IO.NetCDF](http://dirac.cnrs-orleans.fr/plone/software/scientificpython/) API, with lots of added netcdf version 4 features. # - Developed by Jeff Whitaker at NOAA, with many contributions from users. # ## Interactively exploring a netCDF File # # Let's explore a netCDF file from the *Atlantic Real-Time Ocean Forecast System* # # first, import netcdf4-python and numpy # In[1]: import netCDF4 import numpy as np # ## Create a netCDF4.Dataset object # - **`f`** is a `Dataset` object, representing an open netCDF file. # - printing the object gives you summary information, similar to *`ncdump -h`*. # In[2]: f = netCDF4.Dataset('data/rtofs_glo_3dz_f006_6hrly_reg3.nc') print(f) # ## Access a netCDF variable # - variable objects stored by name in **`variables`** dict. # - print the variable yields summary info (including all the attributes). # - no actual data read yet (just have a reference to the variable object with metadata). # In[3]: print(f.variables.keys()) # get all variable names temp = f.variables['temperature'] # temperature variable print(temp) # ## List the Dimensions # # - All variables in a netCDF file have an associated shape, specified by a list of dimensions. # - Let's list all the dimensions in this netCDF file. # - Note that the **`MT`** dimension is special (*`unlimited`*), which means it can be appended to. # In[4]: for d in f.dimensions.items(): print(d) # Each variable has a **`dimensions`** and a **`shape`** attribute. # In[5]: temp.dimensions # In[6]: temp.shape # ### Each dimension typically has a variable associated with it (called a *coordinate* variable). # - *Coordinate variables* are 1D variables that have the same name as dimensions. 
# - Coordinate variables and *auxiliary coordinate variables* (named by the *coordinates* attribute) locate values in time and space. # In[7]: mt = f.variables['MT'] depth = f.variables['Depth'] x,y = f.variables['X'], f.variables['Y'] print(mt) print(x) # ## Accessing data from a netCDF variable object # # - netCDF variables objects behave much like numpy arrays. # - slicing a netCDF variable object returns a numpy array with the data. # - Boolean array and integer sequence indexing behaves differently for netCDF variables than for numpy arrays. Only 1-d boolean arrays and integer sequences are allowed, and these indices work independently along each dimension (similar to the way vector subscripts work in fortran). # In[8]: time = mt[:] # Reads the netCDF variable MT, array of one element print(time) # In[9]: dpth = depth[:] # examine depth array print(dpth) # In[10]: xx,yy = x[:],y[:] print('shape of temp variable: %s' % repr(temp.shape)) tempslice = temp[0, dpth > 400, yy > yy.max()/2, xx > xx.max()/2] print('shape of temp slice: %s' % repr(tempslice.shape)) # ## What is the sea surface temperature and salinity at 50N, 140W? # ### Finding the latitude and longitude indices of 50N, 140W # # - The `X` and `Y` dimensions don't look like longitudes and latitudes # - Use the auxilary coordinate variables named in the `coordinates` variable attribute, `Latitude` and `Longitude` # In[11]: lat, lon = f.variables['Latitude'], f.variables['Longitude'] print(lat) # Aha! So we need to find array indices `iy` and `ix` such that `Latitude[iy, ix]` is close to 50.0 and `Longitude[iy, ix]` is close to -140.0 ... # In[12]: # extract lat/lon values (in degrees) to numpy arrays latvals = lat[:]; lonvals = lon[:] # a function to find the index of the point closest pt # (in squared distance) to give lat/lon value. 
def getclosest_ij(lats,lons,latpt,lonpt):
    """Return the (iy, ix) index of the grid point nearest (latpt, lonpt).

    Nearness is measured as squared distance in degrees of latitude and
    longitude; ties resolve to the first minimum in flattened order.
    """
    # squared degree-distance from every grid point to the target,
    # collapsed straight to the flattened index of the minimum
    flat_nearest = ((lats - latpt)**2 + (lons - lonpt)**2).argmin()
    # translate the flattened index back into a 2D (row, col) pair
    return np.unravel_index(flat_nearest, lats.shape)

iy_min, ix_min = getclosest_ij(latvals, lonvals, 50., -140)

# ### Now we have all the information we need to find our answer.
#
# ```
# |----------+--------|
# | Variable |  Index |
# |----------+--------|
# | MT       |      0 |
# | Depth    |      0 |
# | Y        | iy_min |
# | X        | ix_min |
# |----------+--------|
# ```

# ### What is the sea surface temperature and salinity at the specified point?

# In[13]:

sal = f.variables['salinity']
# read values out of the netCDF file for temperature and salinity
print('%7.4f %s' % (temp[0,0,iy_min,ix_min], temp.units))
print('%7.4f %s' % (sal[0,0,iy_min,ix_min], sal.units))
# In[14]:

import datetime
date = datetime.datetime.now()
# keep moving back 6 hours until a valid URL is found (at most 10 tries)
validURL = False; ncount = 0
while (not validURL and ncount < 10):
    # build the URL for the latest synoptic analysis time *inside* the loop:
    # the original built it once before the loop, so stepping `date` back on
    # failure retried the identical URL on every attempt.
    URL = 'https://thredds.ucar.edu/thredds/dodsC/grib/NCEP/GFS/Global_0p5deg/GFS_Global_0p5deg_%04i%02i%02i_%02i%02i.grib2/GC' %\
    (date.year,date.month,date.day,6*(date.hour//6),0)
    print(URL)
    try:
        gfs = netCDF4.Dataset(URL)
        validURL = True
    # older netCDF4 versions raised RuntimeError on a failed open; newer
    # versions raise OSError — catch both so the retry loop still works
    except (RuntimeError, OSError):
        date -= datetime.timedelta(hours=6)
        ncount += 1

# In[15]:

# Look at metadata for a specific variable
# gfs.variables.keys() will show all available variables.
sfctmp = gfs.variables['Temperature_surface']
# get info about sfctmp
print(sfctmp)
# print coord vars associated with this variable
for dname in sfctmp.dimensions:
    print(gfs.variables[dname])

# ## Missing values
# - when `data == var.missing_value` somewhere, a masked array is returned.
# - illustrate with soil moisture data (only defined over land)
# - white areas on plot are masked values over water.

# In[16]:

soilmvar = gfs.variables['Volumetric_Soil_Moisture_Content_depth_below_surface_layer']
# flip the data in latitude so Northern Hemisphere is up on the plot
soilm = soilmvar[0,0,::-1,:]
print('shape=%s, type=%s, missing_value=%s' % \
(soilm.shape, type(soilm), soilmvar.missing_value))
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
cs = plt.contourf(soilm)

# ## Packed integer data
# There is a similar feature for variables with `scale_factor` and `add_offset` attributes.
#
# - short integer data will automatically be returned as float data, with the scale and offset applied.

# ## Dealing with dates and times
# - time variables usually measure relative to a fixed date using a certain calendar, with units specified like ***`hours since YY:MM:DD hh-mm-ss`***.
# - **`num2date`** and **`date2num`** convenience functions provided to convert between these numeric time coordinates and handy python datetime instances. # - **`date2index`** finds the time index corresponding to a datetime instance. # In[17]: from netCDF4 import num2date, date2num, date2index timedim = sfctmp.dimensions[0] # time dim name print('name of time dimension = %s' % timedim) times = gfs.variables[timedim] # time coord var print('units = %s, values = %s' % (times.units, times[:])) # In[18]: dates = num2date(times[:], times.units) print([date.strftime('%Y-%m-%d %H:%M:%S') for date in dates[:10]]) # print only first ten... # ###Get index associated with a specified date, extract forecast data for that date. # In[19]: from datetime import datetime, timedelta date = datetime.now() + timedelta(days=3) print(date) ntime = date2index(date,times,select='nearest') print('index = %s, date = %s' % (ntime, dates[ntime])) # ###Get temp forecast for Boulder (near 40N, -105W) # - use function **`getcloses_ij`** we created before... # In[20]: lats, lons = gfs.variables['lat'][:], gfs.variables['lon'][:] # lats, lons are 1-d. Make them 2-d using numpy.meshgrid. lons, lats = np.meshgrid(lons,lats) j, i = getclosest_ij(lats,lons,40,-105) fcst_temp = sfctmp[ntime,j,i] print('Boulder forecast valid at %s UTC = %5.1f %s' % \ (dates[ntime],fcst_temp,sfctmp.units)) # ##Simple multi-file aggregation # # What if you have a bunch of netcdf files, each with data for a different year, and you want to access all the data as if it were in one file? # In[21]: get_ipython().system('ls -ldgG data/prmsl*nc') # **`MFDataset`** uses file globbing to patch together all the files into one big Dataset. # You can also pass it a list of specific files. # # Limitations: # # - It can only aggregate the data along the leftmost dimension of each variable. # - only works with `NETCDF3`, or `NETCDF4_CLASSIC` formatted files. # - kind of slow. 
# In[22]:

# aggregate all matching files into one virtual Dataset along the
# (leftmost, unlimited) time dimension
mf = netCDF4.MFDataset('data/prmsl*nc')
times = mf.variables['time']
dates = num2date(times[:],times.units)
print('starting date = %s' % dates[0])
print('ending date = %s'% dates[-1])
prmsl = mf.variables['prmsl']
print('times shape = %s' % times.shape)
print('prmsl dimensions = %s, prmsl shape = %s' %\
(prmsl.dimensions, prmsl.shape))

# ## Closing your netCDF file
#
# It's good to close netCDF files, but not actually necessary when Dataset is open for read access only.
#

# In[23]:

f.close()
gfs.close()
# also close the aggregated dataset — an MFDataset keeps every member
# file open until it is closed (the original never closed it)
mf.close()

# ## That's it!
#
# Now you're ready to start exploring your data interactively.
#
# To be continued with **Writing netCDF data** ....