#!/usr/bin/env python
# coding: utf-8

# # Writing netCDF data
#
# **Important Note**: when running this notebook interactively in a browser, you probably will not be able to execute individual cells out of order without getting an error. Instead, choose "Run All" from the Cell menu after you modify a cell.

# In[25]:


import netCDF4     # Note: python is case-sensitive!
import numpy as np


# ## Opening a file, creating a new Dataset
#
# Let's create a new, empty netCDF file named 'data/new.nc', opened for writing.
#
# Be careful: opening a file with `mode='w'` will clobber any existing data (unless `clobber=False` is used, in which case an exception is raised if the file already exists).
#
# - `mode='r'` is the default.
# - `mode='a'` opens an existing file and allows appending (does not clobber existing data).
# - `format` can be one of `NETCDF3_CLASSIC`, `NETCDF3_64BIT`, `NETCDF4_CLASSIC` or `NETCDF4` (the default). `NETCDF4_CLASSIC` uses HDF5 for the underlying storage layer (as does `NETCDF4`) but enforces the classic netCDF 3 data model, so the data can be read with older clients.

# In[26]:


try:
    ncfile.close()  # just to be safe, make sure dataset is not already open.
except Exception:
    pass            # ncfile does not exist yet, or is already closed
ncfile = netCDF4.Dataset('data/new.nc', mode='w', format='NETCDF4_CLASSIC')
print(ncfile)


# ## Creating dimensions
#
# The **ncfile** object we created is a container for _dimensions_, _variables_, and _attributes_. First, let's create some dimensions using the [`createDimension`](http://unidata.github.io/netcdf4-python/netCDF4.Dataset-class.html#createDimension) method.
#
# - Every dimension has a name and a length.
# - The name is a string: it specifies which dimension to use when creating a variable, and serves as the key to the dimension object in the `ncfile.dimensions` dictionary.
#
# Setting the dimension length to `0` or `None` makes it unlimited, so it can grow.
#
# - For `NETCDF4` files, any variable's dimension can be unlimited.
# - For `NETCDF4_CLASSIC` and `NETCDF3*` files, only one dimension per file can be unlimited, and it must be the leftmost (slowest varying) dimension of any variable that uses it.

# In[27]:


lat_dim = ncfile.createDimension('lat', 73)      # latitude axis
lon_dim = ncfile.createDimension('lon', 144)     # longitude axis
time_dim = ncfile.createDimension('time', None)  # unlimited axis (can be appended to).
for dim in ncfile.dimensions.items():
    print(dim)


# ## Creating attributes
#
# netCDF attributes can be set just as you would set attributes on any python object.
#
# - It is best to adhere to established conventions (like the [CF](http://cfconventions.org/) conventions).
# - We won't try to adhere to any specific convention here though.

# In[28]:


ncfile.title = 'My model data'
print(ncfile.title)


# Try adding some more attributes...
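# For example, a couple of global attributes using common CF-style names (`history` and `source` are conventional choices, but these particular values are just illustrative):

# In[ ]:


ncfile.history = 'Created for the writing-netCDF tutorial'  # provenance note
ncfile.source = 'synthetic random data'                     # description of the data origin
print(ncfile.history, '|', ncfile.source)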
# ## Creating variables
#
# Now let's add some variables and store some data in them.
#
# - A variable has a name, a type, a shape, and some data values.
# - The shape of a variable is specified by a tuple of dimension names.
# - A variable should also have some named attributes, such as 'units', that describe the data.
#
# The [`createVariable`](http://unidata.github.io/netcdf4-python/netCDF4.Dataset-class.html#createVariable) method takes 3 mandatory arguments:
#
# - the 1st argument is the variable name (a string). This is used as the key to access the variable object from the `variables` dictionary.
# - the 2nd argument is the datatype (most numpy datatypes are supported).
# - the 3rd argument is a tuple containing the dimension names (the dimensions must be created first). Unless this is a `NETCDF4` file, any unlimited dimension must be the leftmost one.
# - there are also lots of optional arguments (many of which are only relevant when `format='NETCDF4'`) to control compression, chunking, fill_value, etc.

# In[29]:


# Define two variables with the same names as dimensions,
# a conventional way to define "coordinate variables".
lat = ncfile.createVariable('lat', np.float32, ('lat',))
lat.units = 'degrees_north'
lat.long_name = 'latitude'
lon = ncfile.createVariable('lon', np.float32, ('lon',))
lon.units = 'degrees_east'
lon.long_name = 'longitude'
time = ncfile.createVariable('time', np.float64, ('time',))
time.units = 'hours since 1800-01-01'
time.long_name = 'time'
# Define a 3D variable to hold the data
temp = ncfile.createVariable('temp', np.float64, ('time','lat','lon'))  # note: unlimited dimension is leftmost
temp.units = 'K'  # kelvin
temp.standard_name = 'air_temperature'  # this is a CF standard name
print(temp)


# ## Pre-defined variable attributes (read only)
#
# The netCDF4 module provides some useful pre-defined Python attributes for netCDF variables, such as `dimensions`, `shape`, `dtype` and `ndim`.
#
# Note: since no data has been written yet, the length of the 'time' dimension is 0.

# In[30]:


print("-- Some pre-defined attributes for variable temp:")
print("temp.dimensions:", temp.dimensions)
print("temp.shape:", temp.shape)
print("temp.dtype:", temp.dtype)
print("temp.ndim:", temp.ndim)


# ## Writing data
#
# To write data to a netCDF variable object, just treat it like a numpy array and assign values to a slice.

# In[31]:


nlats = len(lat_dim); nlons = len(lon_dim); ntimes = 3
# Write latitudes, longitudes.
# Note: the ":" is necessary in these "write" statements
lat[:] = -90. + (180./(nlats-1))*np.arange(nlats)  # south pole to north pole, 2.5 degree spacing
lon[:] = (360./nlons)*np.arange(nlons)             # Greenwich meridian eastward, 2.5 degree spacing
# create a 3D array of random numbers
data_arr = np.random.uniform(low=280, high=330, size=(ntimes,nlats,nlons))
# Write the data. This writes the whole 3D netCDF variable all at once.
temp[:,:,:] = data_arr  # Appends data along unlimited dimension
print("-- Wrote data, temp.shape is now ", temp.shape)
# read data back from variable (by slicing it), print min and max
print("-- Min/Max values:", temp[:,:,:].min(), temp[:,:,:].max())


# - You can just treat a netCDF Variable object like a numpy array and assign values to it.
# - Variables automatically grow along unlimited dimensions (unlike numpy arrays).
# - The above writes the whole 3D variable all at once, but you can write it a slice at a time instead.
#
# Let's add another time slice....

# In[32]:


# create a 2D array of random numbers
data_slice = np.random.uniform(low=280, high=330, size=(nlats,nlons))
temp[3,:,:] = data_slice  # Appends the 4th time slice
print("-- Wrote more data, temp.shape is now ", temp.shape)


# Note that we have not yet written any data to the time variable. It automatically grew as we appended data along the time dimension to the variable `temp`, but the data is missing.

# In[33]:


print(time)
times_arr = time[:]
print(type(times_arr), times_arr)  # dashes indicate masked values (where data has not yet been written)
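# A minimal sketch of checking for those unwritten values programmatically (`numpy.ma` is ordinary numpy masked-array machinery, nothing netCDF-specific):

# In[ ]:


import numpy.ma as ma
# the slice comes back as a masked array; elements stay masked until values are written
print(ma.is_masked(times_arr))  # True while no time data has been written
print(times_arr.mask)           # the boolean mask itself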
# Let's write some data into the time variable.
#
# - Given a set of datetime instances, use date2num to convert to numeric time values and then write that data to the variable.

# In[34]:


from datetime import datetime
from netCDF4 import date2num, num2date
# 1st 4 days of October.
dates = [datetime(2014,10,1,0), datetime(2014,10,2,0), datetime(2014,10,3,0), datetime(2014,10,4,0)]
print(dates)
times = date2num(dates, time.units)
print(times, time.units)  # numeric values
time[:] = times
# read time data back, convert to datetime instances, check values.
print(num2date(time[:], time.units))


# ## Closing a netCDF file
#
# It's **important** to close a netCDF file you opened for writing:
#
# - it flushes buffers to make sure all the data gets written
# - it releases memory resources used by open netCDF files

# In[35]:


# first print the Dataset object to see what we've got
print(ncfile)
# close the Dataset.
ncfile.close(); print('Dataset is closed!')


# # Advanced features
#
# So far we've only exercised features associated with the old netCDF version 3 data model. netCDF version 4 adds a lot of new functionality that comes with the more flexible HDF5 storage layer.
#
# Let's create a new file with `format='NETCDF4'` so we can try out some of these features.

# In[36]:


ncfile = netCDF4.Dataset('data/new2.nc', 'w', format='NETCDF4')
print(ncfile)


# ## Creating Groups
#
# netCDF version 4 added support for organizing data in hierarchical groups.
#
# - Groups are analogous to directories in a filesystem.
# - Groups serve as containers for variables, dimensions and attributes, as well as other groups.
# - A `netCDF4.Dataset` creates a special group, called the 'root group', which is similar to the root directory in a unix filesystem.
# - Groups are created using the [`createGroup`](http://unidata.github.io/netcdf4-python/netCDF4.Dataset-class.html#createGroup) method, which takes a single argument (a string, the name of the group). This string is used as the key to access the group instance in the `groups` dictionary.
#
# Here we create two groups to hold data for two different model runs.

# In[37]:


grp1 = ncfile.createGroup('model_run1')
grp2 = ncfile.createGroup('model_run2')
for grp in ncfile.groups.items():
    print(grp)


# Create some dimensions in the root group.

# In[38]:


lat_dim = ncfile.createDimension('lat', 73)      # latitude axis
lon_dim = ncfile.createDimension('lon', 144)     # longitude axis
time_dim = ncfile.createDimension('time', None)  # unlimited axis (can be appended to).


# Now create a variable in grp1 and grp2. The library will search recursively upwards in the group tree to find the dimensions (which in this case are defined one level up).
#
# - These variables are created with **zlib compression**, another nifty feature of netCDF 4.
# - The data are automatically compressed when written to the file, and uncompressed when read back.
# - This can really save disk space, especially when used in conjunction with the [**least_significant_digit**](http://unidata.github.io/netcdf4-python/netCDF4.Dataset-class.html#createVariable) keyword argument, which causes the data to be quantized (truncated) before compression. This makes the compression lossy, but more efficient (see the sketch after the next cell).

# In[39]:


temp1 = grp1.createVariable('temp', np.float64, ('time','lat','lon'), zlib=True)
temp2 = grp2.createVariable('temp', np.float64, ('time','lat','lon'), zlib=True)
for grp in ncfile.groups.items():  # shows that each group now contains 1 variable
    print(grp)
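# As a sketch of the quantization option mentioned above (the variable name `temp_quantized` and the parameter values are illustrative, not part of the original example):

# In[ ]:


# keep ~1 decimal digit of precision, then zlib-compress the quantized values
temp_q = grp1.createVariable('temp_quantized', np.float64, ('time','lat','lon'),
                             zlib=True, complevel=4, least_significant_digit=1)
print(temp_q)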
# ## Creating a variable with a compound data type
#
# - Compound data types map directly to numpy structured (a.k.a. 'record') arrays.
# - Structured arrays are akin to C structs, or derived types in Fortran.
# - They allow for the construction of table-like structures composed of combinations of other data types, including other compound types.
# - They might be useful for representing multiple parameter values at each point on a grid, or at each time and space location for scattered (point) data.
#
# Here we create a variable with a compound data type to represent complex data (there is no native complex data type in netCDF).
#
# - The compound data type is created with the [`createCompoundType`](http://unidata.github.io/netcdf4-python/netCDF4.Dataset-class.html#createCompoundType) method.

# In[40]:


# create complex128 numpy structured data type
complex128 = np.dtype([('real',np.float64),('imag',np.float64)])
# using this numpy dtype, create a netCDF compound data type object
# the string name can be used as a key to access the datatype from the cmptypes dictionary.
complex128_t = ncfile.createCompoundType(complex128, 'complex128')
# create a variable with this data type, write some data to it.
cmplxvar = grp1.createVariable('cmplx_var', complex128_t, ('time','lat','lon'))
# write some data to this variable
# first create some complex random data
nlats = len(lat_dim); nlons = len(lon_dim)
data_arr_cmplx = np.random.uniform(size=(nlats,nlons)) + 1.j*np.random.uniform(size=(nlats,nlons))
# write this complex data to a numpy complex128 structured array
data_arr = np.empty((nlats,nlons), complex128)
data_arr['real'] = data_arr_cmplx.real; data_arr['imag'] = data_arr_cmplx.imag
cmplxvar[0] = data_arr  # write the data to the variable (appending to time dimension)
print(cmplxvar)
data_out = cmplxvar[0]  # read one time slice of data back from the variable
print(data_out.dtype, data_out.shape, data_out[0,0])


# ## Creating a variable with a variable-length (vlen) data type
#
# netCDF 4 has support for variable-length or "ragged" arrays. These are arrays of variable-length sequences having the same type.
#
# - To create a variable-length data type, use the [`createVLType`](http://unidata.github.io/netcdf4-python/netCDF4.Dataset-class.html#createVLType) method.
# - The numpy datatype of the variable-length sequences and the name of the new datatype must be specified.

# In[41]:


vlen_t = ncfile.createVLType(np.int64, 'phony_vlen')


# A new variable can then be created using this datatype.

# In[42]:


vlvar = grp2.createVariable('phony_vlen_var', vlen_t, ('time','lat','lon'))


# Since there is no native vlen datatype in numpy, vlen arrays are represented in python as object arrays (arrays of dtype `object`).
#
# - These are arrays whose elements are Python object pointers, and they can contain any type of python object.
# - For this application, they must contain 1-D numpy arrays all of the same type but of varying length.
# - Fill with 1-D random numpy int64 arrays of random length between 1 and 10.

# In[43]:


vlen_data = np.empty((nlats,nlons), object)
for i in range(nlons):
    for j in range(nlats):
        size = np.random.randint(1, 11)  # random sequence length between 1 and 10 (inclusive)
        vlen_data[j,i] = np.random.randint(0, 10, size=size, dtype=np.int64)  # generate a random sequence
vlvar[0] = vlen_data  # append along unlimited dimension (time)
print(vlvar)
print('data =\n', vlvar[:])


# Close the Dataset and examine the contents with ncdump.

# In[44]:


ncfile.close()
get_ipython().system('ncdump -h data/new2.nc')
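# The same contents can be checked from Python by reopening the file read-only; slash-separated path indexing into groups (`nc['model_run1/temp']`) is a standard netCDF4-python feature:

# In[ ]:


nc = netCDF4.Dataset('data/new2.nc')  # mode='r' is the default
print(nc['model_run1/temp'])          # same as nc.groups['model_run1'].variables['temp']
nc.close()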
# ## Other interesting and useful projects using netcdf4-python
#
# - [xarray](https://xarray.pydata.org/en/stable/): N-dimensional variant of the core [pandas](https://pandas.pydata.org) data structure that can operate on netcdf variables.
# - [Iris](https://scitools.org.uk/iris/docs/latest/): a data model to create a data abstraction layer which isolates analysis and visualisation code from data format specifics. Uses netcdf4-python to access netcdf data (can also handle GRIB).
# - [Dask](https://dask.org/): virtual large arrays (built from netcdf variables, among other sources) with lazy evaluation.
# - [cf-python](https://cfpython.bitbucket.io/): implements the [CF](http://cfconventions.org) data model for the reading, writing and processing of data and metadata.
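# As a quick taste of the first project in that list (a minimal sketch, assuming xarray is installed; `open_dataset` is its standard entry point):

# In[ ]:


import xarray as xr
ds = xr.open_dataset('data/new.nc')  # the file written earlier in this notebook
print(ds['temp'])                    # labelled, pandas-style view of the temp variable
ds.close()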