import matplotlib.pyplot as plt
import numpy as np

import iris
import iris.analysis
import iris.coords
import iris.cube
import iris.quickplot as qplt

print('Iris version: {}'.format(iris.__version__))

obs_filepath = './example_data/EN3_v2a_Profiles_195001.nc'


def ensure_full_mask(array):
    """Return a MaskedArray version of the data with a full mask array."""
    full_mask_array = np.ma.getmaskarray(array)
    return np.ma.MaskedArray(array, mask=full_mask_array)


def load_en3_variable(var_name):
    """
    Load a named data variable from the EN3 file as an Iris cube with a
    masked data array.

    """
    # Load cube from nc variable.
    cube = iris.load_cube(obs_filepath, var_name)
    # Force data to be a masked array, with a fully expanded mask array.
    cube.data = ensure_full_mask(cube.data)
    # Also infer a data mask if there is a '_fillvalue' attribute.  This
    # _ought_ to be automatic with netCDF4-python, but some EN3 files seem to
    # have a mis-spelling here (it should be '_FillValue', with capitals).
    if '_fillvalue' in cube.attributes:
        mdi = cube.attributes['_fillvalue']
        cube.data.mask |= cube.data == mdi
        cube.data.set_fill_value(mdi)
    return cube


def load_en3_with_quality_mask(main_var_name, qc_var_name, qc_max_valid=9):
    """
    Load a named data variable from the EN3 file as an Iris cube.

    Generates a masked data cube, in which data is also masked where the
    related QC value exceeds a given threshold.

    Args:

    * main_var_name (string):
        the data variable long_name within the netCDF file.

    * qc_var_name (string):
        name of the related QC variable.

    Kwargs:

    * qc_max_valid (int):
        Threshold value for QC data.  QC values larger than this result in
        the datapoint being masked.

    """
    # Load main and QC data values.
    cube = load_en3_variable(main_var_name)
    qc_data = load_en3_variable(qc_var_name).data
    # Missing QC data equates to okay.
    qc_data.set_fill_value(qc_max_valid)
    qc_data = qc_data.filled()
    # Convert to numbers.
    qc_data = np.array(qc_data, dtype=int)
    # A QC figure that is *too large* means the datapoint is invalid.
    cube.data.mask |= qc_data > qc_max_valid
    return cube

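# --- Minimal illustration (not part of the original workflow): the masking
# pattern used by load_en3_variable(), applied to a toy array.  The fill
# value of 99999.0 here is purely hypothetical; real EN3 files supply theirs
# via the '_fillvalue' attribute.
demo_mdi = 99999.0
demo = ensure_full_mask(np.array([5.0, demo_mdi, 7.0]))
demo.mask |= demo == demo_mdi
demo.set_fill_value(demo_mdi)
print('demo masked values: {}'.format(demo))  # -> [5.0 -- 7.0]
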
# Get the main data (potential temperatures), applying quality levels.
potm = load_en3_with_quality_mask('corrected pot. temp',
                                  'quality on pot. temperature',
                                  qc_max_valid=2)

# Get depth and location info.
depth = load_en3_with_quality_mask('corrected depth', 'quality on depth')
longitude = load_en3_with_quality_mask(
    'longitude of the station, best estimated value',
    'quality on position (latitude and longitude)',
    qc_max_valid=1)
latitude = load_en3_with_quality_mask(
    'latitude of the station, best estimated value',
    'quality on position (latitude and longitude)',
    qc_max_valid=1)

# Get the time reference + convert it to a units string.
reftime = iris.load_cube(obs_filepath, 'date of reference for julian days')
reftime_str = ''.join(reftime.data)
assert len(reftime_str) == 14
ref_unit_str = 'days since {:4s}-{:2s}-{:2s} {:2s}:{:2s}:{:2s}'.format(
    reftime_str[:4], reftime_str[4:6], reftime_str[6:8],
    reftime_str[8:10], reftime_str[10:12], reftime_str[12:14])

# Get the time data.
time = load_en3_with_quality_mask(
    'julian day (utc) of the location relative to reference_date_time',
    'quality on date and time')
time.units = ref_unit_str

# Show some results.
print('\npotm cube:\n', potm)
print('\ndepth cube:\n', depth)
print('\nlongitude cube:\n', longitude)
print('\ntime cube:\n', time)

# Define the target grid.
z_min, z_max, dz = 0.0, 500.0, 50.0
y_min, y_max, dy = -90.0, 90.0, 2.0
x_min, x_max, dx = -180.0, 180.0, 3.0
nz = int((z_max - z_min) / dz)
ny = int((y_max - y_min) / dy)
nx = int((x_max - x_min) / dx)


def aggregate_to_grid(phenom, aggregator=iris.analysis.MEAN, **agg_kwargs):
    # Regrid a data cube onto the grid previously specified (by 'nx', 'dx',
    # 'x_min', 'x_max' and the equivalents in y and z).
    #
    # For this, each datapoint is assigned to a gridcell 'box' according to
    # its corresponding coordinate values (depth, longitude, latitude).
    #
    # N.B. agg_kwargs is passed to the aggregator operation.

    # Make a 1-D 'flattened' version of the source data cube
    # (as 1-D coordinates are required by the 'aggregated_by' operation).
    cube_flat = iris.cube.Cube(phenom.data.flat[:])
    cube_flat.metadata = phenom.metadata

    # Make coordinates containing gridcell indices for each grid coord value.
    def add_index_coord(coord_points, coord_name, start, step):
        # Calculate gridcell indices of all the points.
        cell_indices = np.floor((coord_points - start) / step)
        # Make these all integers, for eventual use as array indices.
        cell_indices = np.array(cell_indices, dtype=int)
        # Add these as a "categorical coord" to aggregate by.
        cube_flat.add_aux_coord(
            iris.coords.AuxCoord(cell_indices, long_name=coord_name), 0)

    # Add a coordinate containing gridcell indices in the z-dimension (depth).
    add_index_coord(depth.data.flat[:], 'i_depth', z_min, dz)

    # Repeat for lats and lons -- except these need broadcasting to 2-D first.
    lons_2d, _ = np.broadcast_arrays(longitude.data[:, None], phenom.data)
    lats_2d, _ = np.broadcast_arrays(latitude.data[:, None], phenom.data)
    add_index_coord(lons_2d.flat[:], 'i_lon', x_min, dx)
    add_index_coord(lats_2d.flat[:], 'i_lat', y_min, dy)

    # Aggregate the data to get a statistical result for each 'inhabited'
    # gridcell.
    result_cells = cube_flat.aggregated_by(('i_depth', 'i_lat', 'i_lon'),
                                           aggregator=aggregator,
                                           **agg_kwargs)

    # Make a full-grid result cube.
    # N.B. metadata comes from the aggregation (includes units + cell_method).
    full_data_empty = np.ma.MaskedArray(
        np.zeros((nz, ny, nx), dtype=result_cells.data.dtype),
        mask=True)
    result_cube = iris.cube.Cube(full_data_empty)
    result_cube.metadata = result_cells.metadata

    # Assign aggregation results to the appropriate gridcells ...
    i_z, i_y, i_x = [result_cells.coord(coord_name).points
                     for coord_name in ('i_depth', 'i_lat', 'i_lon')]
    # ... but discarding results that lie outside the target grid.
    i_ok = np.where((i_z >= 0) & (i_z < nz) &
                    (i_y >= 0) & (i_y < ny) &
                    (i_x >= 0) & (i_x < nx))
    i_z, i_y, i_x = i_z[i_ok], i_y[i_ok], i_x[i_ok]
    result_cube.data[i_z, i_y, i_x] = result_cells.data[i_ok]

    # Add DimCoords defining the grid.
    result_cube.add_dim_coord(iris.coords.DimCoord.from_regular(
        z_min - 0.5 * dz, dz, nz, with_bounds=True,
        standard_name='depth', units='metres'), 0)
    result_cube.add_dim_coord(iris.coords.DimCoord.from_regular(
        y_min - 0.5 * dy, dy, ny, with_bounds=True,
        standard_name='latitude', units='degrees'), 1)
    result_cube.add_dim_coord(iris.coords.DimCoord.from_regular(
        x_min - 0.5 * dx, dx, nx, with_bounds=True,
        standard_name='longitude', units='degrees'), 2)

    return result_cube


# Regrid the main data to get a gridded mean, std-dev and count.
potm_mean = aggregate_to_grid(potm)
potm_std_dev = aggregate_to_grid(potm, iris.analysis.STD_DEV)
potm_counts = aggregate_to_grid(
    potm, iris.analysis.COUNT,
    function=lambda points: np.ones(points.shape, dtype=bool))
# NOTE: the 'function' arg looks a bit odd, as it must return an *array* of
# bools.

# Add occupancy 'count > 3' as an extra condition for valid data on the others.
invalid_counts = potm_counts.data <= 3
potm_mean.data.mask |= invalid_counts
potm_std_dev.data.mask |= invalid_counts

# Expand the times to 2-D so we can use the same code to get min+max times.
time_data2d, _ = np.broadcast_arrays(time.data[:, None], potm.data)
time2d = iris.cube.Cube(time_data2d)
time2d.metadata = time.metadata

# Add a bounded time coordinate to all the output cubes.
time_min = aggregate_to_grid(time2d, iris.analysis.MIN)
time_max = aggregate_to_grid(time2d, iris.analysis.MAX)
time_centres = 0.5 * (time_min.data + time_max.data)
time_bounds = np.concatenate((time_min.data[..., None],
                              time_max.data[..., None]), axis=-1)
time_coord = iris.coords.AuxCoord(time_centres, bounds=time_bounds,
                                  units=time.units, standard_name='time')
potm_mean.add_aux_coord(time_coord, (0, 1, 2))
potm_std_dev.add_aux_coord(time_coord, (0, 1, 2))
potm_counts.add_aux_coord(time_coord, (0, 1, 2))

# Save the result cubes to netCDF.
iris.save((potm_mean, potm_std_dev, potm_counts), 'temp_sparse_regrid.nc')

# Plot the mean result at the top level.
plt.figure(figsize=(12, 8))
qplt.pcolormesh(potm_mean[0])
plt.gca().coastlines()
plt.show()

print(potm_mean[0])
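
# --- Minimal sketch (not part of the original workflow): the core trick used
# by aggregate_to_grid(), i.e. collapsing points that share integer 'gridcell
# index' coordinates with Cube.aggregated_by().  The values and cell indices
# below are made up purely for illustration.
demo_cube = iris.cube.Cube(np.array([1.0, 2.0, 3.0, 10.0, 20.0]),
                           long_name='demo_values')
demo_cube.add_aux_coord(
    iris.coords.AuxCoord(np.array([0, 0, 0, 1, 1]), long_name='i_cell'), 0)
demo_means = demo_cube.aggregated_by('i_cell', iris.analysis.MEAN)
print(demo_means.data)  # -> [ 2. 15.]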