Hi Patrick,
The following code looks okay to me when I do:
python land.py file.nc
for the following files:
Euro44_bvv_nancil_CTL-BCJ_jules-vn5.2pmed_u-bb422GLTS_monmean_1979.nc
JULES-GL7.0.vn5.2_sif1.CRUNCEPv7SLURM.S2.Annual.1869.nc
JULES-GL7.0.vn5.2_sif1.CRUNCEPv7SLURM.S2.Monthly.1869.nc
JULES-GL7.0.vn5.2_sif1.CRUNCEPv7SLURM.S2.sif_vars.1869.nc
The output uses around 30% less disk space than the original file. The output is CF compliant I believe as per 8.2. Compression by Gathering.
cf-python will read in the data and make it into a normal 2D or 3D dataset. There is an issue with slowness in the reading that David will fix in cf-python 3.15.2. Iris doesn’t understand this CF data but then we all know iris isn’t as rigorous in it’s interpretation of CF as cf-python is.
Happy to give you a demo on my PC and chat some more in person if needed.
import netCDF4 as nc
import numpy as np
import sys
# Initiate the pvars class
# This is used for storing netCDF variables
class pvars(object):
'''The pvars object is used for storing plotting variables in plotvars'''
def __init__(self, **kwargs):
# Initialize a new Pvars instance
for attr, value in kwargs.items():
setattr(self, attr, value)
def __str__(self):
'''x.__str__() <==> str(x)'''
a = None
v = None
out = ['%s = %s' % (a, repr(v))]
for a, v in self.__dict__.items():
return '\n'.join(out)
ncvars = pvars()
inputds = nc.Dataset(sys.argv[1], 'r')
# Allocate the longitude and latitude arrays
lons_data = inputds.variables['longitude'][:].squeeze()
lats_data = inputds.variables['latitude'][:].squeeze()
# Size of the data arrays
lons_unique = np.unique(lons_data)
lats_unique = np.unique(lats_data)
lonsize = np.size(lons_unique)
latsize = np.size(lats_unique)
nlandpoints = np.size(lons_data)
# Create sets of regularly spaced lons and lats
lons_regular = np.arange(lonsize) * (lons_unique[1] - lons_unique[0]) + np.min(lons_unique)
lats_regular = np.arange(latsize) * (lats_unique[1] - lats_unique[0]) + np.min(lats_unique)
lons_regular_list = list(lons_regular)
lats_regular_list = list(lats_regular)
# Create a land-sea mask array
lsmask = np.zeros([latsize, lonsize], dtype=int)
# Find the data position in the mask array and allocate the land points
for i in np.arange(len(lons_data)):
xpos = lons_regular_list.index(lons_data[i])
ypos = lats_regular_list.index(lats_data[i])
lsmask[ypos, xpos] = 1
# Flatten the mask to a vector of points and
# find the points where the mask is non zero
lsmask = lsmask.flatten()
landpts = lsmask.nonzero()
# Edit vars list to leave only the list of fields
vars = list(inputds.variables)
rmvars = ['time', 'time_bnds', 'time_bounds', 'longitude', 'latitude', 'time_bounds']
for rmvar in rmvars:
try:
vars.remove(rmvar)
except:
pass
# Calculate which dimensionsless dims need to be added
dimensionless_dims = []
for var in vars:
vardims = inputds.variables[var].dimensions
for vardim in vardims:
dimensionless_dims.append(vardim)
dimensionless_dims = list(np.unique(dimensionless_dims))
rmdims = ['time', 'bnds', 'x', 'y']
for rmdim in rmdims:
try:
dimensionless_dims.remove(rmdim)
except:
pass
dimensionless_dims_dict = {}
for dimensionless_dim in dimensionless_dims:
if dimensionless_dim not in dimensionless_dims_dict:
for var in vars:
mvars = inputds.variables[var].dimensions
if dimensionless_dim in mvars:
myind = mvars.index(dimensionless_dim)
mysize = list(np.shape(inputds.variables[var]))[myind]
dimensionless_dims_dict[dimensionless_dim] = mysize
time_data = inputds.variables['time'][:]
# Time must have just one dimension
if np.ndim(time_data) > 1:
time_data = time_data.squeeze()
# Writing the data
outputds = nc.Dataset('converted.nc', 'w', format='NETCDF4')
# Create the dimensions
lon = outputds.createDimension('lon', lonsize)
lat = outputds.createDimension('lat', latsize)
landpoint = outputds.createDimension('landpoint', nlandpoints)
time = outputds.createDimension('time', None)
# Create the dimensionless dims and variables and set the data
for ddim in dimensionless_dims_dict:
setattr(ncvars, ddim, outputds.createDimension(ddim, dimensionless_dims_dict[ddim]))
setattr(ncvars, ddim + 's', outputds.createVariable(ddim, 'i4', tuple([ddim])))
getattr(ncvars, ddim + 's')[:] = np.arange(dimensionless_dims_dict[ddim])
# Create the variables
lons = outputds.createVariable('lon', 'f4', ('lon',))
lons.standard_name='longitude'
lons.units = 'degrees_east'
lats = outputds.createVariable('lat', 'f4', ('lat',))
lats.standard_name = 'latitude'
lats.units = 'degrees_north'
landpoint = outputds.createVariable('landpoint', 'i4', ('landpoint',))
landpoint.compress = "lat lon"
times = outputds.createVariable('time', 'f4', ('time',))
times.standard_name = inputds.variables['time'].standard_name
times.units = inputds.variables['time'].units
times.calendar = inputds.variables['time'].calendar
for i in np.arange(len(vars)):
var = vars[i]
print(var + ' ' + str(i + 1) + ' of ' + str(len(vars)))
ndims = np.ndim(inputds.variables[var][:].squeeze())
attrs = inputds.variables[var].ncattrs()
# Form the _FillValue / fill_value dictionary as this must be set at variable creation time
myfill ={}
if '_FillValue' in attrs:
myfill['fill_value'] = getattr(inputds.variables[var], '_FillValue')
attrs.remove('_FillValue')
# Create a tuple of the variable dimensions
# The last two are 'y', 'x' which translates to 'landpoint'
# Code assumes there are two or more dimensions
vardims = list(inputds.variables[var].dimensions)[:-2]
vardims.append('landpoint')
vardims = tuple(vardims)
# Create the variable
setattr(ncvars, var, outputds.createVariable(var, 'f4', vardims, compression='zlib', **myfill))
for attr in attrs:
setattr(getattr(ncvars, var), attr, getattr(inputds.variables[var], attr))
# Add the data to the new variable
# We remove the y dimension of the input field as that doesn't exist in the new data
data = np.squeeze(inputds.variables[var][:], -2)
getattr(ncvars, var)[:] = data
landpoint[:] = landpts
lons[:] = lons_regular
lats[:] = lats_regular
times[:] = time_data
# Close the converted netCDF file
outputds.close()