Download: https://osdyn.ifremer.fr/pyweb/notebooks/utils/mxarray.ipynb

Dataset from single or multiple NetCDF files

Note: A shorter way to read the files is available for each class of models

[1]:
import os
from osdyn.config import get_config_value
from osdyn.utils.data.mxarray import get_dataset, tzyx2index, auto_merge
from osdyn.utils.data.io import list_files

MARS outputs

Concat over time

[2]:
# Get the list of files according to the period
path = get_config_value("osdyn.grcm.mars", "path_v9_6")
pattern_file = get_config_value("osdyn.grcm.mars", "pattern_file_v9_6")
pattern_date = ("2013-01-30","2013-02-01 10:00:00")  # NOTE: plotting this long period is very slow
# Shorter alternative period for quick tests:
#pattern_date = ("2013-01-30", "2013-01-30 10:00:00")
infiles = list_files(os.path.join(path, pattern_file), pattern_date)
Found 21 files
[3]:
# Get XE and TEMP and concatenate them along the time dimension
out = get_dataset(infiles, varnames=['XE','TEMP'], gather_unique_dim='time')
[4]:
fig = out.XE[:,100,100].plot()
../../_images/notebooks_utils_mxarray_7_0.png

Very slow; investigate how this can be improved

Concat over time and select a subdomain

[5]:
# Get the selection: convert the target lon/lat point into grid indices
# for the 'TEMP' variable, using the first file as a reference
tzyx = tzyx2index(infiles[0], 'TEMP', lons=(6.,6.), lats=(43.,43.))
[6]:
tzyx
[6]:
{'nj': (324, 325), 'ni': (413, 414)}
[7]:
# Read the files, restricted to the subdomain computed above,
# decoding the time axis into datetime64 values
out = get_dataset(infiles, varnames=['XE','TEMP'], subdomain=tzyx, decode_times=True)
[8]:
out
[8]:
<xarray.Dataset>
Dimensions:      (time: 21, nj: 1, ni: 1, level: 60, ni_u: 1, nj_u: 1, ni_v: 1, nj_v: 1, ni_f: 1, nj_f: 1, level_w: 61)
Coordinates: (12/19)
  * time         (time) datetime64[ns] 2013-01-30 ... 2013-01-30T20:00:00
  * ni           (ni) float32 413.0
  * nj           (nj) float32 324.0
    latitude     (nj, ni) float64 43.01
    longitude    (nj, ni) float64 6.007
  * level        (level) float32 -0.9917 -0.975 -0.9583 ... -0.025 -0.008333
    ...           ...
    longitude_v  (nj_v, ni_v) float64 6.007
    latitude_u   (nj_u, ni_u) float64 43.01
    latitude_v   (nj_v, ni_v) float64 43.01
    longitude_f  (nj_f, ni_f) float64 6.015
    latitude_f   (nj_f, ni_f) float64 43.01
  * level_w      (level_w) float32 -1.0 -0.9833 -0.9667 ... -0.01667 0.0
Data variables: (12/20)
    XE           (time, nj, ni) float32 dask.array<chunksize=(1, 1, 1), meta=np.ndarray>
    TEMP         (time, level, nj, ni) float32 dask.array<chunksize=(1, 60, 1, 1), meta=np.ndarray>
    dx_f         (nj_f, ni_f) float64 1.182e+03
    dx_u         (nj_u, ni_u) float64 1.182e+03
    dx_v         (nj_v, ni_v) float64 1.182e+03
    dx           (nj, ni) float64 1.182e+03
    ...           ...
    hc           (nj, ni) float32 5.0
    b            float64 0.2
    theta        float64 6.0
    H0           (nj, ni) float32 716.5
    HX           (nj_u, ni_u) float32 693.8
    HY           (nj_v, ni_v) float32 647.2

NEMO outputs

Concat variables (from different files) over time, and add the grid file

[9]:
# Get the list of files
path = get_config_value("osdyn.grcm.nemo", "path_medrys1v1")
pattern_file = get_config_value("osdyn.grcm.nemo",
                                "pattern_medrys1v1")
# Parse the period stored as a Python literal (a tuple of date strings)
# in the configuration. ast.literal_eval only accepts literals, unlike
# eval() which would execute arbitrary code coming from the config file.
import ast
pattern_date = ast.literal_eval(
    get_config_value("osdyn.grcm.nemo", "period_medrys1v1")
)
infiles = list_files(os.path.join(path, pattern_file), pattern_date)
Found 45 files
[10]:
# Read and gather the files into a unique one.
# The file list is passed directly: the previous `infiles[:]` copy looked
# like a leftover from slicing experiments and served no purpose.
out = get_dataset(infiles, decode_times=False,
                  gather_grid=['grid2D', 'gridS', 'gridT', 'gridU', 'gridV'])
[11]:
out
[11]:
<xarray.Dataset>
Dimensions:       (y: 264, x: 567, deptht: 75, time_counter: 9)
Coordinates:
    nav_lon       (y, x) float32 dask.array<chunksize=(264, 567), meta=np.ndarray>
    nav_lat       (y, x) float32 dask.array<chunksize=(264, 567), meta=np.ndarray>
  * deptht        (deptht) float32 0.5165 1.622 2.848 ... 4.683e+03 4.817e+03
  * time_counter  (time_counter) float64 6.392e+08 6.393e+08 ... 6.399e+08
  * x             (x) int32 1 2 3 4 5 6 7 8 ... 560 561 562 563 564 565 566 567
  * y             (y) int32 1 2 3 4 5 6 7 8 ... 257 258 259 260 261 262 263 264
Data variables: (12/17)
    vosaline      (time_counter, deptht, y, x) float32 dask.array<chunksize=(1, 75, 264, 567), meta=np.ndarray>
    votemper      (time_counter, deptht, y, x) float32 dask.array<chunksize=(1, 75, 264, 567), meta=np.ndarray>
    vodensity     (time_counter, deptht, y, x) float32 dask.array<chunksize=(1, 75, 264, 567), meta=np.ndarray>
    vozocrtx      (time_counter, deptht, y, x) float32 dask.array<chunksize=(1, 75, 264, 567), meta=np.ndarray>
    sozotaux      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>
    vomecrty      (time_counter, deptht, y, x) float32 dask.array<chunksize=(1, 75, 264, 567), meta=np.ndarray>
    ...            ...
    sohefldo      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>
    soceshwf      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>
    sowaflup      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>
    sosstobs      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>
    sowaflcd      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>
    sobarhei      (time_counter, y, x) float32 dask.array<chunksize=(1, 264, 567), meta=np.ndarray>

AROME (previmer format)

[12]:
path = get_config_value("osdyn.grcm.arome", "path_previ")
pattern_file = get_config_value("osdyn.grcm.arome",
                                "pattern_previ")
# Parse the period stored as a Python literal in the configuration.
# ast.literal_eval only accepts literals, unlike eval() which would
# execute arbitrary code coming from the config file.
import ast
pattern_date = ast.literal_eval(
    get_config_value("osdyn.grcm.arome", "period_previ")
)
infiles = list_files(os.path.join(path, pattern_file), pattern_date)
Found 48 files
[13]:
# Select a subdomain: convert the lon/lat box into grid indices
# for the 'eau' variable, using the first file as a reference
tzyx = tzyx2index(infiles[0], 'eau', lons=(2.5,3.5), lats=(42.,43.))
[14]:
# Read and gather the files into a unique one, restricted to the subdomain
out = get_dataset(infiles, varnames=['eau'], subdomain=tzyx, decode_times=True)
[15]:
out
[15]:
<xarray.Dataset>
Dimensions:    (time: 48, height: 1, latitude: 43, longitude: 43)
Coordinates:
  * time       (time) datetime64[ns] 2013-12-30 ... 2013-12-31T23:00:00
  * latitude   (latitude) float32 41.97 42.0 42.03 42.05 ... 42.97 43.0 43.03
  * longitude  (longitude) float32 2.475 2.5 2.525 2.55 ... 3.45 3.475 3.5 3.525
  * height     (height) float32 0.0
Data variables:
    eau        (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 43, 43), meta=np.ndarray>

MesoNH

More complicated as the time is not an axis

[16]:
# Get the list of files
path = get_config_value("osdyn.grcm.mesonh", "path_v5_3")
pattern_file = get_config_value("osdyn.grcm.mesonh",
                                "pattern_obc_v5_3")
# Use a shorter period than the configured "period_v5_3" to keep the
# example fast. The original cell read the configured period with eval()
# and then immediately overwrote it, so that dead statement was removed.
pattern_date = ("2011-09-02 15:00:00", "2011-09-04 00:00:00")
infiles = list_files(os.path.join(path, pattern_file), pattern_date)
Found 34 files
[17]:
# Create the process to apply to each profile
from osdyn.grcm.mesonh import get_datetime


def userprocess(dsu):
    """
    Preprocessing hook passed to `xarray.open_mfdataset` through
    `preprocess=userprocess`; it is applied to each dataset prior to
    the concatenation.

    Parameters
    ----------
    dsu : xarray.Dataset
        One of the files.

    Returns
    -------
    xarray.Dataset
        The modified dataset in which the time axis has been added and a few
        variables have been collected.

    """
    # Build the time record from the MesoNH date/time scalar variables
    record_time = get_datetime(dsu.DTCUR__TDATE, dsu.DTCUR__TIME)
    # Keep only the wanted data variable plus the new time coordinate
    keep = ['UT', 'time']
    return dsu.assign(time=record_time)[keep]
[18]:
# Gather MesoNH variables along time, applying `userprocess` to each file
mnh = get_dataset(infiles, userprocess=userprocess, decode_times=False)
[19]:
mnh
[19]:
<xarray.Dataset>
Dimensions:  (time: 34, Z: 102, Y: 130, X: 130)
Coordinates:
  * time     (time) datetime64[ns] 2011-09-02T15:00:00 ... 2011-09-04
Dimensions without coordinates: Z, Y, X
Data variables:
    UT       (time, Z, Y, X) float64 dask.array<chunksize=(1, 102, 130, 130), meta=np.ndarray>
[20]:
# grid of MesoNH file: the grid variables are time-independent,
# so they are read once from the first file only
import xarray as xr

grid_varnames = [
    'LON0', 'LAT0', 'BETA', 'JPHEXT', 'XHAT', 'YHAT', 'ZHAT',
    'LAT', 'LON', 'ZS', 'ZSMT',
]
mnh_grid = xr.open_dataset(infiles[0])[grid_varnames]
[21]:
# Add the grid into the MesoNH dataset (merge the grid variables read above)
mnh = mnh.merge(mnh_grid)
[22]:
mnh
[22]:
<xarray.Dataset>
Dimensions:  (time: 34, Z: 102, Y: 130, X: 130)
Coordinates:
  * time     (time) datetime64[ns] 2011-09-02T15:00:00 ... 2011-09-04
Dimensions without coordinates: Z, Y, X
Data variables:
    UT       (time, Z, Y, X) float64 dask.array<chunksize=(1, 102, 130, 130), meta=np.ndarray>
    LON0     float64 -5.15
    LAT0     float64 48.38
    BETA     float64 0.0
    JPHEXT   int32 1
    XHAT     (X) float64 1.7e+05 1.712e+05 1.725e+05 ... 3.3e+05 3.312e+05
    YHAT     (Y) float64 1.7e+05 1.712e+05 1.725e+05 ... 3.3e+05 3.312e+05
    ZHAT     (Z) float64 -10.0 0.0 10.0 20.5 ... 2.235e+04 2.315e+04 2.395e+04
    LAT      (Y, X) float64 47.64 47.64 47.64 47.64 ... 49.09 49.09 49.09 49.09
    LON      (Y, X) float64 -6.242 -6.225 -6.208 -6.191 ... -4.092 -4.075 -4.058
    ZS       (Y, X) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0
    ZSMT     (Y, X) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0

Basic files

From database organisation point of view

Automatically merge a split xarray Dataset. This is designed to behave like xarray.open_mfdataset, except it supports concatenation along multiple dimensions.

[23]:
# Arpege (Previmer format)
path = get_config_value("osdyn.grcm.arpegehr", "path_previ")
pattern_file = get_config_value("osdyn.grcm.arpegehr",
                                "pattern_previ")
# Parse the period stored as a Python literal in the configuration.
# ast.literal_eval only accepts literals, unlike eval() which would
# execute arbitrary code coming from the config file.
import ast
pattern_date = ast.literal_eval(
    get_config_value("osdyn.grcm.arpegehr", "period_previ")
)
infiles = list_files(os.path.join(path, pattern_file), pattern_date)
Found 24 files
[24]:
auto_merge(infiles)
[24]:
<xarray.Dataset>
Dimensions:    (time: 24, latitude: 366, longitude: 691, height: 1)
Coordinates:
  * time       (time) datetime64[ns] 2019-08-12 ... 2019-08-12T23:00:00
  * latitude   (latitude) float32 30.0 30.1 30.2 30.3 ... 66.2 66.3 66.4 66.5
  * longitude  (longitude) float32 -32.0 -31.9 -31.8 -31.7 ... 36.8 36.9 37.0
  * height     (height) float32 0.0
Data variables:
    hu2m       (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 366, 691), meta=np.ndarray>
    nebul      (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 366, 691), meta=np.ndarray>
    pmer       (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 366, 691), meta=np.ndarray>
    t2m        (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 366, 691), meta=np.ndarray>
    u10m       (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 366, 691), meta=np.ndarray>
    v10m       (time, height, latitude, longitude) float32 dask.array<chunksize=(1, 1, 366, 691), meta=np.ndarray>

Note

Simplest way. Very useful when files are split by periods and all the variables are time-dependent and available in each file

Warning

DataArrays are extended over the concatenation dimension. See, for instance, the dx variables below

[25]:
# Get the list of files according to the period
path = get_config_value("osdyn.grcm.mars", "path_v9_6")
pattern_file = get_config_value("osdyn.grcm.mars", "pattern_file_v9_6")
pattern_date = ("2013-01-30","2013-01-30 03:00:00")  # short period to keep the example fast
# Longer alternative period:
#pattern_date = ("2013-01-30", "2013-01-30 10:00:00")
infiles = list_files(os.path.join(path, pattern_file), pattern_date)
Found 4 files
[26]:
infiles
[26]:
['/home/datawork-lops-siam-moawi/PROJECTS/OSDYN/DATA/MODELS/MARS/V96/champs_HOURLY_DIF-V10.03_20130130T000000Z.nc',
 '/home/datawork-lops-siam-moawi/PROJECTS/OSDYN/DATA/MODELS/MARS/V96/champs_HOURLY_DIF-V10.03_20130130T010000Z.nc',
 '/home/datawork-lops-siam-moawi/PROJECTS/OSDYN/DATA/MODELS/MARS/V96/champs_HOURLY_DIF-V10.03_20130130T020000Z.nc',
 '/home/datawork-lops-siam-moawi/PROJECTS/OSDYN/DATA/MODELS/MARS/V96/champs_HOURLY_DIF-V10.03_20130130T030000Z.nc']
[27]:
auto = auto_merge(infiles, decode_times=False)
# Drop the global attributes so the comparison with the get_dataset
# result at the end of the notebook ignores them
auto.attrs = {}
[28]:
auto
[28]:
<xarray.Dataset>
Dimensions:      (ni: 1101, nj: 463, ni_u: 1101, nj_u: 463, ni_v: 1101, nj_v: 463, ni_f: 1101, nj_f: 463, time: 4, level: 60, level_w: 61)
Coordinates: (12/19)
  * ni           (ni) float32 0.0 1.0 2.0 3.0 ... 1.098e+03 1.099e+03 1.1e+03
  * nj           (nj) float32 0.0 1.0 2.0 3.0 4.0 ... 459.0 460.0 461.0 462.0
  * ni_u         (ni_u) float32 0.5 1.5 2.5 3.5 ... 1.098e+03 1.1e+03 1.1e+03
  * nj_u         (nj_u) float32 0.0 1.0 2.0 3.0 4.0 ... 459.0 460.0 461.0 462.0
  * ni_v         (ni_v) float32 0.0 1.0 2.0 3.0 ... 1.098e+03 1.099e+03 1.1e+03
  * nj_v         (nj_v) float32 0.5 1.5 2.5 3.5 4.5 ... 459.5 460.5 461.5 462.5
    ...           ...
    longitude_v  (nj_v, ni_v) float64 dask.array<chunksize=(463, 1101), meta=np.ndarray>
    latitude_u   (nj_u, ni_u) float64 dask.array<chunksize=(463, 1101), meta=np.ndarray>
    latitude_v   (nj_v, ni_v) float64 dask.array<chunksize=(463, 1101), meta=np.ndarray>
    longitude_f  (nj_f, ni_f) float64 dask.array<chunksize=(463, 1101), meta=np.ndarray>
    latitude_f   (nj_f, ni_f) float64 dask.array<chunksize=(463, 1101), meta=np.ndarray>
  * level_w      (level_w) float32 -1.0 -0.9833 -0.9667 ... -0.01667 0.0
Data variables: (12/35)
    dx_f         (time, nj_f, ni_f) float64 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    dx_u         (time, nj_u, ni_u) float64 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    dx_v         (time, nj_v, ni_v) float64 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    dx           (time, nj, ni) float64 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    dy_v         (time, nj_v, ni_v) float64 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    dy_u         (time, nj_u, ni_u) float64 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    ...           ...
    N2           (time, level_w, nj, ni) float32 dask.array<chunksize=(1, 61, 463, 1101), meta=np.ndarray>
    ECT          (time, level_w, nj, ni) float32 dask.array<chunksize=(1, 61, 463, 1101), meta=np.ndarray>
    EPS          (time, level_w, nj, ni) float32 dask.array<chunksize=(1, 61, 463, 1101), meta=np.ndarray>
    BZ           (time, level, nj, ni) float32 dask.array<chunksize=(1, 60, 463, 1101), meta=np.ndarray>
    TAUX         (time, nj_u, ni_u) float32 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>
    TAUY         (time, nj_v, ni_v) float32 dask.array<chunksize=(1, 463, 1101), meta=np.ndarray>

Note

auto_merge = database when gather_unique_dim=None

[29]:
ds = get_dataset(infiles, gather_unique_dim=None)
[30]:
auto.time
[30]:
<xarray.DataArray 'time' (time: 4)>
array([3.568493e+09, 3.568496e+09, 3.568500e+09, 3.568504e+09])
Coordinates:
  * time     (time) float64 3.568e+09 3.568e+09 3.568e+09 3.569e+09
[31]:
ds.time
[31]:
<xarray.DataArray 'time' (time: 4)>
array([3.568493e+09, 3.568496e+09, 3.568500e+09, 3.568504e+09])
Coordinates:
  * time     (time) float64 3.568e+09 3.568e+09 3.568e+09 3.569e+09
[32]:
xr.testing.assert_identical(ds, auto)