NWM Gridded Data Loading#

When fetching NWM gridded data, such as rainfall, TEEHR summarizes the grid pixel values to user-provided polygons using an area-weighted average.

In TEEHR this is a two-step process:

  1. Calculating the pixel weights for your grid and polygons of interest (each weight is the fraction of a grid cell's area that intersects a given polygon).

  2. Fetching the grids and calculating the weighted average of the pixel values within each polygon (a minimal sketch of this calculation follows the list).
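
For intuition, here is a minimal sketch of the weighted-average step. This is not the TEEHR implementation; the weights table, column names, and values are all illustrative.

import numpy as np
import pandas as pd

# Hypothetical weights table: one row per (polygon, pixel) pair, where
# "weight" is the fraction of that grid cell falling inside the polygon.
weights = pd.DataFrame({
    "zone": ["poly_1", "poly_1", "poly_2"],
    "row": [10, 10, 11],          # pixel row index in the grid
    "col": [5, 6, 6],             # pixel column index in the grid
    "weight": [0.75, 0.25, 1.0],
})

grid = np.random.rand(20, 20)  # stand-in for one RAINRATE time slice

# Area-weighted average per polygon: sum(w_i * v_i) / sum(w_i)
weights["value"] = grid[weights["row"], weights["col"]]
weighted_sums = (weights["weight"] * weights["value"]).groupby(weights["zone"]).sum()
summary = weighted_sums / weights.groupby("zone")["weight"].sum()
print(summary)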

# Import the required packages.
import os
from pathlib import Path

from dask.distributed import Client

import teehr.loading.nwm.nwm_grids as tlg
import teehr.utilities.generate_weights as gw
from teehr.loading.nwm.const import CONUS_NWM_WKT

Specify input variables#

TEMP_GEO_DIR = Path(Path.home(), "temp/geo")
TEMP_GEO_DIR.mkdir(exist_ok=True, parents=True)

# Generating weights
GRID_TEMPLATE_FILE = Path(TEMP_GEO_DIR, "nwm.t00z.short_range.forcing.f001.conus.nc")
ZONE_GEO_FILE = Path(TEMP_GEO_DIR, "nextgen_03S.parquet")
ZONAL_WEIGHTS_FILEPATH = Path(TEMP_GEO_DIR, "nextgen_03S_weights.parquet")
UNIQUE_ZONE_ID = "id"

# NWM
CONFIGURATION = "forcing_short_range"  # forcing_short_range, forcing_analysis_assim, forcing_medium_range
OUTPUT_TYPE = "forcing"
VARIABLE_NAME = "RAINRATE"

START_DATE = "2020-12-18"
INGEST_DAYS = 1

JSON_DIR = Path(Path.home(), "temp/parquet/jsons/")
OUTPUT_DIR = Path(Path.home(), "temp/parquet")

NWM_VERSION = "nwm22"  # Currently accepts "nwm22" or "nwm30"
                       # Use "nwm22" for dates prior to 09-19-2023

DATA_SOURCE = "GCS"    # Specifies the remote location from which to fetch the data
                       # ("GCS", "NOMADS", "DSTOR")

KERCHUNK_METHOD = "auto"  # When data_source = "GCS", specifies the preference in creating Kerchunk reference json files.
                          # "local" - always create new json files from netcdf files in GCS and save locally, if they do not already exist
                          # "remote" - read the CIROH pre-generated jsons from s3, ignoring any that are unavailable
                          # "auto" - read the CIROH pre-generated jsons from s3, and create any that are unavailable, storing locally

CONCAT_DIMS = ["time"]  # Dimension(s) along which to concatenate the NWM files ("reference_time" is another option)
T_MINUS = [0, 1, 2]  # Hours prior to the reference time ("tm" files) to include; only used if an assimilation configuration is selected
IGNORE_MISSING_FILE = True  # If True, missing NWM files are skipped and processing continues
                            # If False, TEEHR raises an error when a missing NWM file is encountered
OVERWRITE_OUTPUT = True  # If True, existing output files will be overwritten
                         # If False (default), existing files are retained
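
If you would rather derive NWM_VERSION from START_DATE than hard-code it, here is a minimal sketch based on the 2023-09-19 cutover noted above (purely illustrative; the hard-coded value works fine):

from datetime import datetime

# Illustrative only: pick the NWM version from the v3.0 cutover date noted above.
NWM_VERSION = (
    "nwm30"
    if datetime.strptime(START_DATE, "%Y-%m-%d") >= datetime(2023, 9, 19)
    else "nwm22"
)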

Fetch a template forcing netCDF file#

!wget -O {GRID_TEMPLATE_FILE} \
https://storage.googleapis.com/national-water-model/nwm.20220101/forcing_short_range/nwm.t00z.short_range.forcing.f001.conus.nc

Fetch some example polygons (nextgen divides)#

!wget -O {ZONE_GEO_FILE} https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.1.1/nextgen/conus_divides/vpuid=03S/part-0.parquet

Start a local Dask cluster#

# Use all but one CPU core, keeping at least one worker.
n_workers = max(os.cpu_count() - 1, 1)
client = Client(n_workers=n_workers)
client
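
The client repr displayed in the notebook includes a link to the Dask dashboard, which is handy for watching the weight-generation and fetch tasks run in parallel.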

Generate the weights file#

%%time
gw.generate_weights_file(
    zone_polygon_filepath=ZONE_GEO_FILE,
    template_dataset=GRID_TEMPLATE_FILE,
    variable_name=VARIABLE_NAME,
    output_weights_filepath=ZONAL_WEIGHTS_FILEPATH,
    crs_wkt=CONUS_NWM_WKT,
    unique_zone_id=UNIQUE_ZONE_ID,
    layer="divides"
)
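
To sanity-check the weights before fetching any grids, you can read the parquet file back with pandas. This just previews the table; the exact column names are whatever TEEHR writes.

import pandas as pd

weights_df = pd.read_parquet(ZONAL_WEIGHTS_FILEPATH)
weights_df.head()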

Fetch the gridded data summarized to the polygons#

%%time
tlg.nwm_grids_to_parquet(
    configuration=CONFIGURATION,
    output_type=OUTPUT_TYPE,
    variable_name=VARIABLE_NAME,
    start_date=START_DATE,
    ingest_days=INGEST_DAYS,
    zonal_weights_filepath=ZONAL_WEIGHTS_FILEPATH,
    json_dir=JSON_DIR,
    output_parquet_dir=OUTPUT_DIR,
    nwm_version=NWM_VERSION,
    data_source=DATA_SOURCE,
    kerchunk_method=KERCHUNK_METHOD,
    t_minus_hours=T_MINUS,
    ignore_missing_file=IGNORE_MISSING_FILE,
    overwrite_output=OVERWRITE_OUTPUT
)
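
When the fetch finishes, the polygon-averaged values are written as parquet files under OUTPUT_DIR. A quick way to preview them (assuming the output files carry a .parquet extension; the schema is whatever TEEHR writes):

import pandas as pd

parquet_files = sorted(Path(OUTPUT_DIR).glob("*.parquet"))
df = pd.read_parquet(parquet_files[0])
df.head()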