{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f87ea1e0",
   "metadata": {},
   "source": [
    "# NWM Gridded Data Loading"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "363c4601",
   "metadata": {},
   "source": [
    "When fetching NWM gridded data, such as rainfall, TEEHR summarizes the grid pixels to user-provided polygons\n",
    "through an area-weighted average.\n",
    "\n",
    "In TEEHR this is a two-step process consisting of:\n",
    "1. Calculating the pixel weights for your grid and polygons of interest (weights are the fraction of each grid cell intersecting each polygon).\n",
    "2. Fetching the grids and calculating the weighted average of pixel values for each polygon."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "38f2f9f4-5294-4f9b-971a-931b91da8f85",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Import the required packages.\n",
    "import os\n",
    "\n",
    "from dask.distributed import Client\n",
    "from pathlib import Path\n",
    "\n",
    "import teehr.loading.nwm.nwm_grids as tlg\n",
    "\n",
    "import teehr.utilities.generate_weights as gw\n",
    "from teehr.loading.nwm.const import CONUS_NWM_WKT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specify input variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76e363e7-a3e0-427f-811f-a6298b93536f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Working directory for the grid template, zone polygons and weights file.\n",
    "TEMP_GEO_DIR = Path.home() / \"temp/geo\"\n",
    "TEMP_GEO_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Generating weights\n",
    "GRID_TEMPLATE_FILE = TEMP_GEO_DIR / \"nwm.t00z.short_range.forcing.f001.conus.nc\"\n",
    "ZONE_GEO_FILE = TEMP_GEO_DIR / \"nextgen_03S.parquet\"\n",
    "ZONAL_WEIGHTS_FILEPATH = TEMP_GEO_DIR / \"nextgen_03S_weights.parquet\"\n",
    "UNIQUE_ZONE_ID = \"id\"\n",
    "\n",
    "# NWM\n",
    "# Configuration options: forcing_short_range, forcing_analysis_assim, forcing_medium_range\n",
    "CONFIGURATION = \"forcing_short_range\"\n",
    "OUTPUT_TYPE = \"forcing\"\n",
    "VARIABLE_NAME = \"RAINRATE\"\n",
    "\n",
    "START_DATE = \"2020-12-18\"\n",
    "INGEST_DAYS = 1\n",
    "\n",
    "JSON_DIR = Path.home() / \"temp/parquet/jsons/\"\n",
    "OUTPUT_DIR = Path.home() / \"temp/parquet\"\n",
    "\n",
    "# Currently accepts \"nwm22\" or \"nwm30\".\n",
    "# Use \"nwm22\" for dates prior to 09-19-2023.\n",
    "NWM_VERSION = \"nwm22\"\n",
    "\n",
    "# Remote location from which to fetch the data (\"GCS\", \"NOMADS\", \"DSTOR\").\n",
    "DATA_SOURCE = \"GCS\"\n",
    "\n",
    "# When data_source = \"GCS\", specifies the preference in creating Kerchunk reference json files:\n",
    "#   \"local\"  - always create new json files from netcdf files in GCS and save locally, if they do not already exist\n",
    "#   \"remote\" - read the CIROH pre-generated jsons from s3, ignoring any that are unavailable\n",
    "#   \"auto\"   - read the CIROH pre-generated jsons from s3, and create any that are unavailable, storing locally\n",
    "KERCHUNK_METHOD = \"auto\"\n",
    "\n",
    "# NOTE(review): CONCAT_DIMS is not passed to any call in this notebook.\n",
    "CONCAT_DIMS = [\"time\"]  # \"reference_time\"\n",
    "\n",
    "# Only used if an assimilation run is selected.\n",
    "T_MINUS = [0, 1, 2]\n",
    "\n",
    "# If True, the missing file(s) will be skipped and the process will resume.\n",
    "# If False, TEEHR will fail if a missing NWM file is encountered.\n",
    "IGNORE_MISSING_FILE = True\n",
    "\n",
    "# If True, existing output files will be overwritten.\n",
    "# If False (default), existing files are retained.\n",
    "OVERWRITE_OUTPUT = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch a template forcing netCDF file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53724249-5fbe-4fa4-8fd9-96a3238d16b3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Download to GRID_TEMPLATE_FILE instead of a hard-coded /home/jovyan path, so the\n",
    "# file lands where the weights step reads it regardless of the user's home directory.\n",
    "!wget -O \"{GRID_TEMPLATE_FILE}\" \\\n",
    "https://storage.googleapis.com/national-water-model/nwm.20220101/forcing_short_range/nwm.t00z.short_range.forcing.f001.conus.nc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch some example polygons (nextgen divides)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a717b43",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Download to ZONE_GEO_FILE instead of a hard-coded /home/jovyan path, so the\n",
    "# polygons land where the weights step reads them regardless of the user's home directory.\n",
    "!wget -O \"{ZONE_GEO_FILE}\" https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.1.1/nextgen/conus_divides/vpuid=03S/part-0.parquet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start a local dask cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30f8283c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# os.cpu_count() can return None when the CPU count cannot be determined;\n",
    "# fall back to 1 so the subtraction below cannot raise a TypeError.\n",
    "cpu_total = os.cpu_count() or 1\n",
    "\n",
    "# Use one fewer worker than the CPU count, but never fewer than one.\n",
    "n_workers = max(cpu_total - 1, 1)\n",
    "client = Client(n_workers=n_workers)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate the weights file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ac3c383-9aaf-4fd7-8c33-461ff5e87bb7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Step 1 of 2: calculate the pixel weights for the zone polygons.\n",
    "gw.generate_weights_file(\n",
    "    # Grid template and the variable to read from it.\n",
    "    template_dataset=GRID_TEMPLATE_FILE,\n",
    "    variable_name=VARIABLE_NAME,\n",
    "    crs_wkt=CONUS_NWM_WKT,\n",
    "    # Zone polygons and the field that identifies each zone.\n",
    "    zone_polygon_filepath=ZONE_GEO_FILE,\n",
    "    unique_zone_id=UNIQUE_ZONE_ID,\n",
    "    layer=\"divides\",\n",
    "    # Where the weights file is written.\n",
    "    output_weights_filepath=ZONAL_WEIGHTS_FILEPATH,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch the gridded data summarized to the polygons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b77bd47",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Step 2 of 2: fetch the grids and summarize them to the polygons using the weights file.\n",
    "tlg.nwm_grids_to_parquet(\n",
    "    # What to fetch.\n",
    "    nwm_version=NWM_VERSION,\n",
    "    configuration=CONFIGURATION,\n",
    "    output_type=OUTPUT_TYPE,\n",
    "    variable_name=VARIABLE_NAME,\n",
    "    # Time window.\n",
    "    start_date=START_DATE,\n",
    "    ingest_days=INGEST_DAYS,\n",
    "    t_minus_hours=T_MINUS,\n",
    "    # Where to fetch it from and how the Kerchunk jsons are obtained.\n",
    "    data_source=DATA_SOURCE,\n",
    "    kerchunk_method=KERCHUNK_METHOD,\n",
    "    # Local inputs and outputs.\n",
    "    zonal_weights_filepath=ZONAL_WEIGHTS_FILEPATH,\n",
    "    json_dir=JSON_DIR,\n",
    "    output_parquet_dir=OUTPUT_DIR,\n",
    "    # Behavior on missing inputs and existing outputs.\n",
    "    ignore_missing_file=IGNORE_MISSING_FILE,\n",
    "    overwrite_output=OVERWRITE_OUTPUT,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}