PEDSnet - ERA5 tract10 zonal statistics v1

Published

March 27, 2025

## Dependencies
library(pacman) 
## carrier provides crate() and arrow provides write_parquet(), both used below
pacman::p_load(
  tidyverse, sf, terra, exactextractr, ncdf4,
  tigris, rmapshaper, sfarrow, geoarrow, arrow,
  glue, cli, tictoc, assertr, purrr, furrr, carrier,
  leaflet, reactable, ecmwfr, here
)

## Set directory
setwd(here("notebooks/ERA5 zonal statistics"))

## Local objects
context = lst(
  boundaries_server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/boundaries'
)

## Raster Data

### Inventory

Code
server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw'

# Build an inventory of the monthly NetCDF files; file names look like
# era5-conus__2009_1__total_precipitation.nc
df_files = tibble(
  path = list.files(server, full.names = T),
  file = basename(path)
) |> 
  select(file, everything()) |> 
  mutate(
    split_tmp = str_split(file, pattern = '__'),
    year_string = map_chr(split_tmp, ~.x[2]),
    variable = map_chr(split_tmp, ~.x[3]),
    year = str_sub(year_string, 1, 4) |> parse_number(),
    month = str_sub(year_string, -2L, -1L) |> parse_number()
  ) |> 
  select(-split_tmp)

Let’s get a quick summary of the work we need to do.

Code
df_files |> 
  count(variable, name = 'n_months') |> 
  mutate(n_years = n_months / 12) 
variable                   n_months n_years
10m_u_component_of_wind.nc      180      15
10m_v_component_of_wind.nc      180      15
snow_depth.nc                   180      15
snowfall.nc                     180      15
surface_runoff.nc               180      15
total_precipitation.nc          180      15
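
Before moving on, a quick sanity check (a sketch using the assertr package, which is loaded above) that every variable really does have all 15 years × 12 months of files:

```{r}
# Sanity-check sketch: stop early if any variable is missing monthly files.
# verify() passes the data through when the condition holds and errors otherwise.
df_files |>
  count(variable, name = 'n_months') |>
  assertr::verify(n_months == 15 * 12)
```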

### Metadata check

Code
nc_example = df_files |> 
  filter(variable == 'total_precipitation.nc') |> 
  slice(1) |> 
  pull(path)
nc_example
[1] "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw/era5-conus__2009_1__total_precipitation.nc"
Code
nc1 =  nc_example |> 
  nc_open()
nc1
File //files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw/era5-conus__2009_1__total_precipitation.nc (NC_FORMAT_NETCDF4):

     3 variables (excluding dimension variables):
        8 byte int number[]   (Contiguous storage)  
            long_name: ensemble member numerical id
            units: 1
            standard_name: realization
        string expver[valid_time]   (Contiguous storage)  
        float tp[longitude,latitude,valid_time]   (Chunking: [156,73,186])  (Compression: shuffle,level 1)
[1] ">>>> WARNING <<<  attribute GRIB_paramId is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_numberOfPoints is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_stepUnits is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_uvRelativeToGrid is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_NV is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_Nx is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_Ny is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_iScansNegatively is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_jPointsAreConsecutive is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_jScansPositively is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_totalNumber is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
            _FillValue: NaN
            GRIB_paramId: 228
            GRIB_dataType: fc
            GRIB_numberOfPoints: 180711
            GRIB_typeOfLevel: surface
            GRIB_stepUnits: 1
            GRIB_stepType: accum
            GRIB_gridType: regular_ll
            GRIB_uvRelativeToGrid: 0
            GRIB_NV: 0
            GRIB_Nx: 621
            GRIB_Ny: 291
            GRIB_cfName: unknown
            GRIB_cfVarName: tp
            GRIB_gridDefinitionDescription: Latitude/Longitude Grid
            GRIB_iDirectionIncrementInDegrees: 0.1
            GRIB_iScansNegatively: 0
            GRIB_jDirectionIncrementInDegrees: 0.1
            GRIB_jPointsAreConsecutive: 0
            GRIB_jScansPositively: 0
            GRIB_latitudeOfFirstGridPointInDegrees: 52
            GRIB_latitudeOfLastGridPointInDegrees: 23
            GRIB_longitudeOfFirstGridPointInDegrees: -128
            GRIB_longitudeOfLastGridPointInDegrees: -66
            GRIB_missingValue: 3.40282346638529e+38
            GRIB_name: Total precipitation
            GRIB_shortName: tp
            GRIB_totalNumber: 0
            GRIB_units: m
            long_name: Total precipitation
            units: m
            standard_name: unknown
            GRIB_surface: 0
            coordinates: number valid_time latitude longitude expver

     3 dimensions:
        valid_time  Size:744 
            long_name: time
            standard_name: time
            units: seconds since 1970-01-01
            calendar: proleptic_gregorian
        latitude  Size:291 
            _FillValue: NaN
            units: degrees_north
            standard_name: latitude
            long_name: latitude
            stored_direction: decreasing
        longitude  Size:621 
            _FillValue: NaN
            units: degrees_east
            standard_name: longitude
            long_name: longitude
[1] ">>>> WARNING <<<  attribute GRIB_subCentre is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"

    6 global attributes:
        GRIB_centre: ecmf
        GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts
        GRIB_subCentre: 0
        Conventions: CF-1.7
        institution: European Centre for Medium-Range Weather Forecasts
        history: 2025-02-17T21:58 GRIB to CDM+CF via cfgrib-0.9.15.0/ecCodes-2.39.0 with {"source": "data.grib", "filter_by_keys": {}, "encode_cf": ["parameter", "time", "geography", "vertical"]}

Now let’s examine the details.

Code
names(nc1$var)
[1] "number" "expver" "tp"    

Looks good. The variable we want is tp.

Code
names(nc1$dim)
[1] "valid_time" "latitude"   "longitude" 

Okay, we have cell coordinates (latitude, longitude) plus time.

Code
time_vals <- ncvar_get(nc1, "valid_time")
dates <- as.POSIXct(time_vals, origin="1970-01-01", tz="UTC")

head(dates)
[1] "2009-01-01 00:00:00 UTC" "2009-01-01 01:00:00 UTC"
[3] "2009-01-01 02:00:00 UTC" "2009-01-01 03:00:00 UTC"
[5] "2009-01-01 04:00:00 UTC" "2009-01-01 05:00:00 UTC"
Code
print(length(dates))
[1] 744

This makes sense: January has 31 days × 24 hours = 744 hours.
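
For the zonal statistics step we will read these files with terra rather than ncdf4. Here is a minimal sketch (assuming terra exposes the tp variable of this file as a subdataset) of loading one month as a SpatRaster and confirming it carries the same 744 hourly layers:

```{r}
# Sketch: load one monthly NetCDF as a multi-layer SpatRaster (one layer per hour).
# subds = "tp" assumes terra picks up the tp variable as a subdataset of this file.
r_example <- terra::rast(nc_example, subds = "tp")

terra::nlyr(r_example)        # expect 744 layers for January 2009
head(terra::time(r_example))  # hourly UTC timestamps, matching `dates` above
```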

## Boundaries

Steps:

  • Import: raw boundaries from Census2010_US20230929.gdb
  • Subset: save the states of interest in a format that exactextract can operate on, either (1) geodatabase (gdb) or (2) shapefile (shp)

### Raw Geodatabase

We need the contiguous US tract10 boundaries that have land area greater than zero. We will import them from a geodatabase here:

Code
path_gdb = '//files.drexel.edu/colleges/SOPH/Shared/UHC/ResearchAndData/UHC_Data/Census2010/Geodatabases/Census2010_US20230929.gdb'

dir.exists(path_gdb)
[1] TRUE

Let’s see the feature-classes available.

Code
path_gdb |> sf::st_layers()
Driver: OpenFileGDB 
Available layers:
                      layer_name geometry_type features fields
1 CT_2010_ContUSlandRepairAlbers Multi Polygon    72246     16
2    CT_2010_ContUSland_TimeZone Multi Polygon    72246     20
                                             crs_name
1 USA_Contiguous_Albers_Equal_Area_Conic_USGS_version
2 USA_Contiguous_Albers_Equal_Area_Conic_USGS_version

We will use the CT_2010_ContUSland_TimeZone feature class.

Code
sf_boundaries = path_gdb |> 
  sf::st_read("CT_2010_ContUSland_TimeZone") 
Reading layer `CT_2010_ContUSland_TimeZone' from data source 
  `\\files.drexel.edu\colleges\SOPH\Shared\UHC\ResearchAndData\UHC_Data\Census2010\Geodatabases\Census2010_US20230929.gdb' 
  using driver `OpenFileGDB'
Simple feature collection with 72246 features and 20 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -2356499 ymin: 264001.4 xmax: 2259829 ymax: 3177417
Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic_USGS_version

### Subset

We decided to focus on 16 PEDSNET states for now. Let’s bring in those states.

Code
df_states = read_csv('PEDSNETStates.csv')

Let’s subset the original all tracts object to states of interest.

Code
sf_subset = sf_boundaries |> 
  filter(STATEFP10 %in% df_states$STFIPS)

sf_subset |>
  sf::st_drop_geometry() |> 
  count(STATEFP10)
STATEFP10 n
06 8036
08 1249
10 215
11 179
17 3121
18 1508
21 1115
24 1394
34 2004
39 2946
42 3217
48 5253
51 1895
53 1446
54 484
55 1393

Now we want to save these by state, so that exactextractr has per-state boundary files to work from.

Code
sf_subset |> 
  group_by(STATEFP10) |>
  group_walk(function(.x, .y) {
    ## .y is a one-row tibble holding the grouping key (the state FIPS code)
    folder_tmp = glue('{context$boundaries_server}/shp/tract10_{.y$STATEFP10}')
    dir.create(folder_tmp, showWarnings = F, recursive = T)
    path_tmp = glue('{folder_tmp}/tract10_{.y$STATEFP10}.shp')
    if (!file.exists(path_tmp)) .x |> sf::st_write(path_tmp)
    cli_alert_success('Saved: {path_tmp}')
  })

## Zonal Statistics

### Goals

#### Final Table

The final tables will have these columns.

  1. tract geoid
  2. local time
  3. time zone (linkable by tract10 GEOID; this information comes from the CT_2010_ContUSland_TimeZone feature class in the original GDB), used to convert UTC to local time
  4. variable
  5. value

#### Intermediate Table

The intermediate tables will have these columns; the final table is derived from them by converting UTC date-times to each tract’s local time (see the sketch below).

  1. tract geoid
  2. date-time (UTC)
  3. variable
  4. value
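
The main step between the intermediate and final tables is converting UTC date-times to local time using each tract’s time zone. Below is a minimal sketch, assuming the time-zone column holds IANA names (e.g. America/New_York) and using hypothetical table and column names (df_intermediate, df_tract_tz, tzid):

```{r}
# Hypothetical names: df_intermediate has geoid, datetime_utc, variable, value;
# df_tract_tz has geoid and tzid (IANA time-zone names from the boundary GDB).
# Local time is stored as text because a single POSIXct column cannot carry
# more than one time zone.
df_final <- df_intermediate |>
  dplyr::left_join(df_tract_tz, by = "geoid") |>
  dplyr::group_by(tzid) |>   # with_tz() takes one time zone at a time
  dplyr::mutate(
    datetime_local = format(lubridate::with_tz(datetime_utc, tzone = unique(tzid)),
                            "%Y-%m-%dT%H:%M:%S")
  ) |>
  dplyr::ungroup() |>
  dplyr::select(geoid, datetime_local, tzid, variable, value)
```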

### Setup

There are a few things to set up. One is creating a local cache folder to store results (pre-compiled intermediate tables).

Code
list.files('cache')
[1] "zonal_stats"     "zonal_stats_dev"
  • the zonal_stats folder is for final results
  • the zonal_stats_dev folder is for storing benchmark results; feel free to delete its contents at any time.

Repo note: we will gitignore these results so they don’t get pushed to GitHub.
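
A small convenience sketch (paths are relative to this notebook’s directory, set above with setwd()) to create both cache folders on first run:

```{r}
# Create the two local cache folders if they do not already exist.
for (d in c("cache/zonal_stats", "cache/zonal_stats_dev")) {
  if (!dir.exists(d)) dir.create(d, recursive = TRUE)
}
```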

### Processing Function

Things we can remove:

  • [ ] Added hour parameters because we are processing at the hourly level
```{r}

## Example values for developing the function below interactively
measure_tmp = 'tp'
state_tmp = 'DE'
geo_level_tmp = 'tract10'   # assumed geography label; tract10 is used throughout this notebook
year_tmp = '2010'
month_tmp = '1'
day_tmp = '12'
hour_tmp = '01'
datetimetxt_tmp = glue("{year_tmp}-{month_tmp}-{day_tmp}T{hour_tmp}:00:00Z")
dev = T

## crate() from the carrier package builds a self-contained function that
## captures only the objects listed here (the processing templates).
crate_calc_prism_zonal_exactextract = carrier::crate(
  template_raw = template_raw,
  template = template,
  function(geo_level_tmp, measure_tmp, state_tmp,
           year_tmp, month_tmp, day_tmp, hour_tmp,
           datetimetxt_tmp, dev = FALSE){
    
    { ###### Setup ######
      
      out_folder = 'cache/zonal_stats'
      if (dev == T) out_folder = 'cache/zonal_stats_dev'
      out_dir = base::file.path(
        out_folder,
        glue::glue("measure={measure_tmp}"),
        glue::glue("state={state_tmp}"),
        glue::glue("year={year_tmp}") ,
        glue::glue("month={month_tmp}")  ,
        glue::glue("day={day_tmp}")   ,
        glue::glue("hour={hour_tmp}")    
      )|>
        as.character()
      cli::cli_alert_info("Started: {measure_tmp} {state_tmp} {datetimetxt_tmp}")
      cli::cli_alert_info("Output: {out_dir}")
      
      ## Template contains information for this processing unit: boundary path, raster data path, and potentially other information.
      template_tmp = template
      if (dev == T) template_tmp = template_raw
    
      row = template_tmp |>
        dplyr::filter(
          geo_level == geo_level_tmp,
          measure == measure_tmp,
          year == year_tmp,
          state == state_tmp,
          datetxt == datetimetxt_tmp
        )
      out_file = file.path(out_dir,'data.parquet')
  
    }
    
    { ###### Input Validations ######
      if (file.exists(out_file)){
        cli::cli_alert_info("Already processed: {geo_level_tmp} {measure_tmp} {state_tmp} {year_tmp} {datetimetxt_tmp}")
        return("Already processed")
      }
      
      ## Bail out if the template has no row for this state/measure/date-time
      if(nrow(row) == 0){
        cli::cli_alert_info("No data: {geo_level_tmp} {measure_tmp} {state_tmp} {year_tmp} {datetimetxt_tmp}")
        return("No template rows found")
      }
      }
      
      if (dev == T) print(row)
    }
   
   
    { ###### Computation ######
      
      ## Imports
      r = terra::rast(x = row$r_path_cache)
      z = sf::st_read(row$z_path_cache, quiet = T)

      ## Try .tif raster calculation
      value_results <- tryCatch({
        exactextractr::exact_extract(r, z, 'mean', progress = FALSE)
      }, error = function(e) {
        # If there's an error, try the .bil raster
        r_bil <- terra::rast(x = row$r_path_bil)
        exactextractr::exact_extract(r_bil, z, 'mean', progress = FALSE)
      })
      
      ## If .tif results have NA values then use .bil raster for calculation
      if (any(is.nan(value_results))){
        r = terra::rast(x = row$r_path_bil)
        value_results = exactextractr::exact_extract(
          r, z, 'mean', 
          progress = FALSE )
      }
      
      ## Produce results
      df_results_state_year = terra::as.data.frame(z) |> 
        dplyr::select(state_abb, geoid) |>
        dplyr::mutate(
          date = row$date,
          value = value_results )  |>
        dplyr::select(geoid, date, state_abb, value) 
      
      if (dev == T) print("DONE!!!!")
    }
    
    { ###### Exports ######
      if (!dir.exists(out_dir)) dir.create(out_dir, recursive = T)
      df_results_state_year |> arrow::write_parquet(out_file)
      cli::cli_alert_success("Finished: {measure_tmp} {state_tmp}  {geo_level_tmp} {datetxt_tmp}")
      return()
      
    }
    
  })
 
rm(geo_level_tmp, measure_tmp, state_tmp, year_tmp,
   month_tmp, day_tmp, hour_tmp, datetimetxt_tmp, dev)
```
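
Finally, a hedged sketch of how the crated function could be driven over the processing template in parallel with furrr. The template column names used here (geo_level, measure, state, year, month, day, hour, datetimetxt) are assumptions and must match whatever the upstream template build actually produces.

```{r}
# Hedged usage sketch: fan the crated function out over the template with furrr.
# Template column names below are assumptions, not confirmed by this notebook.
future::plan(future::multisession, workers = 4)

template |>
  dplyr::distinct(geo_level, measure, state, year, month, day, hour, datetimetxt) |>
  furrr::future_pwalk(function(geo_level, measure, state, year, month, day, hour, datetimetxt) {
    crate_calc_prism_zonal_exactextract(
      geo_level_tmp   = geo_level,
      measure_tmp     = measure,
      state_tmp       = state,
      year_tmp        = year,
      month_tmp       = month,
      day_tmp         = day,
      hour_tmp        = hour,
      datetimetxt_tmp = datetimetxt,
      dev             = FALSE
    )
  })
```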