PEDSnet - ERA5 tract10 zonal statistics v1

Published

March 27, 2025

## Dependencies
library(pacman) 
## carrier provides crate() and arrow provides write_parquet(), both used below
pacman::p_load(
  tidyverse, sf, terra, exactextractr, ncdf4,
  tigris, rmapshaper, sfarrow, geoarrow, arrow,
  glue, cli, tictoc, assertr, purrr, furrr, carrier,
  leaflet, reactable, ecmwfr, here
)

## Set directory
setwd(here("notebooks/ERA5 zonal statistics"))

## Local objects
context = lst(
  boundaries_server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/boundaries'
)

## Raster Data

### Inventory

Code
server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw'

# Build an inventory of the monthly NetCDF files; file names look like
# era5-conus__2009_1__total_precipitation.nc
df_files = tibble(
  path = list.files(server, full.names = T),
  file = basename(path)
) |> 
  select(file, everything()) |> 
  mutate(
    split_tmp = str_split(file, pattern = '__'),
    year_string = map_chr(split_tmp, ~.x[2]),
    variable = map_chr(split_tmp, ~.x[3]),
    year = str_sub(year_string, 1, 4) |> parse_number(),
    month = str_sub(year_string, -2L, -1L) |> parse_number()
  ) |> 
  select(-split_tmp)

Let’s get a quick summary of the work we need to do.

Code
df_files |> 
  count(variable, name = 'n_months') |> 
  mutate(n_years = n_months / 12) 
variable                   n_months n_years
10m_u_component_of_wind.nc      180      15
10m_v_component_of_wind.nc      180      15
snow_depth.nc                   180      15
snowfall.nc                     180      15
surface_runoff.nc               180      15
total_precipitation.nc          180      15
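
Before moving on, a quick sanity check (a sketch using the assertr package, which is loaded above) that every variable really does have all 15 years × 12 months of files:

```{r}
# Sanity-check sketch: stop early if any variable is missing monthly files.
# verify() passes the data through when the condition holds and errors otherwise.
df_files |>
  count(variable, name = 'n_months') |>
  assertr::verify(n_months == 15 * 12)
```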

### Metadata check

Code
nc_example = df_files |> 
  filter(variable == 'total_precipitation.nc') |> 
  slice(1) |> 
  pull(path)
nc_example
[1] "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw/era5-conus__2009_1__total_precipitation.nc"
Code
nc1 =  nc_example |> 
  nc_open()
nc1
File //files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw/era5-conus__2009_1__total_precipitation.nc (NC_FORMAT_NETCDF4):

     3 variables (excluding dimension variables):
        8 byte int number[]   (Contiguous storage)  
            long_name: ensemble member numerical id
            units: 1
            standard_name: realization
        string expver[valid_time]   (Contiguous storage)  
        float tp[longitude,latitude,valid_time]   (Chunking: [156,73,186])  (Compression: shuffle,level 1)
[1] ">>>> WARNING <<<  attribute GRIB_paramId is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_numberOfPoints is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_stepUnits is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_uvRelativeToGrid is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_NV is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_Nx is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_Ny is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_iScansNegatively is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_jPointsAreConsecutive is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_jScansPositively is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<<  attribute GRIB_totalNumber is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
            _FillValue: NaN
            GRIB_paramId: 228
            GRIB_dataType: fc
            GRIB_numberOfPoints: 180711
            GRIB_typeOfLevel: surface
            GRIB_stepUnits: 1
            GRIB_stepType: accum
            GRIB_gridType: regular_ll
            GRIB_uvRelativeToGrid: 0
            GRIB_NV: 0
            GRIB_Nx: 621
            GRIB_Ny: 291
            GRIB_cfName: unknown
            GRIB_cfVarName: tp
            GRIB_gridDefinitionDescription: Latitude/Longitude Grid
            GRIB_iDirectionIncrementInDegrees: 0.1
            GRIB_iScansNegatively: 0
            GRIB_jDirectionIncrementInDegrees: 0.1
            GRIB_jPointsAreConsecutive: 0
            GRIB_jScansPositively: 0
            GRIB_latitudeOfFirstGridPointInDegrees: 52
            GRIB_latitudeOfLastGridPointInDegrees: 23
            GRIB_longitudeOfFirstGridPointInDegrees: -128
            GRIB_longitudeOfLastGridPointInDegrees: -66
            GRIB_missingValue: 3.40282346638529e+38
            GRIB_name: Total precipitation
            GRIB_shortName: tp
            GRIB_totalNumber: 0
            GRIB_units: m
            long_name: Total precipitation
            units: m
            standard_name: unknown
            GRIB_surface: 0
            coordinates: number valid_time latitude longitude expver

     3 dimensions:
        valid_time  Size:744 
            long_name: time
            standard_name: time
            units: seconds since 1970-01-01
            calendar: proleptic_gregorian
        latitude  Size:291 
            _FillValue: NaN
            units: degrees_north
            standard_name: latitude
            long_name: latitude
            stored_direction: decreasing
        longitude  Size:621 
            _FillValue: NaN
            units: degrees_east
            standard_name: longitude
            long_name: longitude
[1] ">>>> WARNING <<<  attribute GRIB_subCentre is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"

    6 global attributes:
        GRIB_centre: ecmf
        GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts
        GRIB_subCentre: 0
        Conventions: CF-1.7
        institution: European Centre for Medium-Range Weather Forecasts
        history: 2025-02-17T21:58 GRIB to CDM+CF via cfgrib-0.9.15.0/ecCodes-2.39.0 with {"source": "data.grib", "filter_by_keys": {}, "encode_cf": ["parameter", "time", "geography", "vertical"]}

Now let’s examine the details.

Code
names(nc1$var)
[1] "number" "expver" "tp"    

Looks good. The variable we want is tp.

Code
names(nc1$dim)
[1] "valid_time" "latitude"   "longitude" 

Okay, we have cell coordinates (latitude, longitude) plus time.

Code
time_vals <- ncvar_get(nc1, "valid_time")
dates <- as.POSIXct(time_vals, origin="1970-01-01", tz="UTC")

head(dates)
[1] "2009-01-01 00:00:00 UTC" "2009-01-01 01:00:00 UTC"
[3] "2009-01-01 02:00:00 UTC" "2009-01-01 03:00:00 UTC"
[5] "2009-01-01 04:00:00 UTC" "2009-01-01 05:00:00 UTC"
Code
print(length(dates))
[1] 744

This makes sense: January has 31 days × 24 hours = 744 hours.
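
For the zonal statistics step we will read these files with terra rather than ncdf4. Here is a minimal sketch (assuming terra exposes the tp variable of this file as a subdataset) of loading one month as a SpatRaster and confirming it carries the same 744 hourly layers:

```{r}
# Sketch: load one monthly NetCDF as a multi-layer SpatRaster (one layer per hour).
# subds = "tp" assumes terra picks up the tp variable as a subdataset of this file.
r_example <- terra::rast(nc_example, subds = "tp")

terra::nlyr(r_example)        # expect 744 layers for January 2009
head(terra::time(r_example))  # hourly UTC timestamps, matching `dates` above
```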

## Boundaries

Steps:

  • Import: raw boundaries from Census2010_US20230929.gdb
  • Subset: save the states of interest in a format that exactextract can operate on, either (1) geodatabase (gdb) or (2) shapefile (shp)

### Raw Geodatabase

We need the contiguous US tract10 boundaries that have land area greater than zero. We will import them from a geodatabase here:

Code
path_gdb = '//files.drexel.edu/colleges/SOPH/Shared/UHC/ResearchAndData/UHC_Data/Census2010/Geodatabases/Census2010_US20230929.gdb'

dir.exists(path_gdb)
[1] TRUE

Let’s see the feature-classes available.

Code
path_gdb |> sf::st_layers()
Driver: OpenFileGDB 
Available layers:
                      layer_name geometry_type features fields
1 CT_2010_ContUSlandRepairAlbers Multi Polygon    72246     16
2    CT_2010_ContUSland_TimeZone Multi Polygon    72246     20
                                             crs_name
1 USA_Contiguous_Albers_Equal_Area_Conic_USGS_version
2 USA_Contiguous_Albers_Equal_Area_Conic_USGS_version

We will use the CT_2010_ContUSland_TimeZone feature class.

Code
sf_boundaries = path_gdb |> 
  sf::st_read("CT_2010_ContUSland_TimeZone") 
Reading layer `CT_2010_ContUSland_TimeZone' from data source 
  `\\files.drexel.edu\colleges\SOPH\Shared\UHC\ResearchAndData\UHC_Data\Census2010\Geodatabases\Census2010_US20230929.gdb' 
  using driver `OpenFileGDB'
Simple feature collection with 72246 features and 20 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -2356499 ymin: 264001.4 xmax: 2259829 ymax: 3177417
Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic_USGS_version

### Subset

We decided to focus on 16 PEDSNET states for now. Let’s bring in those states.

Code
df_states = read_csv('PEDSNETStates.csv')

Let’s subset the original all tracts object to states of interest.

Code
sf_subset = sf_boundaries |> 
  filter(STATEFP10 %in% df_states$STFIPS)

sf_subset |>
  sf::st_drop_geometry() |> 
  count(STATEFP10)
STATEFP10 n
06 8036
08 1249
10 215
11 179
17 3121
18 1508
21 1115
24 1394
34 2004
39 2946
42 3217
48 5253
51 1895
53 1446
54 484
55 1393

Now we want to save these by state, so that exactextractr has per-state boundary files to work from.

Code
sf_subset |> 
  group_by(STATEFP10) |>
  group_walk(function(.x, .y) {
    ## .y is a one-row tibble holding the grouping key (the state FIPS code)
    folder_tmp = glue('{context$boundaries_server}/shp/tract10_{.y$STATEFP10}')
    dir.create(folder_tmp, showWarnings = F, recursive = T)
    path_tmp = glue('{folder_tmp}/tract10_{.y$STATEFP10}.shp')
    if (!file.exists(path_tmp)) .x |> sf::st_write(path_tmp)
    cli_alert_success('Saved: {path_tmp}')
  })

## Zonal Statistics

### Goals

#### Final Table

The final tables will have these columns.

  1. tract geoid
  2. local time
  3. time zone (linkable by tract10 GEOID; this information comes from the CT_2010_ContUSland_TimeZone feature class in the original GDB), used to convert UTC to local time
  4. variable
  5. value

#### Intermediate Table

The intermediate tables will have these columns; the final table is derived from them by converting UTC date-times to each tract’s local time (see the sketch below).

  1. tract geoid
  2. date-time (UTC)
  3. variable
  4. value
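
The main step between the intermediate and final tables is converting UTC date-times to local time using each tract’s time zone. Below is a minimal sketch, assuming the time-zone column holds IANA names (e.g. America/New_York) and using hypothetical table and column names (df_intermediate, df_tract_tz, tzid):

```{r}
# Hypothetical names: df_intermediate has geoid, datetime_utc, variable, value;
# df_tract_tz has geoid and tzid (IANA time-zone names from the boundary GDB).
# Local time is stored as text because a single POSIXct column cannot carry
# more than one time zone.
df_final <- df_intermediate |>
  dplyr::left_join(df_tract_tz, by = "geoid") |>
  dplyr::group_by(tzid) |>   # with_tz() takes one time zone at a time
  dplyr::mutate(
    datetime_local = format(lubridate::with_tz(datetime_utc, tzone = unique(tzid)),
                            "%Y-%m-%dT%H:%M:%S")
  ) |>
  dplyr::ungroup() |>
  dplyr::select(geoid, datetime_local, tzid, variable, value)
```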

### Setup

There are a few things to set up. One is creating a local cache folder to store results (pre-compiled intermediate tables).

Code
list.files('cache')
[1] "zonal_stats"     "zonal_stats_dev"
  • the zonal_stats folder is for final results
  • the zonal_stats_dev folder is for storing benchmark results; feel free to delete its contents at any time.

Repo note: we will gitignore these results so they don’t get pushed to GitHub.
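
A small convenience sketch (paths are relative to this notebook’s directory, set above with setwd()) to create both cache folders on first run:

```{r}
# Create the two local cache folders if they do not already exist.
for (d in c("cache/zonal_stats", "cache/zonal_stats_dev")) {
  if (!dir.exists(d)) dir.create(d, recursive = TRUE)
}
```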

### Processing Function

Things we can remove:

  • [ ] Added hour parameters because we are processing at the hourly level
```{r}

## Example values for developing the function below interactively
measure_tmp = 'tp'
state_tmp = 'DE'
geo_level_tmp = 'tract10'   # assumed geography label; tract10 is used throughout this notebook
year_tmp = '2010'
month_tmp = '1'
day_tmp = '12'
hour_tmp = '01'
datetimetxt_tmp = glue("{year_tmp}-{month_tmp}-{day_tmp}T{hour_tmp}:00:00Z")
dev = T

## crate() from the carrier package builds a self-contained function that
## captures only the objects listed here (the processing templates).
crate_calc_prism_zonal_exactextract = carrier::crate(
  template_raw = template_raw,
  template = template,
  function(geo_level_tmp, measure_tmp, state_tmp,
           year_tmp, month_tmp, day_tmp, hour_tmp,
           datetimetxt_tmp, dev = FALSE){
    
    { ###### Setup ######
      
      out_folder = 'cache/zonal_stats'
      if (dev == T) out_folder = 'cache/zonal_stats_dev'
      out_dir = base::file.path(
        out_folder,
        glue::glue("measure={measure_tmp}"),
        glue::glue("state={state_tmp}"),
        glue::glue("year={year_tmp}") ,
        glue::glue("month={month_tmp}")  ,
        glue::glue("day={day_tmp}")   ,
        glue::glue("hour={hour_tmp}")    
      )|>
        as.character()
      cli::cli_alert_info("Started: {measure_tmp} {state_tmp} {datetimetxt_tmp}")
      cli::cli_alert_info("Output: {out_dir}")
      
      ## Template contains information for this processing unit: boundary path, raster data path, and potentially other information.
      template_tmp = template
      if (dev == T) template_tmp = template_raw
    
      row = template_tmp |>
        dplyr::filter(
          geo_level == geo_level_tmp,
          measure == measure_tmp,
          year == year_tmp,
          state == state_tmp,
          datetxt == datetimetxt_tmp
        )
      out_file = file.path(out_dir,'data.parquet')
  
    }
    
    { ###### Input Validations ######
      if (file.exists(out_file)){
        cli::cli_alert_info("Already processed: {geo_level_tmp} {measure_tmp} {state_tmp} {year_tmp} {datetimetxt_tmp}")
        return("Already processed")
      }
      
      ## Bail out if the template has no row for this state/measure/date-time
      if(nrow(row) == 0){
        cli::cli_alert_info("No data: {geo_level_tmp} {measure_tmp} {state_tmp} {year_tmp} {datetimetxt_tmp}")
        return("No template rows found")
      }
      }
      
      if (dev == T) print(row)
    }
   
   
    { ###### Computation ######
      
      ## Imports
      r = terra::rast(x = row$r_path_cache)
      z = sf::st_read(row$z_path_cache, quiet = T)

      ## Try .tif raster calculation
      value_results <- tryCatch({
        exactextractr::exact_extract(r, z, 'mean', progress = FALSE)
      }, error = function(e) {
        # If there's an error, try the .bil raster
        r_bil <- terra::rast(x = row$r_path_bil)
        exactextractr::exact_extract(r_bil, z, 'mean', progress = FALSE)
      })
      
      ## If .tif results have NA values then use .bil raster for calculation
      if (any(is.nan(value_results))){
        r = terra::rast(x = row$r_path_bil)
        value_results = exactextractr::exact_extract(
          r, z, 'mean', 
          progress = FALSE )
      }
      
      ## Produce results
      df_results_state_year = terra::as.data.frame(z) |> 
        dplyr::select(state_abb, geoid) |>
        dplyr::mutate(
          date = row$date,
          value = value_results )  |>
        dplyr::select(geoid, date, state_abb, value) 
      
      if (dev == T) print("DONE!!!!")
    }
    
    { ###### Exports ######
      if (!dir.exists(out_dir)) dir.create(out_dir, recursive = T)
      df_results_state_year |> arrow::write_parquet(out_file)
      cli::cli_alert_success("Finished: {measure_tmp} {state_tmp}  {geo_level_tmp} {datetxt_tmp}")
      return()
      
    }
    
  })
 
rm(geo_level_tmp, measure_tmp, state_tmp, year_tmp,
   month_tmp, day_tmp, hour_tmp, datetimetxt_tmp, dev)
```
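
Finally, a hedged sketch of how the crated function could be driven over the processing template in parallel with furrr. The template column names used here (geo_level, measure, state, year, month, day, hour, datetimetxt) are assumptions and must match whatever the upstream template build actually produces.

```{r}
# Hedged usage sketch: fan the crated function out over the template with furrr.
# Template column names below are assumptions, not confirmed by this notebook.
future::plan(future::multisession, workers = 4)

template |>
  dplyr::distinct(geo_level, measure, state, year, month, day, hour, datetimetxt) |>
  furrr::future_pwalk(function(geo_level, measure, state, year, month, day, hour, datetimetxt) {
    crate_calc_prism_zonal_exactextract(
      geo_level_tmp   = geo_level,
      measure_tmp     = measure,
      state_tmp       = state,
      year_tmp        = year,
      month_tmp       = month,
      day_tmp         = day,
      hour_tmp        = hour,
      datetimetxt_tmp = datetimetxt,
      dev             = FALSE
    )
  })
```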