## Dependencies
library(pacman)
pacman::p_load(tidyverse, exactextractr, terra, glue,
               sfarrow, leaflet,
               furrr, sf, purrr, cli, tictoc, assertr, reactable,
               ecmwfr,
               tigris, rmapshaper, geoarrow,
               ncdf4, here)
## Set directory
setwd(here("notebooks/ERA5 zonal statistics"))
## Local objects
context = lst(
  boundaries_server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/boundaries'
)
## Raster Data

### Inventory
Code
server = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw'

df_files = tibble(
  path = list.files(server, full.names = T),
  file = basename(path)
) |>
  select(file, everything()) |>
  mutate(
    split_tmp = str_split(file, pattern = '__'),
    year_string = map_chr(split_tmp, ~.x[2]),
    variable = map_chr(split_tmp, ~.x[3]),
    year = str_sub(year_string, 1, 4) |> parse_number(),
    month = str_sub(year_string, -1L - 1, -1L) |> parse_number()
  ) |>
  select(-split_tmp)
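As a quick sanity check of the filename parsing (a hypothetical example, using the naming pattern of the files on the server):

```r
# Hypothetical example: parse one filename the same way the pipeline above does
example_file = 'era5-conus__2009_1__total_precipitation.nc'
parts = str_split(example_file, pattern = '__')[[1]]
parts[2]                                            # "2009_1" -> year_string
str_sub(parts[2], 1, 4) |> parse_number()           # 2009     -> year
str_sub(parts[2], -1L - 1, -1L) |> parse_number()   # 1        -> month (last two characters)
parts[3]                                            # "total_precipitation.nc" -> variable
```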
Let’s just get a summary of the work we need to do.
Code
df_files |>
  count(variable, name = 'n_months') |>
  mutate(n_years = n_months / 12)
| variable | n_months | n_years |
|---|---|---|
| 10m_u_component_of_wind.nc | 180 | 15 |
| 10m_v_component_of_wind.nc | 180 | 15 |
| snow_depth.nc | 180 | 15 |
| snowfall.nc | 180 | 15 |
| surface_runoff.nc | 180 | 15 |
| total_precipitation.nc | 180 | 15 |
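Since assertr is already loaded, a hedged sketch of a completeness check (every variable should contribute 12 monthly files per year) could look like:

```r
# Sketch: fail loudly if any variable-year is missing monthly files
df_files |>
  count(variable, year, name = 'n_months_in_year') |>
  assert(in_set(12), n_months_in_year)
```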
### Metadata check
Code
nc_example = df_files |>
  filter(variable == 'total_precipitation.nc') |>
  slice(1) |>
  pull(path)

nc_example
[1] "//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw/era5-conus__2009_1__total_precipitation.nc"
Code
nc1 = nc_example |>
  nc_open()

nc1
File //files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH_read_only_access/ERA5Land/ContiguousUS/ByMonth/era5_land__v1/raw/era5-conus__2009_1__total_precipitation.nc (NC_FORMAT_NETCDF4):
3 variables (excluding dimension variables):
8 byte int number[] (Contiguous storage)
long_name: ensemble member numerical id
units: 1
standard_name: realization
string expver[valid_time] (Contiguous storage)
float tp[longitude,latitude,valid_time] (Chunking: [156,73,186]) (Compression: shuffle,level 1)
[1] ">>>> WARNING <<< attribute GRIB_paramId is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_numberOfPoints is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_stepUnits is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_uvRelativeToGrid is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_NV is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_Nx is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_Ny is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_iScansNegatively is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_jPointsAreConsecutive is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_jScansPositively is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
[1] ">>>> WARNING <<< attribute GRIB_totalNumber is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
_FillValue: NaN
GRIB_paramId: 228
GRIB_dataType: fc
GRIB_numberOfPoints: 180711
GRIB_typeOfLevel: surface
GRIB_stepUnits: 1
GRIB_stepType: accum
GRIB_gridType: regular_ll
GRIB_uvRelativeToGrid: 0
GRIB_NV: 0
GRIB_Nx: 621
GRIB_Ny: 291
GRIB_cfName: unknown
GRIB_cfVarName: tp
GRIB_gridDefinitionDescription: Latitude/Longitude Grid
GRIB_iDirectionIncrementInDegrees: 0.1
GRIB_iScansNegatively: 0
GRIB_jDirectionIncrementInDegrees: 0.1
GRIB_jPointsAreConsecutive: 0
GRIB_jScansPositively: 0
GRIB_latitudeOfFirstGridPointInDegrees: 52
GRIB_latitudeOfLastGridPointInDegrees: 23
GRIB_longitudeOfFirstGridPointInDegrees: -128
GRIB_longitudeOfLastGridPointInDegrees: -66
GRIB_missingValue: 3.40282346638529e+38
GRIB_name: Total precipitation
GRIB_shortName: tp
GRIB_totalNumber: 0
GRIB_units: m
long_name: Total precipitation
units: m
standard_name: unknown
GRIB_surface: 0
coordinates: number valid_time latitude longitude expver
3 dimensions:
valid_time Size:744
long_name: time
standard_name: time
units: seconds since 1970-01-01
calendar: proleptic_gregorian
latitude Size:291
_FillValue: NaN
units: degrees_north
standard_name: latitude
long_name: latitude
stored_direction: decreasing
longitude Size:621
_FillValue: NaN
units: degrees_east
standard_name: longitude
long_name: longitude
[1] ">>>> WARNING <<< attribute GRIB_subCentre is an 8-byte value, but R"
[1] "does not support this data type. I am returning a double precision"
[1] "floating point, but you must be aware that this could lose precision!"
6 global attributes:
GRIB_centre: ecmf
GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts
GRIB_subCentre: 0
Conventions: CF-1.7
institution: European Centre for Medium-Range Weather Forecasts
history: 2025-02-17T21:58 GRIB to CDM+CF via cfgrib-0.9.15.0/ecCodes-2.39.0 with {"source": "data.grib", "filter_by_keys": {}, "encode_cf": ["parameter", "time", "geography", "vertical"]}
Now let’s examine the details.
Code
names(nc1$var)
[1] "number" "expver" "tp"
Looks good; `tp` is the variable we want.
Code
names(nc1$dim)
[1] "valid_time" "latitude" "longitude"
Okay, we have cell coordinates (latitude, longitude) plus time.
Code
time_vals <- ncvar_get(nc1, "valid_time")
dates <- as.POSIXct(time_vals, origin = "1970-01-01", tz = "UTC")

head(dates)
head(dates)
[1] "2009-01-01 00:00:00 UTC" "2009-01-01 01:00:00 UTC"
[3] "2009-01-01 02:00:00 UTC" "2009-01-01 03:00:00 UTC"
[5] "2009-01-01 04:00:00 UTC" "2009-01-01 05:00:00 UTC"
Code
print(length(dates))
[1] 744
This makes sense: January has 31 × 24 = 744 hours.
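Before wiring this into exactextractr, it is worth confirming that terra sees the same grid as the metadata above. A minimal sketch, assuming the GDAL NetCDF driver is available and that `tp` is the subdataset name:

```r
# Sketch: read the tp subdataset with terra and compare against the nc_open() metadata
r_tp = terra::rast(nc_example, subds = 'tp')
terra::nlyr(r_tp)   # expect 744 hourly layers for January 2009
terra::res(r_tp)    # expect 0.1 x 0.1 degrees (GRIB_*DirectionIncrementInDegrees)
terra::ext(r_tp)    # expect roughly -128 to -66 longitude, 23 to 52 latitude
```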
## Boundaries
Steps:

- Imported: raw boundaries from Census2010_US20230929.gdb
- Subset: we will save the subset in formats that exactextract can operate on: 1. geodatabase (gdb), 2. shapefile (shp)
### Raw Geodatabase
We need the contiguous-US tract10 boundaries that have land area greater than zero. We will import them from a geodatabase here:
Code
path_gdb = '//files.drexel.edu/colleges/SOPH/Shared/UHC/ResearchAndData/UHC_Data/Census2010/Geodatabases/Census2010_US20230929.gdb'
dir.exists(path_gdb)
[1] TRUE
Let’s see the feature classes available.
Code
path_gdb |> sf::st_layers()
Driver: OpenFileGDB
Available layers:
layer_name geometry_type features fields
1 CT_2010_ContUSlandRepairAlbers Multi Polygon 72246 16
2 CT_2010_ContUSland_TimeZone Multi Polygon 72246 20
crs_name
1 USA_Contiguous_Albers_Equal_Area_Conic_USGS_version
2 USA_Contiguous_Albers_Equal_Area_Conic_USGS_version
We will use the CT_2010_ContUSland_TimeZone
feature class.
Code
sf_boundaries = path_gdb |>
  sf::st_read("CT_2010_ContUSland_TimeZone")
Reading layer `CT_2010_ContUSland_TimeZone' from data source
`\\files.drexel.edu\colleges\SOPH\Shared\UHC\ResearchAndData\UHC_Data\Census2010\Geodatabases\Census2010_US20230929.gdb'
using driver `OpenFileGDB'
Simple feature collection with 72246 features and 20 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -2356499 ymin: 264001.4 xmax: 2259829 ymax: 3177417
Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic_USGS_version
### Subset
We decided to focus on 16 PEDSNET states for now. Let’s bring in those states.
Code
df_states = read_csv('PEDSNETStates.csv')
Let’s subset the original all-tracts object to the states of interest.
Code
sf_subset = sf_boundaries |>
  filter(STATEFP10 %in% df_states$STFIPS)

sf_subset |>
  sf::st_drop_geometry() |>
  count(STATEFP10)
| STATEFP10 | n |
|---|---|
| 06 | 8036 |
| 08 | 1249 |
| 10 | 215 |
| 11 | 179 |
| 17 | 3121 |
| 18 | 1508 |
| 21 | 1115 |
| 24 | 1394 |
| 34 | 2004 |
| 39 | 2946 |
| 42 | 3217 |
| 48 | 5253 |
| 51 | 1895 |
| 53 | 1446 |
| 54 | 484 |
| 55 | 1393 |
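As a quick, hedged sanity check that every requested state FIPS code matched tracts in the boundary file:

```r
# Sketch: any PEDSnet state FIPS codes with no matching tracts? Expect an empty result.
setdiff(df_states$STFIPS, unique(sf_subset$STATEFP10))
```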
Now we want to save these by state somewhere that exactextract can work from.
Code
sf_subset |>
  group_by(STATEFP10) |>
  group_walk(function(.x, .y) {
    ## Write one shapefile per state; skip states that were already written
    folder_tmp = glue('{context$boundaries_server}/shp/tract10_{unique(.y)}')
    dir.create(folder_tmp, showWarnings = F)
    path_tmp = glue('{folder_tmp}/tract10_{unique(.y)}.shp')
    if (!file.exists(path_tmp)) .x |> sf::st_write(path_tmp)
    cli_alert_success('saving to {path_tmp}')
  })
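One thing to keep in mind before extraction: the boundaries are in the Albers CRS shown above, while the ERA5 grid is a regular latitude/longitude grid. A hedged sketch of explicitly aligning the two (assuming the NetCDF's degrees_north/degrees_east coordinates correspond to EPSG:4326, and reusing the `r_tp` raster from the earlier sketch):

```r
# Sketch: reproject the tract boundaries to the ERA5 lat/long grid before zonal extraction
# (EPSG:4326 is an assumption for the NetCDF's geographic coordinates)
sf_subset_ll = sf_subset |> sf::st_transform(4326)

# Illustrative zonal means for the first hourly layer only
exactextractr::exact_extract(r_tp[[1]], sf_subset_ll, 'mean', progress = FALSE) |> head()
```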
## Zonal Statistics

### Goals

#### Final Table
The final tables will have these columns.
- tract geoid
- local time
- time zone (?): this should come from the boundary file (linkable by tract10 geoid; the info is in the original GDB; see the sketch after the Intermediate Table list)
- variable
- value
#### Intermediate Table
The intermediate tables will have these columns.
- tract geoid
- date-time (UTC)
- variable
- value
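As referenced above, here is a hedged sketch of how the intermediate (UTC) table could be rolled into the final local-time table, assuming hypothetical objects `df_intermediate` (the intermediate table) and `df_tract_tz` (a geoid-to-time-zone lookup pulled from the boundary GDB), with assumed column names `tzid` and `datetime_utc`:

```r
# Sketch: join a per-tract time zone and convert UTC timestamps to local time
# df_intermediate, df_tract_tz, tzid, and datetime_utc are hypothetical names
df_final = df_intermediate |>
  left_join(df_tract_tz, by = 'geoid') |>
  group_by(tzid) |>
  mutate(datetime_local = lubridate::with_tz(datetime_utc, tzone = tzid[1])) |>
  ungroup() |>
  select(geoid, datetime_local, tzid, variable, value)
```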
### Setup
There are a few things to set up. One is creating a local `cache` folder to store results (pre-compiled intermediate tables).
Code
list.files('cache')
[1] "zonal_stats" "zonal_stats_dev"
- the `zonal_stats` folder is for final results
- the `zonal_stats_dev` folder is for storing benchmark results; feel free to delete its contents at any time.

Repo note: we will gitignore these results so they don’t get pushed to GitHub.
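One way to set that up (a sketch assuming the usethis package is available; it is not in the p_load list above, and the paths may need adjusting relative to the repo root):

```r
# Sketch: keep cached zonal-statistics outputs out of version control
usethis::use_git_ignore(c('cache/zonal_stats/', 'cache/zonal_stats_dev/'))
```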
### Processing Function
Things we can remove:

- [ ] Added hour because we are processing at the hour level
```{r}
## Dev values for interactive testing of the function body below
measure_tmp = 'tp'
state_tmp = 'DE'
geo_level_tmp = 'tract10'   # referenced in the body and rm(); value assumed for dev testing
year_tmp = '2010'
month_tmp = '1'
day_tmp = '12'
hour_tmp = '01'
datetxt_tmp = glue("{year_tmp}-{month_tmp}-{day_tmp}")   # date-only label; format assumed
datetimetxt_tmp = glue("{year_tmp}-{month_tmp}-{day_tmp}T{hour_tmp}:00:00Z")
dev = T
crate_calc_prism_zonal_exactextract = crate(
template_raw = template_raw,
template = template,
  function(measure_tmp, state_tmp, geo_level_tmp,   # these are referenced in the body; added to the signature
           year_tmp, month_tmp, day_tmp, hour_tmp,
           datetimetxt_tmp, datetxt_tmp,
           dev = FALSE){
{ ###### Setup ######
out_folder = 'cache/zonal_stats'
if (dev == T) out_folder = 'cache/zonal_stats_dev'
out_dir = base::file.path(
out_folder,
glue::glue("measure={measure_tmp}"),
glue::glue("state={state_tmp}"),
glue::glue("year={year_tmp}") ,
glue::glue("month={month_tmp}") ,
glue::glue("day={day_tmp}") ,
glue::glue("hour={hour_tmp}")
)|>
as.character()
cli::cli_alert_info("Started: {measure_tmp} {state_tmp} {datetimetxt_tmp}")
cli::cli_alert_info("Output: {out_dir}")
    ## Template contains information for this processing unit: boundary path, raster data path, and potentially other information.
template_tmp = template
if (dev == T) template_tmp = template_raw
row = template_tmp |>
dplyr::filter(
geo_level == geo_level_tmp,
measure == measure_tmp,
year == year_tmp,
state == state_tmp,
datetxt == datetxt_tmp
)
out_file = file.path(out_dir,'data.parquet')
}
{ ###### Input Validations ######
if (file.exists(out_file)){
cli::cli_alert_info("Already processed: {geo_level_tmp} {measure_tmp} {state_tmp} {year_tmp} {datetxt_tmp}")
return("Already processed")
}
## Iterate over state,year
if(nrow(row) == 0){
cli::cli_alert_info("No data: {geo_level_tmp} {measure_tmp} {state_tmp} {year_tmp} {datetxt_tmp}")
return("No template rows found")
}
if (dev == T) print(row)
}
{ ###### Computation ######
## Imports
r = terra::rast(x = row$r_path_cache)
z = sf::st_read(row$z_path_cache, quiet = T)
## Try .tif raster calculation
value_results <- tryCatch({
exactextractr::exact_extract(r, z, 'mean', progress = FALSE)
}, error = function(e) {
# If there's an error, try the .bil raster
r_bil <- terra::rast(x = row$r_path_bil)
exactextractr::exact_extract(r_bil, z, 'mean', progress = FALSE)
})
## If .tif results have NA values then use .bil raster for calculation
if (any(is.nan(value_results))){
r = terra::rast(x = row$r_path_bil)
value_results = exactextractr::exact_extract(
r, z, 'mean',
progress = FALSE )
}
## Produce results
df_results_state_year = terra::as.data.frame(z) |>
dplyr::select(state_abb, geoid) |>
dplyr::mutate(
date = row$date,
value = value_results ) |>
dplyr::select(geoid, date, state_abb, value)
if (dev == T) print("DONE!!!!")
}
{ ###### Exports ######
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = T)
df_results_state_year |> arrow::write_parquet(out_file)
cli::cli_alert_success("Finished: {measure_tmp} {state_tmp} {geo_level_tmp} {datetxt_tmp}")
return()
}
})
rm(geo_level_tmp, measure_tmp, state_tmp, month_tmp, day_tmp, hour_tmp,
   datetxt_tmp, datetimetxt_tmp, year_tmp, dev)
```
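Since the processing function is crated (so it can be shipped to parallel workers) and furrr is already loaded, here is a hedged sketch of how it might be driven over the template rows; the column names in `distinct()` and the anonymous function are assumptions about the template, not confirmed by this notebook:

```r
# Sketch: drive the crated zonal-statistics function across template rows with furrr.
# The template column names used below are assumptions.
future::plan(future::multisession, workers = 4)

template |>
  dplyr::distinct(geo_level, measure, state, year, month, day, hour,
                  datetimetxt, datetxt) |>
  furrr::future_pwalk(function(geo_level, measure, state, year, month, day, hour,
                               datetimetxt, datetxt) {
    crate_calc_prism_zonal_exactextract(
      measure_tmp = measure, state_tmp = state, geo_level_tmp = geo_level,
      year_tmp = year, month_tmp = month, day_tmp = day, hour_tmp = hour,
      datetimetxt_tmp = datetimetxt, datetxt_tmp = datetxt,
      dev = FALSE)
  }, .options = furrr::furrr_options(seed = TRUE))
```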