2 - PRISM Benchmarks

Benchmark parallel Terra workflow for PRISM data.

Author

Ran Li

Published

April 9, 2024

Setup Code

## Dependencies
library(pacman) 
pacman::p_load(tidyverse, tictoc, furrr, carrier, glue, 
               terra, sfarrow, leaflet, sf, haven, ggpubr)

## Import Pipeline Objects
source("../0 - Pipeline objects.R")

## Sample unit of work: one tract, one day and one measure, plus the file
## locations of the tract boundaries and of the PRISM raster (source copy and
## locally cached copy). lst() lets later entries build on earlier ones, which
## is how `date` is derived from `datetxt` and `year` from `date`.
## NOTE(review): sam_tiger_lake and prism_ccuh_lake_raw are presumably defined
## by the sourced pipeline-objects script -- confirm.
sample <- lst(
  tract = "42101000500",
  datetxt = "01/01/2019",
  date = as.Date(datetxt, format = "%m/%d/%Y"),
  year = lubridate::year(date),
  measure = "tdmean",
  boundaries_SpatVector = file.path(
    sam_tiger_lake,
    "tl_2010_PA_tract10_albers"
  ),
  prism_SpatRaster = file.path(
    prism_ccuh_lake_raw,
    "prism_tdmean_us_30s_20190101.tif"
  ),
  prism_cached_SpatRaster = "raw/prism_tdmean_us_30s_20190101.tif"
)

This aims to reproduce the logic in TractZonalStatisticsDailyUSforSAM.pynb with Terra. For a single day and measure, the process took around 17 minutes in the notebook. Here we want to get a benchmark in Terra.

The general workflow is to:

import tract boundary
import PRISM raster
calculate zonal statistic for each tract unit
save results

Let's first import some existing results as a reference point.

Reference results (ESRI)

We have preprocessed the ESRI results from a bunch of .sas7bdat files into a distributed database. Let's pull the results for our sample: tract 42101000500, tdmean, 01/01/2019.

Pull sample reference results

## Open the reference results as an Arrow dataset
db_reference <- arrow::open_dataset(prism_ccuh_hive_reference)

## Reference results for the sample measure, year and day. The filter is
## applied to the dataset before collect() brings the rows into memory.
df_reference <- db_reference %>%
  filter(
    measure == sample$measure,
    year == sample$year,
    DATETXT == sample$datetxt
  ) %>%
  collect() %>%
  select(
    GEOID10,
    measure = MEASURE,
    date = DATEVAL,
    value_esri = VALUE
  )

## Keep only the sample tract
df_sample_reference <- filter(df_reference, GEOID10 == sample$tract)

## Display sample reference results
df_sample_reference

GEOID10	measure	date	value_esri
42101000500	tdmean	2019-01-01	7.329277

Reproduce Zonal stat in Terra

Import Raster (PRISM - SpatRast)

Okay. We now have our tract boundaries. Let's grab the PRISM data! We have already pre-processed all PRISM data into standardized .tif files for performance and to ensure correct projections. Let's pull this from the data lake.

Import PRISM SpatRaster

## Cache the PRISM raster locally (only when it is not cached yet).
## The cache directory is created first because file.copy() does not create
## missing directories, and the copy result is checked because file.copy()
## reports failure through its return value rather than by raising an error.
if (!file.exists(sample$prism_cached_SpatRaster)) {
  dir.create(
    dirname(sample$prism_cached_SpatRaster),
    recursive = TRUE,
    showWarnings = FALSE
  )
  tic()
  copy_ok <- file.copy(sample$prism_SpatRaster, sample$prism_cached_SpatRaster)
  toc()
  if (!copy_ok) {
    stop(
      "Could not cache PRISM raster from ", sample$prism_SpatRaster,
      " to ", sample$prism_cached_SpatRaster,
      call. = FALSE
    )
  }
}

## Import the cached raster as a SpatRaster
SpatRaster_prism <- terra::rast(sample$prism_cached_SpatRaster)

## Display
SpatRaster_prism

class       : SpatRaster 
dimensions  : 4269, 7920, 1  (nrow, ncol, nlyr)
resolution  : 751.2838, 751.2838  (x, y)
extent      : -2950369, 2999799, 115807.3, 3323038  (xmin, xmax, ymin, ymax)
coord. ref. : NAD83 / Conus Albers (EPSG:5070) 
source      : prism_tdmean_us_30s_20190101.tif 
name        : prism_tdmean_us_30s_20190101 
min value   :                    -35.98596 
max value   :                     21.94787

Import Zones (TigerLine - SpatVector)

Let's try to reproduce this logic with a single tract, “City Hall” = )

Import tract

## Read the PA tract boundaries from disk as a SpatVector
SpatVector_tract_PA <- terra::vect(sample$boundaries_SpatVector)

## Row positions of the sample tract, then the matching subset
target_indices <- which(SpatVector_tract_PA$GEOID10 == sample$tract)
SpatVector_sample_boundaries <- SpatVector_tract_PA[target_indices, ]

## sf copy of the sample tract, transformed to EPSG:4326 for mapping
sf_sample_boundaries <- st_transform(
  st_as_sf(SpatVector_sample_boundaries),
  4326
)

## Map to QC
leaflet(sf_sample_boundaries) %>%
  addTiles() %>%
  addPolygons()

Calculate Zonal Stat for a single tract

Okay, now let's calculate the zonal statistic for this tract-variable.

Calculate zonal stat

## Mean raster value over the sample tract. The zonal() result is tabular,
## so pull(1) extracts its first column as a plain vector.
zonal_stats <- SpatRaster_prism %>%
  terra::zonal(
    SpatVector_sample_boundaries,
    fun = "mean",
    na.rm = TRUE
  ) %>%
  pull(1)

## One-row result labelled with the sample tract, measure and date
df_sample_results <- tibble(
  GEOID10 = sample$tract,
  measure = sample$measure,
  date = sample$date,
  value_terra = zonal_stats
)

## Compare the Terra value with the ESRI reference value side by side
df_sample_results %>%
  left_join(
    df_sample_reference,
    by = c("GEOID10", "measure", "date")
  )

GEOID10	measure	date	value_terra	value_esri
42101000500	tdmean	2019-01-01	7.32197	7.329277

Great! So now we can see the zonal statistic for this tract. It is slightly different from the ESRI value, but for temperature, being off by less than 0.1 degrees seems okay. We will review this QC over a whole run … see the next section.

Try a whole state (PA)

Calculate zonal stat

## Calculate the zonal mean for every PA tract. The raster read stays inside
## the timed region so the timing covers both the read and the zonal step.
tic()
df_zonal_stats_PA <- terra::zonal(
  # Use the cached raster path from the sample configuration instead of
  # repeating the file name as a literal, so the two cannot drift apart.
  terra::rast(sample$prism_cached_SpatRaster),
  SpatVector_tract_PA,
  fun = "mean",
  na.rm = TRUE
) %>%
  dplyr::rename("value" = 1)
toc()

7.09 sec elapsed

Calculate zonal stat

## Label the PA zonal values with tract id, measure and date.
## bind_cols() pairs rows by position, so this relies on the zonal output
## having one row per tract in the same order as SpatVector_tract_PA.
df_results_terra_PA <- bind_cols(
  SpatVector_tract_PA %>%
    as.data.frame() %>%
    as_tibble() %>%
    select(GEOID10) %>%
    mutate(measure = sample$measure, date = sample$date),
  df_zonal_stats_PA
)

head(df_results_terra_PA)

GEOID10	measure	date	value
42003560500	tdmean	2019-01-01	7.104099
42003560400	tdmean	2019-01-01	7.127940
42003552400	tdmean	2019-01-01	7.067830
42003552300	tdmean	2019-01-01	7.072888
42003552200	tdmean	2019-01-01	7.199420
42003552100	tdmean	2019-01-01	7.067318

That was quick: for the 3000ish tracts in PA, the computations took only a few seconds (about 7 seconds in the run shown above).

QC (PA)

Let's take a look at how our PA results correlate with the old results.

QC PA results with ESRI results

## Op. QC data structures: ESRI values keyed by tract, joined onto the
## Terra results for PA
xwalk_esri_pa_results <- select(df_reference, GEOID10, value_esri)
df_qc_pa <- df_results_terra_PA %>%
  rename(value_terra = value) %>%
  left_join(xwalk_esri_pa_results, by = "GEOID10")

## Correlation between the two sets of values, using only rows where both
## are present
correlation_coefficient <- cor(
  df_qc_pa$value_terra,
  df_qc_pa$value_esri,
  use = "complete.obs"
)

## Plot: scatter of Terra vs ESRI values with a fitted line and the
## correlation printed on the panel
ggplot(df_qc_pa, aes(x = value_terra, y = value_esri)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  stat_cor(method = "pearson", label.x = 3) +
  labs(
    title = "PA/tdmean/2019-01-01: 3218 data points",
    subtitle = paste(
      "Correlation coefficient:",
      round(correlation_coefficient, 6)
    ),
    x = "Value Terra",
    y = "Value Esri"
  ) +
  theme_minimal()

We can see that the correlation coefficient is 0.9998. So we can see that the values are pretty much the same. Passes the initial smell test and could be passed to Steve and team for further validation.

Reproduce for a whole unit (variable, date)

Here's the big test now. Let's try the benchmark for a whole unit of computation: for every tract in the contiguous US, calculate zonal stats for a single measure (tdmean) and a single day (01/01/2019).

To the PA code we need to add one more layer of complexity to loop through all states. Let's turn the state-specific computation into a function, then just loop!

Function

Parallel runs are a little different because each function runs in its own environment. While global object detection usually works, we had some trouble, so let's use a crated function to explicitly define our dependencies and prevent issues in parallel processing. A crated function differs in a few ways:

name scoping of all dependencies
explicitly defining object dependencies
SpatRaster and SpatVector objects are not able to be used in Parallelization (due to C++ stuff) so we need to import from disk during loops

So this function will work if the user has access to the SAM un-encrypted server where we are staging some pre-processed boundaries.

Setup Function for state specific zonal calculation

## State-level zonal statistics as a crated function.
##
## The crate bundles the two objects the function reads (`sam_tiger_lake` and
## `sample`), and every non-base call inside is namespace-qualified, so the
## function does not depend on objects or attached packages in the calling
## session.
##
## Argument: state_tmp -- state code used to build the boundary file name
##   "tl_2010_{state_tmp}_tract10_albers" (e.g. 'PA').
## Returns: a tibble with one row per tract and the columns GEOID10, measure,
##   date and value (the zonal mean).
crate_calc_zonal_state <- crate(
  function(state_tmp) {

    ## 1. Make sure the raster is cached locally, then open it from disk
    cached_raster <- sample$prism_cached_SpatRaster
    if (!file.exists(cached_raster)) {
      file.copy(sample$prism_SpatRaster, cached_raster)
    }
    prism_raster <- terra::rast(cached_raster)

    ## 2. Tract boundaries for the requested state, read from disk
    boundary_name <- glue::glue("tl_2010_{state_tmp}_tract10_albers")
    state_zones <- terra::vect(file.path(sam_tiger_lake, boundary_name))

    ## 3. Mean raster value per tract, with the result column named `value`
    zonal_values <- dplyr::rename(
      terra::zonal(prism_raster, state_zones, fun = "mean", na.rm = TRUE),
      "value" = 1
    )

    ## 4. Tract ids labelled with measure and date, bound column-wise (by row
    ##    position) to the zonal values
    tract_ids <- dplyr::select(
      tibble::as_tibble(terra::as.data.frame(state_zones)),
      GEOID10
    )
    tract_labels <- dplyr::mutate(
      tract_ids,
      measure = sample$measure,
      date = sample$date
    )
    dplyr::bind_cols(tract_labels, zonal_values)
  },
  sam_tiger_lake = sam_tiger_lake,
  sample = sample
)

## Test
head(crate_calc_zonal_state("PA"))

GEOID10	measure	date	value
42003560500	tdmean	2019-01-01	7.104099
42003560400	tdmean	2019-01-01	7.127940
42003552400	tdmean	2019-01-01	7.067830
42003552300	tdmean	2019-01-01	7.072888
42003552200	tdmean	2019-01-01	7.199420
42003552100	tdmean	2019-01-01	7.067318

Benchmark

Legacy benchmark: 17 minutes
Terra benchmarks: UHC Workstation (max 6 cores)
- 1 core: 2.8 minutes
- 2 core: 1.5 minutes
- 4 cores: 1 minute
- 6 cores: minutes

1 core

So let's do this for a measure, a day, and all tracts. Essentially, we just loop through the state-specific function.

Benchmark: 1 core

## Configure cores: a single background worker.
## globals = FALSE turns off automatic global detection; the crated function
## already carries the objects it needs.
plan(multisession, workers = 1)
opts <- furrr_options(globals = FALSE, seed = TRUE)

## Perform benchmark: run the state function over all contiguous states and
## stack the per-state tibbles; the combined result is printed, not stored
tic()
bind_rows(
  furrr::future_map(
    .x = contiguous_states,
    .f = crate_calc_zonal_state,
    .options = opts
  )
)
toc()

2 cores

Benchmark: 2 cores

## Configure cores: two background workers.
## globals = FALSE turns off automatic global detection; the crated function
## already carries the objects it needs.
plan(multisession, workers = 2)
opts <- furrr_options(globals = FALSE, seed = TRUE)

## Perform benchmark: run the state function over all contiguous states and
## keep the stacked per-state results
tic()
df_measure_day_results_2_core <- bind_rows(
  furrr::future_map(
    .x = contiguous_states,
    .f = crate_calc_zonal_state,
    .options = opts
  )
)
toc()

4 cores

Benchmark: 4 Cores

## Configure cores: four background workers.
## globals = FALSE turns off automatic global detection; the crated function
## already carries the objects it needs.
plan(multisession, workers = 4)
opts <- furrr_options(globals = FALSE, seed = TRUE)

## Perform benchmark: the result is kept as a list with one tibble per state
## (it is not row-bound here)
tic()
df_measure_day_results_4_core <- furrr::future_map(
  .x = contiguous_states,
  .f = crate_calc_zonal_state,
  .options = opts
)
toc()

107.58 sec elapsed

6 cores

Benchmark: 6 Cores

## Configure cores: six background workers.
## globals = FALSE turns off automatic global detection; the crated function
## already carries the objects it needs.
plan(multisession, workers = 6)
opts <- furrr_options(globals = FALSE, seed = TRUE)

## Perform benchmark: the list of per-state tibbles is printed, not stored
tic()
furrr::future_map(
  .x = contiguous_states,
  .f = crate_calc_zonal_state,
  .options = opts
)
toc()

8 cores

Benchmark: 8 Cores

## Configure cores: eight background workers.
## globals = FALSE turns off automatic global detection; the crated function
## already carries the objects it needs.
plan(multisession, workers = 8)
opts <- furrr_options(globals = FALSE, seed = TRUE)

## Perform benchmark: the list of per-state tibbles is printed, not stored
tic()
furrr::future_map(
  .x = contiguous_states,
  .f = crate_calc_zonal_state,
  .options = opts
)
toc()

QC (US results)

For this computational block of ‘tdmean’, ‘01/01/2019’ and all tracts in the contiguous US, let's validate the results against the original results.

Code

## Combine data
## ESRI reference values for the sample day and measure, keyed by tract
xwalk_esri_results <- df_reference %>%
  filter(
    date == sample$date,
    measure == sample$measure
  ) %>%
  select(GEOID10, value_esri)

## Stack the per-state results of the 4-core run and attach the ESRI values.
## The join uses the crosswalk built just above for this US-wide check, not
## the one created earlier for the PA check.
df_qc_us <- df_measure_day_results_4_core %>%
  bind_rows() %>%
  rename(value_terra = value) %>%
  left_join(xwalk_esri_results, by = "GEOID10")

# head(df_qc_us)


## Get correlation, using only rows where both values are present
correlation_coefficient <- cor(
  df_qc_us$value_terra,
  df_qc_us$value_esri,
  use = "complete.obs"
)

## Plot QC: scatter of Terra vs ESRI values with a fitted line.
## The title names the measure that was actually processed (tdmean).
ggplot(df_qc_us, aes(x = value_terra, y = value_esri)) +
  geom_point(alpha = 0.6) +  # Scatter plot
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Fitted line
  theme_minimal() +  # Use a minimal theme for a nice look
  labs(
    title = "USA/tdmean/2019-01-01: 83571 data points",
    subtitle = paste(
      "Correlation coefficient:",
      round(correlation_coefficient, 6)
    ),
    x = "Value Terra",
    y = "Value Esri"
  ) +
  stat_cor(method = "pearson", label.x = 3)  # Add correlation text to the panel

Summary

Clearly we see huge performance improvements - the 17 minute process in the legacy code is now down to 1 minute. Proposal for putting this into production:

Production function to accept arguments (measure, day, geographic-level) for looping
Store results as Hive Partitioned similar to reference results
Protocol for accessing partitioned PRISM zonal stats database
Create a flexible function for custom geographies (input is an sf object)