## Dependencies
library(pacman)
::p_load(
pacman
dplyr, tidyr, stringr, lubridate, arrow, purrr, cli, glue, reactable,
skimr, readxl, xlsx, jsonlite,duckdb,
haven)
## Local context
= lst(
local_context ## Identifiers
model_id = 'source_pennsylvania_records_death',
## Processing
path_ccuh = '//files.drexel.edu/encrypted/SOPH/UHC/CCUH',
path_freeze = file.path(path_ccuh, 'ccuh-freeze',model_id),
path_raw = file.path(path_freeze, 'raw'),
path_cache = file.path(path_freeze, 'cache'),
## DBT
path_dbt = file.path(path_ccuh,'dbt'),
path_dbt_source_model = file.path(path_dbt, 'source', paste0(model_id,'.parquet'))
)
source_pennsylvania_records_death_v1
1. Input Files
Freeze
Here we freeze a copy of the originla datasets to maintain long term reproducibility (in case of upstream edits). The original files were frozen on 9-18-2024:
- we freeze 2006-2015 years of data from RDC (these years due to priority, we will get to the rest later) to
\\files.drexel.edu\encrypted\SOPH\UHC\CCUH\ccuh-freeze\source_pennsylvania_records_death\raw
- ‘deathYYYY.sas7bdat’ were individual files copied over from
\\files.drexel.edu\encrypted\SOPH\UHC\PA_Birth_Death\Death_Records\SAS\IndividualData
- ‘deathYYYY_dsph.sas7bdat’ were geocoded files copied over from
\\files.drexel.edu\encrypted\SOPH\UHC\PA_Birth_Death\Death_Records\SAS\geocodes
Lets just take a look at the raw frozen files for ingestion.
## Inventory of raw frozen files
= tibble(
df_files raw_file = list.files(local_context$path_raw,
full.names = T,
pattern = '.sas7bdat') ) %>%
mutate(
file = basename(raw_file) %>% str_remove('.sas7bdat'),
type = ifelse(str_detect(file, "_dsph"),
"geocoded","individual" ),
year = str_extract(file, "\\d{4}"),
freeze_file = file.path(
$path_freeze,
local_contextpaste0(file,'.parquet')
)%>%
) arrange(year) %>%
select(year, type, file, raw_file, freeze_file)
df_files
year | type | file | raw_file | freeze_file |
---|---|---|---|---|
2006 | individual | death2006 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2006.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2006.parquet |
2006 | geocoded | death2006_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2006_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2006_dsph.parquet |
2007 | individual | death2007 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2007.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2007.parquet |
2007 | geocoded | death2007_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2007_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2007_dsph.parquet |
2008 | individual | death2008 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2008.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2008.parquet |
2008 | geocoded | death2008_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2008_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2008_dsph.parquet |
2009 | individual | death2009 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2009.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2009.parquet |
2009 | geocoded | death2009_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2009_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2009_dsph.parquet |
2010 | individual | death2010 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2010.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2010.parquet |
2010 | geocoded | death2010_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2010_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2010_dsph.parquet |
2011 | individual | death2011 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2011.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2011.parquet |
2011 | geocoded | death2011_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2011_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2011_dsph.parquet |
2012 | individual | death2012 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2012.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2012.parquet |
2012 | geocoded | death2012_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2012_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2012_dsph.parquet |
2013 | individual | death2013 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2013.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2013.parquet |
2013 | geocoded | death2013_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2013_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2013_dsph.parquet |
2014 | individual | death2014 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2014.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2014.parquet |
2014 | geocoded | death2014_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2014_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2014_dsph.parquet |
2015 | individual | death2015 | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2015.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2015.parquet |
2015 | geocoded | death2015_dsph | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/raw/death2015_dsph.sas7bdat | //files.drexel.edu/encrypted/SOPH/UHC/CCUH/ccuh-freeze/source_pennsylvania_records_death/death2015_dsph.parquet |
Okay. Lets convert to parquet just for better performance and metadata access.
## Convert sas7bdat to parquet
%>%
df_files group_by(row_number()) %>%
group_walk(~{
if (!file.exists(.x$freeze_file)){
$raw_file %>%
.x::read_sas() %>%
havenrename_with(toupper) %>%
::write_parquet(.x$freeze_file)
arrow
} })
Now lets harvest some metadata.
Code
## Generate measure level metadata
= df_files %>%
df_codebook group_by(row_number()) %>%
group_map(~{
## Connect to dataset
= .x
row = row$freeze_file %>% arrow::open_dataset()
ds
# get data type
= tibble(raw = ds$schema$ToString() %>%
df_types str_split("\n") %>%
unlist()) %>%
filter(str_detect(raw,":")) %>%
rowwise() %>%
mutate(raw_split = str_split(raw,":"),
var = raw_split[[1]],
value_type = raw_split[[2]] %>% str_trim()) %>%
select(var, value_type) %>%
mutate(file = row$file) %>%
left_join(df_files)
# get variabel definitions if available
= length(ds$metadata) > 0
has_metadata if (has_metadata){
= tibble(
codebook var = names(ds$metadata$r$columns) %>% str_to_upper(),
var_label = ds$metadata$r$columns %>% map_chr(~.x$attributes$label)
%>%
) left_join(df_types, by = "var")
else {
} = df_types %>%
codebook mutate(var_label = '')
}
return(codebook)
}%>%
) bind_rows()
= df_codebook %>%
xwalk_codebook_vars select(var, var_label) %>%
distinct() %>%
group_by(var) %>%
summarise(var_label = paste(var_label, collapse = " | ")) %>%
ungroup()
## Preview
%>% reactable(searchable = T) df_codebook
Great the codebook for each fo the files above is show above. Now lets just get a grasp of number of variables per table.
%>%
df_files select(year, type,file) %>%
left_join(
%>%
df_codebook count(file, name = 'n_vars')
%>%
) select(-file) %>%
pivot_wider(names_from = type, values_from = n_vars)
year | individual | geocoded |
---|---|---|
2006 | 43 | 27 |
2007 | 43 | 27 |
2008 | 43 | 27 |
2009 | 43 | 29 |
2010 | 43 | 28 |
2011 | 43 | 29 |
2012 | 85 | 36 |
2013 | 85 | 39 |
2014 | 85 | 39 |
2015 | 85 | 39 |
This is complex. The number of variables double from 2012 onwards. We really need to understand this change in metadata before we can harmonize across years. Lets take a look.
2. Metadata EDA
Lets just be simple and keep the consistent fields and drop the rest. So lets see which fields are consistent through all years.
Geocoded
Code
= df_codebook %>%
df_codebook_geocoded filter(type == 'geocoded')
= df_codebook_geocoded %>%
dfa count(var) %>%
filter(n == length(unique(df_codebook_geocoded$file))) %>%
arrange(var) %>%
left_join(xwalk_codebook_vars, by = 'var') %>%
select(n, var, var_label)
%>%
dfa reactable(searchable = T)
This looks good. The only variable that we need currently is GEOID10 which should be the tract of death.
Individual
## Individual vars
= df_codebook %>%
df_codebook_individual filter(type == 'individual')
= df_codebook_individual %>%
dfa count(var) %>%
filter(n == length(unique(df_codebook_individual$file))) %>%
arrange(var) %>%
left_join(xwalk_codebook_vars, by = 'var') %>%
select(n, var, var_label)
%>%
dfa reactable(searchable = T)