library(pacman)
p_load(tidyverse, arrow, cli, janitor)
US Tract10 Population 2000 - LTDB v1
1. Setup
First let’s load dependencies.
Lets declare some local objects to reuse through out this pipeline.
= lst(
pipeline_instance id = 'US_population_seer_v1'
)
= lst(
block_downstream id = 'us_tract10_denominators_ltdb_2000__v1',
path = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/freeze/v1.0/us_tract10_denominators_ltdb_2000__v1/us_tract10_denominators_ltdb_2000__v1.parquet',
path_dbt = '//files.drexel.edu/encrypted/SOPH/UHC/CCUH/DBT/ccuh-dbt/source/us_tract10_denominators_ltdb_2000__v1.parquet'
)
Import upstream blocks
We found the require upstream blocks on the pipeline notion page, and copied access import code for each of the blocks below.
= arrow::read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/freeze/v1.0/xwalk_tract10_v1/xwalk_tract10_v1.parquet")
xwalk_tract10
## Raw LTDB
= read_csv('//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/freeze/v1.0/raw__population_ltbd_2000_v1/raw__population_ltbd_2000_v1.csv') df_ltdb_2000_raw
2. Data
2.1 EDA
glimpse(df_ltdb_2000_raw)
Rows: 72,693
Columns: 6
$ GEOID10 <dbl> 1001020100, 6083002402, 1001020200, 6083002102, 1001020300…
$ STATE <chr> "AL", "CA", "AL", "CA", "AL", "CA", "AL", "CA", "AL", "CA"…
$ COUNTY <chr> "Autauga County", "Santa Barbara County", "Autauga County"…
$ TRACT <chr> "Census Tract 201", "Census Tract 24.02", "Census Tract 20…
$ YEAR <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000…
$ CT10LDBCPOP <dbl> 1920.975, 6742.000, 1892.000, 2041.000, 3339.000, 3624.000…
dim(df_ltdb_2000_raw)
[1] 72693 6
Initial thoughts: - So roughly 72k rows makes sense for ~72k tracts in the US. - GEOID is numeric. and lead zeroes droppped so we need to pad that. - we’l keep GEOID, year, and CT10LDBCPOP
2.1.1 Pre-process
A few things to pre-process:
- String pad GEOID
- select only required columns
## Operation: Tract10 geoid + remove residual columns
= df_ltdb_2000_raw %>%
df_ltdb_processed mutate(tract10.census.geoid = str_pad(GEOID10, 11, pad = '0')) %>%
select(tract10.census.geoid, year = YEAR, pop = CT10LDBCPOP)
2.1.2 Sanity check
Lets confirm state totla popualtion makes sense
sum(df_ltdb_processed$pop)/10^6
[1] 281.4219
Acorind to census rouhgly 281.4 million population in year 2000. okay.
Quick plot for giggles.
%>%
df_ltdb_processed ggplot(aes(x = pop)) +
geom_histogram() +
labs(
title = '2000 LTDB Population Distribution',
subtitle = 'Histogram of population counts for 2000 LTDB data'
)
2.2 QC of raw data
This is pretty foundational data block so we we should do a few QC on the raw SEER data against our validate Tract10 Crosswalk.
- FLAG: Missing TIGER10 Tracts.
- 367 tracts are missing from the SEER data
- Not a big deal but we should be aware of this.
- Potentially
- VALID: No unknown tracts - all tract10 geoid in SEEERare in the TIGER10 crosswalk
- VALID: Valid tracts - all tract10 geoid should be 11 character strings
## Test: Missing TIGER10 Tracts
= df_ltdb_processed %>%
df_ltdb_tracts select(tract10.census.geoid) %>%
distinct() %>%
mutate(ltdb_tract = TRUE)
= xwalk_tract10 %>%
df_tiger10_tracts select(tract10.census.geoid) %>%
distinct() %>%
mutate(tiger10_tract = TRUE)
= df_tiger10_tracts %>%
df_qc full_join(df_ltdb_tracts, by = 'tract10.census.geoid') %>%
mutate(missing_tiger10_tract = is.na(ltdb_tract)) %>%
mutate(not_tiger10_tract = is.na(tiger10_tract))
%>%
df_qc count(missing_tiger10_tract, not_tiger10_tract)
missing_tiger10_tract | not_tiger10_tract | n |
---|---|---|
FALSE | FALSE | 72693 |
TRUE | FALSE | 1398 |
= df_qc %>%
vec__missing_tiger10_tract filter(missing_tiger10_tract) %>%
pull(tract10.census.geoid)
## How many msissing tracts?
length(vec__missing_tiger10_tract)
[1] 1398
## Distirbution by state
%>%
vec__missing_tiger10_tract str_sub(1,2) %>%
table()
.
01 06 08 09 10 12 13 15 17 18 19 20 22 23 24 25 26 27 28 31
2 27 2 5 4 47 6 25 2 3 1 2 12 7 12 7 50 4 3 1
32 33 34 35 36 37 39 41 42 44 45 47 48 51 53 55 66 72 78
4 1 6 1 36 13 6 8 2 3 6 2 13 13 12 16 57 945 32
- 1398 missing tracts
- most of these 945 is in PR
- the rest (453) are in the US
Not a big deal but we should be aware of this; we could just impute zero assume as missing.
## Test: Unknown Tracts
= df_qc %>%
df_invalid filter(not_tiger10_tract)
if (nrow(df_invalid) > 0) cli::cli_abort("ERROR: Unkown Tracts")
## Test: Invalid Tracts
= df_qc %>%
df_invalid mutate(valid = nchar(tract10.census.geoid) == 11) %>%
filter(!valid)
if (nrow(df_invalid) > 0) cli::cli_abort("ERROR: Invalid Tracts")
Looks okay. Lets format our arrow table for export.
2.3 Op. df_data
We just merge in LTDB to our tract10 crosswalk:
## LTDB data
# df_ltdb_processed
## Generate template for all tracts-year
= xwalk_tract10 %>%
template select(tract10.census.geoid, state.type) %>%
mutate(year = 2000)
## Merge
= template %>%
df_data left_join(df_ltdb_processed) %>%
mutate(pop = ifelse(is.na(pop)&state.type!='territory', 0, pop)) %>%
select(-state.type)
2.4 QC
Lets do a few QC checks:
- VALID: No missing data for non-territory tracts
## "Test: Non-territory tracts should have non-missing values"
= df_data %>%
df_invalid left_join(xwalk_tract10) %>%
filter(state.type != 'territory') %>%
filter(is.na(pop))
if (nrow(df_invalid) > 0) cli::cli_abort("ERROR: Missing data for non-territory tracts")
3. Metadata
3.1 Block level metadata
attr(df_data, "block_instance") = pipeline_instance$id
attr(df_data, "block_description") = "Year 2000 US Tract10 level population cleaned from the LTDB year 2000 file. "
attr(df_data, "block_contributor_orcid") = "https://orcid.org/0000-0002-4699-4755"
3.3 Column Definitions
##tract10.census.geoid
attr(df_data$tract10.census.geoid, "column_description") = "Tract10 FIP code."
## year
attr(df_data$year, "column_description") = "Year of data"
## pop
attr(df_data$pop, "column_description") = "Population count from LTDB. Note that there are some tract10 units missing from the LTDB data: 1398 total tract10 are missing data which compose of 945 tracts in PR and the rest 453 in the US."
3.4 Op. Schema and append metadata
= arrow::arrow_table(
tbl_data
df_data,schema = arrow::schema(
tract10.census.geoid = arrow::utf8(),
year = arrow::int16(),
pop = arrow::int32()
)
)
$schema tbl_data
Schema
tract10.census.geoid: string
year: int16
pop: int32
See $metadata for additional Schema metadata
$metadata tbl_data
$r
$r$attributes
$r$attributes$sf_column
[1] "geometry"
$r$attributes$agr
STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10 MTFCC10
<NA> <NA> <NA> <NA> <NA> <NA> <NA>
FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 COUNTYFP STATEFP
<NA> <NA> <NA> <NA> <NA> <NA> <NA>
Levels: constant aggregate identity
$r$attributes$block_description
[1] "Year 2000 US Tract10 level population cleaned from the LTDB year 2000 file. "
$r$attributes$block_process_date
[1] "05-13-2024"
$r$attributes$block_maintainer_orcid
[1] "https://orcid.org/0000-0002-4699-4755"
$r$attributes$block_instance
[1] "US_population_seer_v1"
$r$attributes$block_contributor_orcid
[1] "https://orcid.org/0000-0002-4699-4755"
$r$columns
$r$columns$tract10.census.geoid
$r$columns$tract10.census.geoid$attributes
$r$columns$tract10.census.geoid$attributes$column_description
[1] "Tract10 FIP code."
$r$columns$tract10.census.geoid$columns
NULL
$r$columns$year
$r$columns$year$attributes
$r$columns$year$attributes$column_description
[1] "Year of data"
$r$columns$year$columns
NULL
$r$columns$pop
$r$columns$pop$attributes
$r$columns$pop$attributes$column_description
[1] "Population count from LTDB. Note that there are some tract10 units missing from the LTDB data: 1398 total tract10 are missing data which compose of 945 tracts in PR and the rest 453 in the US."
$r$columns$pop$columns
NULL
3.5 QC
Here we do completion checks:
- VALID: Column description complete
## "Test: Column description complete"
= tbl_data$metadata$r$columns %>%
invalid keep(~is.null(.x$attributes$column_description))
if (length(invalid) > 0) cli::cli_abort("ERROR: Missing column descriptions", invalid)
4. Export
4.1 Export blocks
Export parquet to ETL block location. Saving in the same format as how we we would load into DBT.
## ETL data block
%>%
tbl_data write_parquet(
$path
block_downstream
)
## Export to DBT
%>%
tbl_data write_parquet(
$path_dbt
block_downstream )
4.2 Validate DBT Source
## Data
$path_dbt %>%
block_downstream::read_parquet() %>%
arrowglimpse()
Rows: 74,091
Columns: 3
$ tract10.census.geoid <chr> "01005950300", "01005950900", "01005950800", "010…
$ year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2…
$ pop <int> 2152, 4549, 2237, 1883, 2312, 3840, 4068, 4342, 3…
Data looks good. Lets check metadata
= arrow::open_dataset(block_downstream$path_dbt )
tbl
$schema tbl
Schema
tract10.census.geoid: string
year: int16
pop: int32
See $metadata for additional Schema metadata
$metadata tbl
$r
$r$attributes
$r$attributes$sf_column
[1] "geometry"
$r$attributes$agr
STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10 MTFCC10
<NA> <NA> <NA> <NA> <NA> <NA> <NA>
FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 COUNTYFP STATEFP
<NA> <NA> <NA> <NA> <NA> <NA> <NA>
Levels: constant aggregate identity
$r$attributes$block_description
[1] "Year 2000 US Tract10 level population cleaned from the LTDB year 2000 file. "
$r$attributes$block_process_date
[1] "05-13-2024"
$r$attributes$block_maintainer_orcid
[1] "https://orcid.org/0000-0002-4699-4755"
$r$attributes$block_instance
[1] "US_population_seer_v1"
$r$attributes$block_contributor_orcid
[1] "https://orcid.org/0000-0002-4699-4755"
$r$columns
$r$columns$tract10.census.geoid
$r$columns$tract10.census.geoid$attributes
$r$columns$tract10.census.geoid$attributes$column_description
[1] "Tract10 FIP code."
$r$columns$tract10.census.geoid$columns
NULL
$r$columns$year
$r$columns$year$attributes
$r$columns$year$attributes$column_description
[1] "Year of data"
$r$columns$year$columns
NULL
$r$columns$pop
$r$columns$pop$attributes
$r$columns$pop$attributes$column_description
[1] "Population count from LTDB. Note that there are some tract10 units missing from the LTDB data: 1398 total tract10 are missing data which compose of 945 tracts in PR and the rest 453 in the US."
$r$columns$pop$columns
NULL