US Tract10 Population 2000 - LTDB v1

Author

Ran Li

Published

May 14, 2024

1. Setup

First let’s load dependencies.

library(pacman)
p_load(tidyverse, arrow, cli, janitor)

Lets declare some local objects to reuse through out this pipeline.

pipeline_instance = lst(
  id = 'US_population_seer_v1'
)

block_downstream = lst(
  id = 'us_tract10_denominators_ltdb_2000__v1',
  path = '//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/freeze/v1.0/us_tract10_denominators_ltdb_2000__v1/us_tract10_denominators_ltdb_2000__v1.parquet',
  path_dbt = '//files.drexel.edu/encrypted/SOPH/UHC/CCUH/DBT/ccuh-dbt/source/us_tract10_denominators_ltdb_2000__v1.parquet'
)

Import upstream blocks

We found the require upstream blocks on the pipeline notion page, and copied access import code for each of the blocks below.

xwalk_tract10 = arrow::read_parquet("//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/freeze/v1.0/xwalk_tract10_v1/xwalk_tract10_v1.parquet")

## Raw LTDB
df_ltdb_2000_raw = read_csv('//files.drexel.edu/colleges/SOPH/Shared/UHC/Projects/CCUH/freeze/v1.0/raw__population_ltbd_2000_v1/raw__population_ltbd_2000_v1.csv')

2. Data

2.1 EDA

glimpse(df_ltdb_2000_raw)

Rows: 72,693
Columns: 6
$ GEOID10     <dbl> 1001020100, 6083002402, 1001020200, 6083002102, 1001020300…
$ STATE       <chr> "AL", "CA", "AL", "CA", "AL", "CA", "AL", "CA", "AL", "CA"…
$ COUNTY      <chr> "Autauga County", "Santa Barbara County", "Autauga County"…
$ TRACT       <chr> "Census Tract 201", "Census Tract 24.02", "Census Tract 20…
$ YEAR        <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000…
$ CT10LDBCPOP <dbl> 1920.975, 6742.000, 1892.000, 2041.000, 3339.000, 3624.000…

dim(df_ltdb_2000_raw)

[1] 72693     6

Initial thoughts: - So roughly 72k rows makes sense for ~72k tracts in the US. - GEOID is numeric. and lead zeroes droppped so we need to pad that. - we’l keep GEOID, year, and CT10LDBCPOP

2.1.1 Pre-process

A few things to pre-process:

String pad GEOID
select only required columns

## Operation: Tract10 geoid + remove residual columns
df_ltdb_processed = df_ltdb_2000_raw %>% 
  mutate(tract10.census.geoid = str_pad(GEOID10, 11, pad = '0')) %>% 
  select(tract10.census.geoid, year = YEAR, pop = CT10LDBCPOP)

2.1.2 Sanity check

Lets confirm state totla popualtion makes sense

sum(df_ltdb_processed$pop)/10^6

[1] 281.4219

Acorind to census rouhgly 281.4 million population in year 2000. okay.

Quick plot for giggles.

df_ltdb_processed %>% 
  ggplot(aes(x = pop)) +
  geom_histogram() +
  labs(
    title = '2000 LTDB Population Distribution',
    subtitle = 'Histogram of population counts for 2000 LTDB data'
  )

2.2 QC of raw data

This is pretty foundational data block so we we should do a few QC on the raw SEER data against our validate Tract10 Crosswalk.

FLAG: Missing TIGER10 Tracts.
- 367 tracts are missing from the SEER data
- Not a big deal but we should be aware of this.
- Potentially
VALID: No unknown tracts - all tract10 geoid in SEEERare in the TIGER10 crosswalk
VALID: Valid tracts - all tract10 geoid should be 11 character strings

## Test: Missing TIGER10 Tracts
df_ltdb_tracts = df_ltdb_processed %>% 
  select(tract10.census.geoid) %>% 
  distinct() %>% 
  mutate(ltdb_tract = TRUE) 


df_tiger10_tracts = xwalk_tract10 %>% 
  select(tract10.census.geoid) %>% 
  distinct()  %>% 
  mutate(tiger10_tract = TRUE)

df_qc = df_tiger10_tracts %>% 
  full_join(df_ltdb_tracts, by = 'tract10.census.geoid') %>% 
  mutate(missing_tiger10_tract = is.na(ltdb_tract)) %>% 
  mutate(not_tiger10_tract = is.na(tiger10_tract)) 

df_qc %>% 
  count(missing_tiger10_tract, not_tiger10_tract)

missing_tiger10_tract	not_tiger10_tract	n
FALSE	FALSE	72693
TRUE	FALSE	1398

vec__missing_tiger10_tract = df_qc %>% 
  filter(missing_tiger10_tract) %>% 
  pull(tract10.census.geoid)


## How many msissing tracts?
length(vec__missing_tiger10_tract)

[1] 1398

## Distirbution by state
vec__missing_tiger10_tract %>% 
  str_sub(1,2) %>% 
  table()

.
 01  06  08  09  10  12  13  15  17  18  19  20  22  23  24  25  26  27  28  31 
  2  27   2   5   4  47   6  25   2   3   1   2  12   7  12   7  50   4   3   1 
 32  33  34  35  36  37  39  41  42  44  45  47  48  51  53  55  66  72  78 
  4   1   6   1  36  13   6   8   2   3   6   2  13  13  12  16  57 945  32

1398 missing tracts
- most of these 945 is in PR
- the rest (453) are in the US

Not a big deal but we should be aware of this; we could just impute zero assume as missing.

## Test: Unknown Tracts 
df_invalid = df_qc %>%
  filter(not_tiger10_tract) 
if (nrow(df_invalid) > 0) cli::cli_abort("ERROR: Unkown Tracts")

## Test: Invalid Tracts
df_invalid = df_qc %>%
  mutate(valid = nchar(tract10.census.geoid) == 11) %>% 
  filter(!valid)
if (nrow(df_invalid) > 0) cli::cli_abort("ERROR: Invalid Tracts")

Looks okay. Lets format our arrow table for export.

2.3 Op. `df_data`

We just merge in LTDB to our tract10 crosswalk:

## LTDB data
# df_ltdb_processed

## Generate template for all tracts-year
template = xwalk_tract10 %>% 
  select(tract10.census.geoid, state.type) %>% 
  mutate(year = 2000)

## Merge
df_data = template %>% 
  left_join(df_ltdb_processed) %>% 
  mutate(pop = ifelse(is.na(pop)&state.type!='territory', 0, pop)) %>% 
  select(-state.type)

2.4 QC

Lets do a few QC checks:

VALID: No missing data for non-territory tracts

## "Test: Non-territory tracts should have non-missing values"

df_invalid = df_data %>% 
  left_join(xwalk_tract10) %>% 
  filter(state.type != 'territory') %>%
  filter(is.na(pop))

if (nrow(df_invalid) > 0) cli::cli_abort("ERROR: Missing data for non-territory tracts")

3. Metadata

3.1 Block level metadata

attr(df_data, "block_instance") =  pipeline_instance$id

attr(df_data, "block_description") = "Year 2000 US Tract10 level population cleaned from the LTDB year 2000 file. "

attr(df_data, "block_contributor_orcid") = "https://orcid.org/0000-0002-4699-4755"

3.3 Column Definitions

##tract10.census.geoid
attr(df_data$tract10.census.geoid, "column_description") = "Tract10 FIP code."

## year
attr(df_data$year, "column_description") = "Year of data"

## pop
attr(df_data$pop, "column_description") = "Population count from LTDB. Note that there are some  tract10 units missing from the LTDB data: 1398 total tract10 are missing data which compose of 945 tracts in PR and the rest 453 in the US."

3.4 Op. Schema and append metadata

tbl_data =  arrow::arrow_table(
  df_data,
  schema = arrow::schema(
    tract10.census.geoid = arrow::utf8(),
    year = arrow::int16(),
    pop = arrow::int32()
  )
)

tbl_data$schema

Schema
tract10.census.geoid: string
year: int16
pop: int32

See $metadata for additional Schema metadata

tbl_data$metadata

$r
$r$attributes
$r$attributes$sf_column
[1] "geometry"

$r$attributes$agr
 STATEFP10 COUNTYFP10  TRACTCE10    GEOID10     NAME10 NAMELSAD10    MTFCC10 
      <NA>       <NA>       <NA>       <NA>       <NA>       <NA>       <NA> 
FUNCSTAT10    ALAND10   AWATER10 INTPTLAT10 INTPTLON10   COUNTYFP    STATEFP 
      <NA>       <NA>       <NA>       <NA>       <NA>       <NA>       <NA> 
Levels: constant aggregate identity

$r$attributes$block_description
[1] "Year 2000 US Tract10 level population cleaned from the LTDB year 2000 file. "

$r$attributes$block_process_date
[1] "05-13-2024"

$r$attributes$block_maintainer_orcid
[1] "https://orcid.org/0000-0002-4699-4755"

$r$attributes$block_instance
[1] "US_population_seer_v1"

$r$attributes$block_contributor_orcid
[1] "https://orcid.org/0000-0002-4699-4755"


$r$columns
$r$columns$tract10.census.geoid
$r$columns$tract10.census.geoid$attributes
$r$columns$tract10.census.geoid$attributes$column_description
[1] "Tract10 FIP code."


$r$columns$tract10.census.geoid$columns
NULL


$r$columns$year
$r$columns$year$attributes
$r$columns$year$attributes$column_description
[1] "Year of data"


$r$columns$year$columns
NULL


$r$columns$pop
$r$columns$pop$attributes
$r$columns$pop$attributes$column_description
[1] "Population count from LTDB. Note that there are some  tract10 units missing from the LTDB data: 1398 total tract10 are missing data which compose of 945 tracts in PR and the rest 453 in the US."


$r$columns$pop$columns
NULL

3.5 QC

Here we do completion checks:

VALID: Column description complete

## "Test: Column description complete"

invalid = tbl_data$metadata$r$columns %>% 
  keep(~is.null(.x$attributes$column_description))
if (length(invalid) > 0) cli::cli_abort("ERROR: Missing column descriptions", invalid)

4. Export

4.1 Export blocks

Export parquet to ETL block location. Saving in the same format as how we we would load into DBT.

## ETL data block
tbl_data %>% 
  write_parquet(
    block_downstream$path
  )

## Export to DBT
tbl_data %>% 
  write_parquet(
    block_downstream$path_dbt
  )

4.2 Validate DBT Source

## Data
block_downstream$path_dbt %>% 
  arrow::read_parquet() %>% 
  glimpse()

Rows: 74,091
Columns: 3
$ tract10.census.geoid <chr> "01005950300", "01005950900", "01005950800", "010…
$ year                 <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2…
$ pop                  <int> 2152, 4549, 2237, 1883, 2312, 3840, 4068, 4342, 3…

Data looks good. Lets check metadata

tbl = arrow::open_dataset(block_downstream$path_dbt )

tbl$schema

Schema
tract10.census.geoid: string
year: int16
pop: int32

See $metadata for additional Schema metadata

tbl$metadata

$r
$r$attributes
$r$attributes$sf_column
[1] "geometry"

$r$attributes$agr
 STATEFP10 COUNTYFP10  TRACTCE10    GEOID10     NAME10 NAMELSAD10    MTFCC10 
      <NA>       <NA>       <NA>       <NA>       <NA>       <NA>       <NA> 
FUNCSTAT10    ALAND10   AWATER10 INTPTLAT10 INTPTLON10   COUNTYFP    STATEFP 
      <NA>       <NA>       <NA>       <NA>       <NA>       <NA>       <NA> 
Levels: constant aggregate identity

$r$attributes$block_description
[1] "Year 2000 US Tract10 level population cleaned from the LTDB year 2000 file. "

$r$attributes$block_process_date
[1] "05-13-2024"

$r$attributes$block_maintainer_orcid
[1] "https://orcid.org/0000-0002-4699-4755"

$r$attributes$block_instance
[1] "US_population_seer_v1"

$r$attributes$block_contributor_orcid
[1] "https://orcid.org/0000-0002-4699-4755"


$r$columns
$r$columns$tract10.census.geoid
$r$columns$tract10.census.geoid$attributes
$r$columns$tract10.census.geoid$attributes$column_description
[1] "Tract10 FIP code."


$r$columns$tract10.census.geoid$columns
NULL


$r$columns$year
$r$columns$year$attributes
$r$columns$year$attributes$column_description
[1] "Year of data"


$r$columns$year$columns
NULL


$r$columns$pop
$r$columns$pop$attributes
$r$columns$pop$attributes$column_description
[1] "Population count from LTDB. Note that there are some  tract10 units missing from the LTDB data: 1398 total tract10 are missing data which compose of 945 tracts in PR and the rest 453 in the US."


$r$columns$pop$columns
NULL