prop_pop_elig_rem
, in the code below).An advantage of this approach is that, within each of the 20 strata, counties are randomly sampled, so in expectation, estimates will be representative of the full population of counties in that stratum. urban-rural-region stratum (n=20=4*5).
Load packages
library(tidyverse)
library(sf)
library(mapview)
library(RColorBrewer)
library(viridis)
library(tidycensus)
library(readxl)
library(here)
library(knitr)
A dataset of counties with urban-rural codes (6 codes; 2013), regions (4 regions across US), divisions (9 across US) and SVI data. SVI data is from 2018, and urban rural codes are from 2013.
Urban-rural classification scheme is from here: https://www.cdc.gov/nchs/data_access/urban_rural.htm
9 divisions within 4 regions https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#Census_Bureau-designated_regions_and_divisions https://www2.census.gov/geo/pdfs/reference/GARM/Ch6GARM.pdf
Make some new variables, remove a few variables, and exclude Alaska and Hawaii.
To link in state abbreviations, urban-rural classification, and regions and divisions.
# Import these look-up tables from Github for convenience.
library(here)
source(here("scripts", "define-urls-look-up-tables.R"))
#Alternatively, they are available here:
us_state_fips_url = "https://raw.githubusercontent.com/michaeldgarber/lookups-county-state/main/us-state-fips.csv"
county_urban_rural_2013_url = "https://raw.githubusercontent.com/michaeldgarber/lookups-county-state/main/nchs-urban-rural-2013.csv"
region_division_url = "https://raw.githubusercontent.com/michaeldgarber/lookups-county-state/main/division-region.csv"
us_state_fips_lookup = us_state_fips_url %>%
url() %>%
read_csv()
county_urban_rural_2013_lookup = county_urban_rural_2013_url %>%
url() %>%
read_csv()
region_division_lookup = region_division_url %>%
url() %>%
read_csv()
names(us_state_fips_lookup)
## [1] "state_name" "state_fips" "state_abbrev"
names(county_urban_rural_2013_lookup)
## [1] "county_fips" "cbsa_name" "urban_rural_nchs_2013"
names(region_division_lookup)
## [1] "division_9_no" "division_9_name" "region_4_no" "region_4_name"
## [5] "state_name"
#Do not run this every time to avoid calling the census API frequently.
options(tigris_use_cache = TRUE)
vars_acs_2019 <- load_variables(2019, "acs5", cache = TRUE)
county_geo =get_acs(
year=2019,
geography = "county",
output = "wide", #make it wide form by default so variable names are in columns - geography = "county",
geometry = TRUE, #to grab geometry
variables = c(
pop = "B01003_001",
#race
race_pop_tot = "B02001_001", #denominator
race_white = "B02001_002",
race_black = "B02001_003",
#median home value
home_val_med = "B25077_001",
home_val_med_tot = "B25075_001",
#poverty
poverty_pop_tot = "B17003_001", #the denominator for the poverty variable
poverty_pop_below = "B17003_002",
# means of transportation to work
trans_to_work_pop_tot = "B08301_001",
trans_to_work_car = "B08301_002",
trans_to_work_public = "B08301_010",
trans_to_work_bike = "B08301_018",
trans_to_work_walk = "B08301_019",
trans_to_work_other = "B08301_020"
)
)
#save and re-load so don't have to call tidycensus every time
setwd(here("data-processed"))
save(county_geo, file = "county_geo.RData")
These bounds are determined later. Define them here (out of order) so that their values can be included in this data-prep step. Save the 10th and 20th population percentile of urban_rural_6=6. These are used as filters in future computations.
pop_ur_6_10th = 2815
pop_ur_6_20th = 4938
setwd(here("data-processed"))
load("county_geo.RData") #saved above
county_wrangle_geo = county_geo %>%
#remove the margin-of-errors for this purpose
dplyr::select(-ends_with("M")) %>%
rename(
county_fips = GEOID, #The GEOID in this case is indeed a county FIPS code, as we imported counties.
county_name = NAME,
) %>%
#for simplicity, remove the E suffix. E stands for estimate.
rename_with(~str_remove(., 'E')) %>% #https://stackoverflow.com/questions/45960269/removing-suffix-from-column-names-using-rename-all
mutate(
state_fips = str_sub(county_fips, 1,2), #obtain fips code for state
#proportions for demographic variables and bike mode share
prop_race_white = race_white/race_pop_tot,
prop_race_black = race_black/race_pop_tot,
prop_poverty = poverty_pop_below/poverty_pop_tot,
prop_trans_to_work_walk = trans_to_work_walk/trans_to_work_pop_tot,
prop_trans_to_work_bike = trans_to_work_bike/trans_to_work_pop_tot,
#make into quantiles as well
prop_trans_to_work_walk_cat = dplyr::ntile(prop_trans_to_work_walk, 4),
prop_trans_to_work_bike_cat = dplyr::ntile(prop_trans_to_work_bike, 4),
prop_race_white_cat = dplyr::ntile(prop_race_white, 4),
prop_poverty_cat = dplyr::ntile(prop_poverty, 4)
) %>%
#link in the state abbreviations, the nchs-urban-rural classifications, and the region-division lookup
left_join(us_state_fips_lookup, by = "state_fips") %>%
left_join(county_urban_rural_2013_lookup, by = "county_fips") %>%
left_join(region_division_lookup, by = "state_name") %>%
rename(urban_rural_6 = urban_rural_nchs_2013) %>% #simplify urban rural name. more descriptive on github.
st_as_sf() %>%
st_transform(4326) %>% #make sure it's 4326 for area calculation
mutate(
continental_48 = case_when(
state_name %in% c("Alaska", "Hawaii", "Northern Mariana Islands", "Guam", "Virgin Islands", "Puerto Rico") ~ 0,
TRUE ~ 1),
area_m2 = as.numeric(st_area(geometry)), #what is area in meters squared? 4326 is coordinate system, so meters are returned
area_mi2 = area_m2/2589988.11, #square miles
pop_per_mi2 = pop/area_mi2,
pop_log = log(pop), #in case needed as a weight
#indicator variables for above the 10th and 20th population percentile in the sixth urban-rural category
pop_above_ur_6_10th = case_when(pop >= pop_ur_6_10th ~ 1, TRUE ~ 0 ),
pop_above_ur_6_20th = case_when(pop >= pop_ur_6_20th ~ 1, TRUE ~ 0 ) ,
#indicator variable for most urban classification or not
ur_1 = case_when(
urban_rural_6 ==1 ~1,
TRUE ~0)
) %>%
#sort by population to note the top 150 and top 150
arrange(desc(pop)) %>%
mutate(
pop_rank = row_number(),
pop_top_150 = case_when(
pop_rank <=150 ~ 1,
TRUE ~ 0
),
pop_top_200 = case_when(
pop_rank <=200 ~ 1,
TRUE ~ 0)
) %>%
#restrict to contiguous 48
filter(continental_48==1)
#Make a version without geometry
county_wrangle_nogeo = county_wrangle_geo %>%
st_set_geometry(NULL) %>%
as_tibble()
county_geo_lookup = county_wrangle_geo %>%
dplyr::select(county_fips, geometry)
setwd(here("data-processed"))
save(county_wrangle_nogeo, file = "county_wrangle_nogeo.RData")
save(county_geo_lookup, file = "county_geo_lookup.RData")
division_9_sf = county_wrangle_geo %>%
group_by(division_9_no) %>%
summarise(area_mi2 = sum(area_mi2, na.rm=TRUE)) %>% #like a dissolve
st_cast()
region_4_sf = county_wrangle_geo %>%
group_by(region_4_no) %>%
summarise(area_mi2 = sum(area_mi2, na.rm=TRUE)) %>%
st_cast()
#Note these steps take some time. Save to avoid having to run the slow code
library(here)
setwd(here("data-processed"))
save(division_9_sf, file = "division_9_sf.RData")
save(region_4_sf, file = "region_4_sf.RData")
pop_dist_by_urban_rural_6 = county_wrangle_nogeo %>%
group_by(urban_rural_6) %>%
summarise(
pop_min = min(pop, na.rm=TRUE),
pop_10th = round(quantile(pop, probs = .1, na.rm=TRUE), digits = 0),
pop_20th = round(quantile(pop, probs = .2, na.rm=TRUE), digits = 0),
pop_25th = round(quantile(pop, probs = .25, na.rm=TRUE), digits = 0)
)
options(digits =4)
library(knitr)
pop_dist_by_urban_rural_6 %>%
knitr::kable()
urban_rural_6 | pop_min | pop_10th | pop_20th | pop_25th |
---|---|---|---|---|
1 | 157613 | 467485 | 663826 | 695333 |
2 | 4830 | 17295 | 28980 | 37572 |
3 | 1973 | 16372 | 26231 | 34105 |
4 | 728 | 10845 | 17045 | 20582 |
5 | 395 | 12825 | 22351 | 25072 |
6 | 98 | 2815 | 4938 | 6032 |
Map with ggplot rather than mapview for speed.
#Load strata maps
library(here)
setwd(here("data-processed"))
load(file = "division_9_sf.RData")
load(file = "region_4_sf.RData")
division_9_sf %>%
ggplot()+
geom_sf(color = "black", fill = "azure4")+
geom_sf_label(aes(label=division_9_no))+
theme_bw()
region_4_sf %>%
ggplot()+
geom_sf(color = "black", fill = "orange") +
geom_sf_label(aes(label=region_4_no))+
theme_bw()
#marginal total to not be confused with the joint values below
n_counties_marg = county_wrangle_nogeo %>%
count() %>%
pull()
n_counties_marg
## [1] 3108
pop_marg_total = county_wrangle_nogeo %>%
mutate(dummy=1) %>%
group_by(dummy) %>%
summarise(pop_total = sum(pop, na.rm=TRUE)) %>%
dplyr::select(-dummy) %>%
ungroup() %>%
pull()
pop_marg_total
## [1] 322538633
#Total number of eligible counties to be sampled. Define as object for later use.
n_counties_marg_elig =county_wrangle_nogeo %>%
filter(pop_above_ur_6_10th==1) %>%
count() %>%
pull()
n_counties_marg_elig
## [1] 2959
pop_marg_elig = county_wrangle_nogeo %>%
filter(pop_above_ur_6_10th==1) %>%
mutate(dummy=1) %>%
group_by(dummy) %>%
summarise(pop = sum(pop, na.rm=TRUE)) %>%
dplyr::select(-dummy) %>%
ungroup() %>%
pull()
pop_marg_elig
## [1] 322285160
by_ur = county_wrangle_nogeo %>%
group_by(urban_rural_6) %>%
summarise(
n_counties=n(),
prop = n_counties/n_counties_marg_elig, #use overall here rather than restricting to the overall numbers.
pop = sum(pop, na.rm=TRUE),
prop_pop =pop/ pop_marg_total) %>%
ungroup()
by_ur
n_counties_ur_1 = by_ur %>%
filter(urban_rural_6==1) %>%
dplyr::select(n_counties) %>%
pull()
The most urban category comprises 68 counties, which is 2% of the counties in the sampling frame and 31% of the population.
It looks like every category has a long right tail. Most counties in each category are lower population. The most rural category especially has a right skew.
county_wrangle_geo %>%
ggplot(aes(pop))+
geom_histogram() +
facet_wrap(~urban_rural_6, scales = "free") + #allow axes to vary freely.
scale_x_continuous(labels=scales::comma)+
theme(axis.text.x = element_text(angle = 90))+
ylab("Number of counties") +
xlab("Population")
Examine logged version to see if has a more normal distribution. Could sample weighted by log(pop) if want to weight proportionally to population without such extreme weights.
county_wrangle_geo %>%
ggplot(aes(pop_log))+
geom_histogram() +
facet_wrap(~urban_rural_6, scales = "free") + #allow axes to vary freely.
scale_x_continuous(labels=scales::comma)+
theme(axis.text.x = element_text(angle = 90))+
ylab("Number of counties") +
xlab("Natural logarithm of population")
county_wrangle_geo %>%
filter(urban_rural_6==1) %>%
mutate(
pop_urban_rural_cat = dplyr::ntile(pop, 4)
)%>%
dplyr::select(state_abbrev, pop_urban_rural_cat) %>%
mapview(
zcol = "pop_urban_rural_cat",
layer.name = "Population quartiles (1=min-25th; 4=75th-max)",
map.types = c("CartoDB.Positron"),
lwd=1.5,
col.regions = viridis_pal(alpha = 1, begin = .25, end = 1, direction = 1, option = "H"), #turbo :)
popup = FALSE #for faster rendering
)
#Create some values for future use.
#The remaining eligible population and number of counties to be sampled after the most urban is selected.
margin_total_rem = county_wrangle_nogeo %>% #rem for remaining
filter(pop_above_ur_6_10th==1) %>%
filter(ur_1==0) %>%
group_by(ur_1) %>%
summarise(
n_counties = n(),
pop = sum(pop, na.rm=TRUE))
pop_marg_elig_rem = margin_total_rem %>%
dplyr::select(pop) %>%
pull()
n_counties_marg_elig_rem = margin_total_rem %>%
dplyr::select(n_counties) %>%
pull()
n_counties_left = 300- n_counties_ur_1 #the number of counties we have left.
urban_rural_region_strata = county_wrangle_nogeo %>%
filter(pop_above_ur_6_10th==1) %>%
filter(ur_1==0) %>%
group_by(urban_rural_6, region_4_no) %>%
summarise(
n_counties = n(), #remaining counties in the sampling frame by stratum
pop_stratum = sum(pop, na.rm=TRUE)
) %>%
ungroup() %>%
mutate(
#Note pop_marg_elig_rem is not in this dataset. It is a value created just above.
#the proportion of the population remaining in each urban-rural-region stratum
prop_pop_elig_rem = pop_stratum/pop_marg_elig_rem,
n_counties_to_sample = n_counties_left*prop_pop_elig_rem, #number of counties left times the proportion of remaining population in that stratum.
n_counties_to_sample_rnd = round(n_counties_to_sample), #for sample_n, we need a rounded version
prop_counties_rem_to_sample = n_counties_to_sample/n_counties_marg_elig_rem,#the sampling fraction
#re-calculate the proportion of counties actually sampling based on the rounded value
#doing this because the rounded value is used in the sample_n function below.
#This variable can be used to calculate weights
#for pooled estimates. Note it's not actually a rounded version of prop_counties_rem_to_sample.
#It's a re-calculated proportion based on the rounded value of n_counties_to_sample.
prop_counties_rem_to_sample_rnd = n_counties_to_sample_rnd/n_counties_left,
wt = 1/prop_counties_rem_to_sample_rnd, #weights (empirical). note that the most urban stratum would have a weight of 1.
wt_prop = wt/sum(wt) #each weight's relative weight. sums to 1. easier than using the actual weight.
)
options(scipen = 99, digits = 2) #set to fewer decimals before printing table
urban_rural_region_strata
setwd(here("data-processed"))
save(urban_rural_region_strata, file = "urban_rural_region_strata.RData")
#Create a dataset of weights only.
wts = urban_rural_region_strata %>%
dplyr::select(urban_rural_6, region_4_no, wt, wt_prop)
Does the total number sampled in each stratum add up to the total remaining? No, due to rounding, 2 are missing.
n_counties_left
## [1] 232
urban_rural_region_strata %>%
mutate(dummy=1) %>%
group_by(dummy) %>%
summarise( n_counties_to_sample_rnd=sum(n_counties_to_sample_rnd)) %>%
pull(n_counties_to_sample_rnd)
## [1] 230
set.seed(123) #set seed so that the sample is the same each time.
samp = county_wrangle_nogeo %>%
filter(ur_1==0) %>%
filter(pop_above_ur_6_10th==1) %>%
left_join(urban_rural_region_strata, by = c("urban_rural_6", "region_4_no")) %>%
group_by(urban_rural_6, region_4_no) %>%
#slice_sample is the more recent update, but sample_n is better for this application.
#Use the solution by thc here: https://stackoverflow.com/questions/51671856/dplyr-sample-n-by-group-with-unique-size-argument-per-group/59186490#59186490l
sample_n(
size=n_counties_to_sample_rnd[1],
# the subset operator, [] takes the first value in that column by group.
replace=FALSE) %>%
mutate(sampled=1)
nrow(samp)
## [1] 230
mv_samp = samp %>%
left_join(county_geo_lookup, by = "county_fips") %>%
st_as_sf() %>%
dplyr::select(state_name, division_9_no, urban_rural_6, sampled) %>%
mapview(
zcol = "urban_rural_6",
layer.name = "Urban-rural classification",
map.types = c("CartoDB.Positron")
)
mv_ur_1 = county_wrangle_geo %>%
filter(ur_1==1) %>%
mapview(
layer.name = "urban-rural=1",
col.regions="red")
mv_samp + mv_ur_1
draw_sample <-function(s_id_val){
sample_df = county_wrangle_nogeo %>%
filter(ur_1==0) %>%
filter(pop_above_ur_6_10th==1) %>%
left_join(urban_rural_region_strata, by = c("urban_rural_6", "region_4_no")) %>%
group_by(urban_rural_6, region_4_no) %>%
#hmm, need to be able to vary sampling proportion by group - https://github.com/tidyverse/dplyr/issues/5299
#slice_sample is the more recent dplyr update, but sample_n is better for this application.
#Use the solution by thc here: https://stackoverflow.com/questions/51671856/dplyr-sample-n-by-group-with-unique-size-argument-per-group/59186490#59186490l
sample_n(
#the subset operator, [] takes the first value in that column by group. see above for definition.
size=n_counties_to_sample_rnd[1],
replace=FALSE) %>%
mutate(
sampled=1,
s_id = s_id_val) #iterate through this.
return(sample_df) #return this dataset.
}
Run the function using map_dfr()
. Each time the function
runs, it stacks the output below the previous run, creating a
dataframe.
#set up
n_reps = 10
s_id_val_list <- seq(from = 1, to = n_reps, by = 1)
samp_fun_df = s_id_val_list %>% map_dfr(draw_sample)
samp_fun_df_min = samp_fun_df %>%
group_by(s_id) %>% #summarize over county within sample id
summarise(
n_counties=n(),
pop_sampled = sum(pop, na.rm=TRUE),
pop_min = min(pop, na.rm=TRUE)
)
summary(samp_fun_df_min$pop_min)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2969 3050 3458 3414 3669 3988
#Reminders
#n_counties_marg_elig_rem #The number of eligible counties in the bottom 5 urban-rural categories, already defined above.
#pop_marg_elig_rem #The total population in the eligible remaining counties
#n_counties_marg_elig
#pop_marg_elig
samp_overall = samp_fun_df %>%
group_by(s_id) %>% #summarize over county within sample id
summarise(
n_counties_samp=n(),
pop_samp = sum(pop, na.rm=TRUE)
) %>%
mutate(overall=1) %>%
group_by(overall) %>% #summarise over sample id
summarise(
n_counties_samp_exp= mean(n_counties_samp, na.rm=TRUE),
n_counties_samp_sd= sd(n_counties_samp, na.rm=TRUE), #always the same
pop_sampled_exp = mean(pop_samp, na.rm=TRUE),
pop_sampled_sd = sd(pop_samp, na.rm=TRUE),
pop_sampled_min = min(pop_samp, na.rm=TRUE)
) %>%
mutate(
prop_counties_exp = n_counties_samp_exp/n_counties_marg_elig_rem,
prop_pop_exp = pop_sampled_exp/pop_marg_elig_rem,
prop_pop_sd = pop_sampled_sd/pop_marg_elig_rem
)
samp_overall
About 18% of the population and 8% of counties are sampled.
by_ur %>% filter(urban_rural_6==1)
Adding that to the 31% of the population sampled by selecting the most urban counties brings the total proportion of the population sampled to about 50%.
samp_by_urban_rural_region = samp_fun_df %>%
group_by(s_id, urban_rural_6, region_4_no) %>%
summarise(
n_counties_samp=n(),
pop_samp = sum(pop, na.rm=TRUE)
) %>%
group_by(urban_rural_6, region_4_no) %>% #summarise over sample id
summarise(
n_counties_samp_exp= mean(n_counties_samp, na.rm=TRUE),
n_counties_samp_sd= sd(n_counties_samp, na.rm=TRUE), #always the same
pop_sampled_exp = mean(pop_samp, na.rm=TRUE),
pop_sampled_sd = sd(pop_samp, na.rm=TRUE),
pop_sampled_min = min(pop_samp, na.rm=TRUE)
) %>%
mutate(
prop_counties_exp = n_counties_samp_exp/n_counties_marg_elig_rem,
prop_pop_exp = pop_sampled_exp/pop_marg_elig_rem,
prop_pop_sd = pop_sampled_sd/pop_marg_elig_rem
)
samp_by_urban_rural_region