We will need to load in both the household and person level data since some variables exist at one level but not another.
### EX: NEW MEXICO (2019)
#### Person Level
nm_p <- read.csv("~/Downloads/csv_pnm/psam_p35.csv",
header=TRUE)
dim(nm_p)
## [1] 19281 288
## Variables of Interest:
### SERIALNO
### CIT: citizenship status
### NATIVITY: 1 (native), 2 (foreign born)
### FCITP: citizenship allocation flag 0 (no), 1 (yes)
#### House Level
nm_h <- read.csv("~/Downloads/csv_hnm/psam_h35.csv",
header=TRUE)
dim(nm_h)
## [1] 10220 235
## Variables of Interest:
### SERIALNO
### NP : number of persons in household
### HHL: household language
### LNGI: limited english (aka linguistically isolated) -2 (limited)
### HINCP: household income
### RESMODE: response mode
## NOTE: might be interested in other variables about multigenerational houses
Although the ACS contains person level data it is sent to households and therefore observations for individuals are not truly independent, but rather nested within households.
We first group by SERIALNO
then summarise to have data at the household level
###### TIDYVERSE
library(tidyverse)
## GROUP_BY and SUMMARISE
## MUTATE
allocCit<-nm_p%>%
mutate(NAT01=NATIVITY-1)%>%
group_by(SERIALNO)%>%
summarise(n=n(),
n_FCITP=sum(FCITP, na.rm=TRUE),
n_NAT=sum(NAT01, na.rm=TRUE))%>%
mutate(p_FCITP=n_FCITP/n,
p_NAT=n_NAT/n)
Once data from the person level is summarised to the household level, it can then be joined to the household file.
## JOIN
hp<-nm_h%>%
left_join(allocCit)%>%
#filter(WGTP!=0)%>%
mutate(YEAR=2019)%>%
select(SERIALNO, PUMA, ST, WGTP, NP, RESMODE, HHL, LNGI,
n, n_FCITP, n_NAT, p_FCITP, p_NAT,
WGTP1:WGTP80)%>%
mutate(h_FCITP=n_FCITP>0, # indicator for if a member of a hh cit was allocated
h_NAT=n_NAT>0) # indicated for a foreign born in hh
## Joining, by = "SERIALNO"
Let’s start asking quesitons!
### What's going on in the household where citizenship is allocated?
hp%>%
filter(p_FCITP!=0)%>%
ggplot(aes(x=p_FCITP))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hp%>%
filter(p_FCITP!=0)%>%
ggplot(aes(x=p_FCITP))+
geom_histogram()+
facet_grid(.~h_NAT)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Since these data come from a finite sample, observations have weights that need to be accounted for when creating population estimates. We must first define the survey design.
library(survey)
h_design<-svrepdesign(weights= ~WGTP,
repweights = 'WGTP[1-9]+',
scale=4/80,
rscales = ncol('WGTP[1-9]+'),
mse=T,
combined.weights = T,
type='JK1',
data= hp)
svy
FunctionsWe can use functions like svymean
or svytotal
to create estimates and standard errors that account for the design.
svytotal(~n_FCITP, design=h_design, na.rm=TRUE)
## total SE
## n_FCITP 106694 7916.6
We can also use svyby
to create estimates for subgroups.
rmRHL<-svyby(~p_FCITP,~HHL, svymean, design=h_design)
rmRHL
## HHL p_FCITP se
## 1 1 0.03724156 0.003350652
## 2 2 0.05718054 0.006792114
## 3 3 0.01786736 0.008121584
## 4 4 0.07222556 0.021873983
## 5 5 0.03237394 0.007778060
#rmRHL
this.year=2019
# MEANS ACROSS SUBSETS FOR EACH YEAR
rmRHL<-as.data.frame(rmRHL)%>%
mutate(YEAR=this.year)%>%
filter(HHL!=5)
# LABELS FOR GRAPHIC
modeLabs<-data.frame(RESMODE=c(1, 2, 3),
resmodeLab=c("1. Mail", "2. CATI/CAPI", "3. Internet"),
jitter2=c(-.1, 0, .1))
langLabs<-data.frame(HHL=c(1, 2, 3, 4, 5),
hhlLab=c("1. English", "2. Spanish", "3. Indo European", "4. Asian / Pacific Islander", "5. Other"),
jitter=c(-.2, -.1, 0, .1, .2))
lngiLab<-data.frame(LNGI=c(1, 2),
lngiLab=c("1. English Proficient",
"2. Limited English"))
## WRANGLING
allRHL2<-rmRHL%>%
left_join(langLabs)%>%
#left_join(lngiLab)%>%
filter(HHL!=5)
## Joining, by = "HHL"
ggplot(allRHL2, aes(x=hhlLab, y=p_FCITP))+
geom_errorbar(aes(ymin=p_FCITP-se, ymax=p_FCITP+se), width=.2)+
geom_point()